## ----style, echo = FALSE, results = 'asis'------------------------------------
BiocStyle::markdown()

## ----message = FALSE----------------------------------------------------------
library("MetaboCoreUtils")
ls(pos = "package:MetaboCoreUtils")

## -----------------------------------------------------------------------------
library(MetaboCoreUtils)

## -----------------------------------------------------------------------------
adductNames()

## -----------------------------------------------------------------------------
masses <- c(123, 842, 324)
mass2mz(masses, adduct = c("[M+H]+", "[M+Na]+"))

## -----------------------------------------------------------------------------
formula2mz(c("C6H12O6", "C8H10N4O2"), adduct = c("[M+H]+", "[M+Na]+"))

## -----------------------------------------------------------------------------
frml <- "Na3C4"
frml <- standardizeFormula(frml)
frml

## -----------------------------------------------------------------------------
frml <- addElements(frml, "H2O")
frml

## -----------------------------------------------------------------------------
frml <- subtractElements(frml, "H")
frml

## -----------------------------------------------------------------------------
countElements(frml)

## -----------------------------------------------------------------------------
adductFormula(c("C6H12O6", "C8H10N4O2"), adduct = c("[M+H]+", "[M+Na]+"))

## -----------------------------------------------------------------------------
calculateMass(c("C6H12O6", "[13C3]C3H12O6"))

## -----------------------------------------------------------------------------
lipid_masses <- c(760.5851, 762.6007, 762.5280)
calculateKmd(lipid_masses)

## -----------------------------------------------------------------------------
lipid_rkmd <- calculateRkmd(lipid_masses)
isRkmd(lipid_rkmd)

## -----------------------------------------------------------------------------
rti <- read.table(system.file("retentionIndex",
                              "rti.txt",
                              package = "MetaboCoreUtils"),
                  header = TRUE,
                  sep = "\t")

rtime <- read.table(system.file("retentionIndex",
                                "metabolites.txt",
                                package = "MetaboCoreUtils"),
                    header = TRUE,
                    sep = "\t")

## -----------------------------------------------------------------------------
head(rti)

## -----------------------------------------------------------------------------
rtime$rindex_r <- indexRtime(rtime$rtime, rti)

## -----------------------------------------------------------------------------
head(rtime)

## -----------------------------------------------------------------------------
ref <- data.frame(rindex = c(1709.8765, 553.7975),
                  refindex = c(1700, 550))

rtime$rindex_cor <- correctRindex(rtime$rindex_r, ref)

## -----------------------------------------------------------------------------
vals <- read.table(system.file("txt", "feature_values.txt",
                               package = "MetaboCoreUtils"), sep = "\t")
vals <- as.matrix(vals)
head(vals)

## -----------------------------------------------------------------------------
#' Define a data frame with the injection index
sdata <- data.frame(injection_index = seq_len(ncol(vals)))

#' Identify columns representing QC samples
qc_index <- grep("^POOL", colnames(vals))

length(qc_index)

## -----------------------------------------------------------------------------
#' Fit linear models explaining observed abundances by injection index.
#' Linear models are fitted row-wise to data of QC samples.
qc_lm <- fit_lm(y ~ injection_index,
                data = sdata[qc_index, , drop = FALSE],
                y = log2(vals[, qc_index]),
                minVals = 9)

## -----------------------------------------------------------------------------
qc_lm[[1]]

## -----------------------------------------------------------------------------
sum(is.na(qc_lm))

## ----fig.cap = "Abundance of an example feature along injection index. Open circles represent measurements in study samples, filled circles in QC samples. The black solid line represents the estimated signal drift."----
plot(x = sdata$injection_index, y = log2(vals[1, ]),
     xlab = "injection_index", ylab = expression(log[2]~abundance))
#' Indicate QC samples
points(x = sdata$injection_index[qc_index],
       y = log2(vals[1, qc_index]), pch = 16, col = "#00000080")
grid()
abline(qc_lm[[1]])

## ----fig.cap = "Abundance of an example feature along injection index. Open circles represent measurements in study samples, filled circles in QC samples. The black solid line represents the estimated signal drift."----
plot(x = sdata$injection_index, y = log2(vals[2, ]),
     xlab = "injection_index", ylab = expression(log[2]~abundance))
#' Indicate QC samples
points(x = sdata$injection_index[qc_index],
       y = log2(vals[2, qc_index]), pch = 16, col = "#00000080")
grid()
abline(qc_lm[[2]])

## -----------------------------------------------------------------------------
#' Extract slope, F-statistic and R squared from each model, skipping
#' features for which no model was fitted.
qc_lm_summary <- lapply(qc_lm, function(z) {
    if (length(z) > 1) {
        s <- summary(z)
        c(slope = coefficients(s)[2, "Estimate"],
          p.value = coefficients(s)[2, 4],
          adj.r.squared = s$adj.r.squared)
    } else c(slope = NA_real_, F = NA_real_,
             adj.r.squared = NA_real_) # returning NA for skipped models
}) |> do.call(what = rbind)

head(qc_lm_summary)

## -----------------------------------------------------------------------------
plot(qc_lm_summary[, "slope"], -log10(qc_lm_summary[, "p.value"]),
     xlab = "injection order dependency", ylab = expression(-log[10](p~value)),
     pch = 21, col = "#00000080", bg = "#00000040")
grid()
abline(h = -log10(0.05))

## -----------------------------------------------------------------------------
idx <- which.max(qc_lm_summary[, "slope"])
plot(x = sdata$injection_index, y = log2(vals[idx, ]),
     xlab = "injection_index", ylab = expression(log[2]~abundance))
#' Indicate QC samples
points(x = sdata$injection_index[qc_index],
       y = log2(vals[idx, qc_index]), pch = 16, col = "#00000080")
grid()
abline(qc_lm[[idx]])

## -----------------------------------------------------------------------------
qc_lm_summary[idx, ]

## -----------------------------------------------------------------------------
idx2 <- which(qc_lm_summary[, "slope"] > 0.01 &
              qc_lm_summary[, "p.value"] > 0.05)
plot(x = sdata$injection_index, y = log2(vals[idx2, ]),
     xlab = "injection_index", ylab = expression(log[2]~abundance))
points(x = sdata$injection_index[qc_index],
       y = log2(vals[idx2, qc_index]), pch = 16, col = "#00000080")
grid()
abline(qc_lm[[idx2]])

## -----------------------------------------------------------------------------
qc_lm_summary[idx2, ]

## -----------------------------------------------------------------------------
qc_lm[qc_lm_summary[, "p.value"] > 0.05] <- NA

## -----------------------------------------------------------------------------
#' Adjust the data for the estimated signal drift
vals_adj <- adjust_lm(log2(vals), data = sdata, lm = qc_lm)

#' Transform data again into natural scale
vals_adj <- 2^vals_adj

## ----fig.cap = "Feature abundances before (open circles) and after adjustment (filled circles). The grey dashed line represents the injection index dependent signal drift estimated on the raw data and the solid grey line the same model estimated on the adjusted data."----
plot(x = sdata$injection_index, y = log2(vals[2, ]),
     xlab = "injection_index", ylab = expression(log[2]~abundance),
     col = "#00000080")
points(x = sdata$injection_index, y = log2(vals_adj[2, ]),
       pch = 16, col = "#00000080")
grid()
abline(qc_lm[[2]], col = "grey", lty = 2)

#' fit a model to the QC samples of the adjusted data
l <- lm(log2(vals_adj[2, qc_index]) ~ sdata$injection_index[qc_index])
abline(l, col = "grey")


## -----------------------------------------------------------------------------
#' Identify features for which the adjustment was performed
fts_adj <- which(!is.na(qc_lm))

#' Define a function to calculate the correlation
cor_fun <- function(i, y) {
    values <- y[i, qc_index]
    if (sum(!is.na(values)) >= 9)
        cor(values, sdata$injection_index[qc_index],
            method = "spearman", use = "pairwise.complete.obs")
    else NA_real_
}

#' Calculate correlations for raw data, skipping features
#' with less than 9 non-missing values
cor_raw <- vapply(seq_along(qc_lm), cor_fun, numeric(1), y = vals)

## -----------------------------------------------------------------------------
#' Calculate correlations for adjusted data
cor_adj <- vapply(seq_along(qc_lm), cor_fun, numeric(1), y = vals_adj)

## ----fig.cap = "Correlation between abundances and injection index for QC samples before (black) and after adjustment (red). Filled circles represent the features for which the drift was adjusted for. Correlation coefficients are sorted."----
plot(sort(cor_raw), col = "#00000080", main = "QC samples", ylab = "rho",
     xlab = "rank")
idx <- order(cor_adj)
bg <- rep(NA, length(cor_adj))
bg[fts_adj] <- "#ff000040"
points(cor_adj[idx], pch = 21, col = "#ff000080", bg = bg[idx])

## -----------------------------------------------------------------------------
?quality_assessment

## -----------------------------------------------------------------------------
# Define sample data for metabolomics analysis
set.seed(123)
metabolomics_data <- matrix(rnorm(100), nrow = 10)
colnames(metabolomics_data) <- paste0("Sample", 1:10)
rownames(metabolomics_data) <- paste0("Feature", 1:10)

## -----------------------------------------------------------------------------
# Calculate and display the coefficient of variation
cv_result <- rowRsd(metabolomics_data)
print(cv_result)

## -----------------------------------------------------------------------------
# Generate QC samples
qc_samples <- matrix(rnorm(40), nrow = 10)
colnames(qc_samples) <- paste0("QC", 1:4)

# Calculate D-ratio and display the result
dratio_result <- rowDratio(metabolomics_data, qc_samples)
print(dratio_result)

## -----------------------------------------------------------------------------
# Introduce missing values in the data
metabolomics_data[sample(1:100, 10)] <- NA

# Calculate and display the percentage of missing values
missing_result <- rowPercentMissing(metabolomics_data)
print(missing_result)

## -----------------------------------------------------------------------------
# Generate blank samples
blank_samples <- matrix(rnorm(30), nrow = 10)
colnames(blank_samples) <- paste0("Blank", 1:3)

# Detect rows where mean(test) > 2 * mean(blank)
blank_detection_result <- rowBlank(metabolomics_data, blank_samples)
print(blank_detection_result)

## -----------------------------------------------------------------------------
# Set filtering thresholds
cv_threshold <- 8
dratio_threshold <- 0.8

# Apply filters
filtered_data <- metabolomics_data[
  cv_result <= cv_threshold &
  dratio_result <= dratio_threshold &
  missing_result <= 10 &
  !blank_detection_result, , drop = FALSE]

# Display the filtered data
print(filtered_data)

## ----sessioninfo, echo=FALSE--------------------------------------------------
sessionInfo()