--- title: "SECOM Tutorial" author: - Huang Lin$^1$ - $^1$NIEHS, Research Triangle Park, NC 27709, USA date: '`r format(Sys.Date(), "%B %d, %Y")`' output: rmarkdown::html_vignette bibliography: bibliography.bib vignette: > %\VignetteIndexEntry{SECOM Tutorial} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r setup, message = FALSE, warning = FALSE, comment = NA} knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA, fig.width = 6.25, fig.height = 5) library(ANCOMBC) library(tidyverse) ``` ```{r helper} get_upper_tri = function(cormat){ cormat[lower.tri(cormat)] = NA diag(cormat) = NA return(cormat) } ``` # 1. Introduction Sparse Estimation of Correlations among Microbiomes (SECOM) [@lin2022linear] is a methodology that aims to detect both linear and nonlinear relationships between a pair of taxa within an ecosystem (e.g., gut) or across ecosystems (e.g., gut and tongue). SECOM corrects both sample-specific and taxon-specific biases and obtains a consistent estimator for the correlation matrix of microbial absolute abundances while maintaining the underlying true sparsity. For more details, please refer to the [SECOM](https://doi.org/10.1038/s41467-022-32243-x) paper. # 2. Installation Download package. ```{r getPackage, eval=FALSE} if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("ANCOMBC") ``` Load the package. ```{r load, eval=FALSE} library(ANCOMBC) ``` # 3. Example Data The HITChip Atlas dataset contains genus-level microbiota profiling with HITChip for 1006 western adults with no reported health complications, reported in [@lahti2014tipping]. The dataset is available via the microbiome R package [@lahti2017tools] in phyloseq [@mcmurdie2013phyloseq] format. ```{r} data(atlas1006, package = "microbiome") tse = mia::makeTreeSummarizedExperimentFromPhyloseq(atlas1006) # subset to baseline tse = tse[, tse$time == 0] # Re-code the bmi group tse$bmi = recode(tse$bmi_group, obese = "obese", severeobese = "obese", morbidobese = "obese") # Subset to lean, overweight, and obese subjects tse = tse[, tse$bmi %in% c("lean", "overweight", "obese")] # Create the region variable tse$region = recode(as.character(tse$nationality), Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE", CentralEurope = "CE", EasternEurope = "EE", .missing = "unknown") # Discard "EE" as it contains only 1 subject # Discard subjects with missing values of region tse = tse[, ! tse$region %in% c("EE", "unknown")] print(tse) ``` # 4. Run SECOM on a Single Ecosystem ## 4.1 Run secom functions ```{r} set.seed(123) # Linear relationships res_linear = secom_linear(data = list(tse), assay_name = "counts", tax_level = "Phylum", pseudo = 0, prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5, wins_quant = c(0.05, 0.95), method = "pearson", soft = FALSE, thresh_len = 20, n_cv = 10, thresh_hard = 0.3, max_p = 0.005, n_cl = 2) # Nonlinear relationships res_dist = secom_dist(data = list(tse), assay_name = "counts", tax_level = "Phylum", pseudo = 0, prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5, wins_quant = c(0.05, 0.95), R = 1000, thresh_hard = 0.3, max_p = 0.005, n_cl = 2) ``` ## 4.2 Visualizations {.tabset} ### Pearson correlation with thresholding ```{r} corr_linear = res_linear$corr_th cooccur_linear = res_linear$mat_cooccur # Filter by co-occurrence overlap = 10 corr_linear[cooccur_linear < overlap] = 0 df_linear = data.frame(get_upper_tri(corr_linear)) %>% rownames_to_column("var1") %>% pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>% filter(!is.na(value)) %>% mutate(value = round(value, 2)) tax_name = sort(union(df_linear$var1, df_linear$var2)) df_linear$var1 = factor(df_linear$var1, levels = tax_name) df_linear$var2 = factor(df_linear$var2, levels = tax_name) heat_linear_th = df_linear %>% ggplot(aes(var2, var1, fill = value)) + geom_tile(color = "black") + scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey", midpoint = 0, limit = c(-1,1), space = "Lab", name = NULL) + scale_x_discrete(drop = FALSE) + scale_y_discrete(drop = FALSE) + geom_text(aes(var2, var1, label = value), color = "black", size = 4) + labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") + theme_bw() + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, face = "italic"), axis.text.y = element_text(size = 12, face = "italic"), strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14), legend.text = element_text(size = 12), plot.title = element_text(hjust = 0.5, size = 15), panel.grid.major = element_blank(), axis.ticks = element_blank(), legend.position = "none") + coord_fixed() heat_linear_th ``` ### Pearson correlation with p-value filtering ```{r} corr_linear = res_linear$corr_fl cooccur_linear = res_linear$mat_cooccur # Filter by co-occurrence overlap = 10 corr_linear[cooccur_linear < overlap] = 0 df_linear = data.frame(get_upper_tri(corr_linear)) %>% rownames_to_column("var1") %>% pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>% filter(!is.na(value)) %>% mutate(value = round(value, 2)) tax_name = sort(union(df_linear$var1, df_linear$var2)) df_linear$var1 = factor(df_linear$var1, levels = tax_name) df_linear$var2 = factor(df_linear$var2, levels = tax_name) heat_linear_fl = df_linear %>% ggplot(aes(var2, var1, fill = value)) + geom_tile(color = "black") + scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey", midpoint = 0, limit = c(-1,1), space = "Lab", name = NULL) + scale_x_discrete(drop = FALSE) + scale_y_discrete(drop = FALSE) + geom_text(aes(var2, var1, label = value), color = "black", size = 4) + labs(x = NULL, y = NULL, title = "Pearson (Filtering)") + theme_bw() + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, face = "italic"), axis.text.y = element_text(size = 12, face = "italic"), strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14), legend.text = element_text(size = 12), plot.title = element_text(hjust = 0.5, size = 15), panel.grid.major = element_blank(), axis.ticks = element_blank(), legend.position = "none") + coord_fixed() heat_linear_fl ``` ### Distance correlation with p-value filtering ```{r} corr_dist = res_dist$dcorr_fl cooccur_dist = res_dist$mat_cooccur # Filter by co-occurrence overlap = 10 corr_dist[cooccur_dist < overlap] = 0 df_dist = data.frame(get_upper_tri(corr_dist)) %>% rownames_to_column("var1") %>% pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>% filter(!is.na(value)) %>% mutate(value = round(value, 2)) tax_name = sort(union(df_dist$var1, df_dist$var2)) df_dist$var1 = factor(df_dist$var1, levels = tax_name) df_dist$var2 = factor(df_dist$var2, levels = tax_name) heat_dist_fl = df_dist %>% ggplot(aes(var2, var1, fill = value)) + geom_tile(color = "black") + scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey", midpoint = 0, limit = c(-1,1), space = "Lab", name = NULL) + scale_x_discrete(drop = FALSE) + scale_y_discrete(drop = FALSE) + geom_text(aes(var2, var1, label = value), color = "black", size = 4) + labs(x = NULL, y = NULL, title = "Distance (Filtering)") + theme_bw() + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, face = "italic"), axis.text.y = element_text(size = 12, face = "italic"), strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14), legend.text = element_text(size = 12), plot.title = element_text(hjust = 0.5, size = 15), panel.grid.major = element_blank(), axis.ticks = element_blank(), legend.position = "none") + coord_fixed() heat_dist_fl ``` # 5. Run SECOM on Multiple Ecosystems ## 5.1 Data manipulation To compute correlations whithin and across different ecosystems, one needs to make sure that there are samples in common across these ecosystems. ```{r} # Select subjects from "CE" and "NE" tse1 = tse[, tse$region == "CE"] tse2 = tse[, tse$region == "NE"] # Rename samples to ensure there is an overlap of samples between CE and NE colnames(tse1) = paste0("Sample-", seq_len(ncol(tse1))) colnames(tse2) = paste0("Sample-", seq_len(ncol(tse2))) print(tse1) print(tse2) ``` ## 5.2 Run secom functions ```{r} set.seed(123) # Linear relationships res_linear = secom_linear(data = list(CE = tse1, NE = tse2), assay_name = c("counts", "counts"), tax_level = c("Phylum", "Phylum"), pseudo = 0, prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5, wins_quant = c(0.05, 0.95), method = "pearson", soft = FALSE, thresh_len = 20, n_cv = 10, thresh_hard = 0.3, max_p = 0.005, n_cl = 2) # Nonlinear relationships res_dist = secom_dist(data = list(CE = tse1, NE = tse2), assay_name = c("counts", "counts"), tax_level = c("Phylum", "Phylum"), pseudo = 0, prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5, wins_quant = c(0.05, 0.95), R = 1000, thresh_hard = 0.3, max_p = 0.005, n_cl = 2) ``` ## 5.3 Visualizations {.tabset} ### Pearson correlation with thresholding ```{r, fig.width=8, fig.height=8} corr_linear = res_linear$corr_th cooccur_linear = res_linear$mat_cooccur # Filter by co-occurrence overlap = 10 corr_linear[cooccur_linear < overlap] = 0 df_linear = data.frame(get_upper_tri(corr_linear)) %>% rownames_to_column("var1") %>% pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>% filter(!is.na(value)) %>% mutate(var2 = gsub("\\...", " - ", var2), value = round(value, 2)) tax_name = sort(union(df_linear$var1, df_linear$var2)) df_linear$var1 = factor(df_linear$var1, levels = tax_name) df_linear$var2 = factor(df_linear$var2, levels = tax_name) txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02") heat_linear_th = df_linear %>% ggplot(aes(var2, var1, fill = value)) + geom_tile(color = "black") + scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey", midpoint = 0, limit = c(-1,1), space = "Lab", name = NULL) + scale_x_discrete(drop = FALSE) + scale_y_discrete(drop = FALSE) + geom_text(aes(var2, var1, label = value), color = "black", size = 4) + labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") + theme_bw() + geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") + geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, face = "italic", color = txt_color), axis.text.y = element_text(size = 12, face = "italic", color = txt_color), strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14), legend.text = element_text(size = 12), plot.title = element_text(hjust = 0.5, size = 15), panel.grid.major = element_blank(), axis.ticks = element_blank(), legend.position = "none") + coord_fixed() heat_linear_th ``` ### Pearson correlation with p-value filtering ```{r, fig.width=8, fig.height=8} corr_linear = res_linear$corr_th cooccur_linear = res_linear$mat_cooccur # Filter by co-occurrence overlap = 10 corr_linear[cooccur_linear < overlap] = 0 df_linear = data.frame(get_upper_tri(corr_linear)) %>% rownames_to_column("var1") %>% pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>% filter(!is.na(value)) %>% mutate(var2 = gsub("\\...", " - ", var2), value = round(value, 2)) tax_name = sort(union(df_linear$var1, df_linear$var2)) df_linear$var1 = factor(df_linear$var1, levels = tax_name) df_linear$var2 = factor(df_linear$var2, levels = tax_name) txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02") heat_linear_fl = df_linear %>% ggplot(aes(var2, var1, fill = value)) + geom_tile(color = "black") + scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey", midpoint = 0, limit = c(-1,1), space = "Lab", name = NULL) + scale_x_discrete(drop = FALSE) + scale_y_discrete(drop = FALSE) + geom_text(aes(var2, var1, label = value), color = "black", size = 4) + labs(x = NULL, y = NULL, title = "Pearson (Filtering)") + theme_bw() + geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") + geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, face = "italic", color = txt_color), axis.text.y = element_text(size = 12, face = "italic", color = txt_color), strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14), legend.text = element_text(size = 12), plot.title = element_text(hjust = 0.5, size = 15), panel.grid.major = element_blank(), axis.ticks = element_blank(), legend.position = "none") + coord_fixed() heat_linear_fl ``` ### Distance correlation with p-value filtering ```{r, fig.width=8, fig.height=8} corr_dist = res_dist$dcorr_fl cooccur_dist = res_dist$mat_cooccur # Filter by co-occurrence overlap = 10 corr_dist[cooccur_dist < overlap] = 0 df_dist = data.frame(get_upper_tri(corr_dist)) %>% rownames_to_column("var1") %>% pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>% filter(!is.na(value)) %>% mutate(var2 = gsub("\\...", " - ", var2), value = round(value, 2)) tax_name = sort(union(df_dist$var1, df_dist$var2)) df_dist$var1 = factor(df_dist$var1, levels = tax_name) df_dist$var2 = factor(df_dist$var2, levels = tax_name) txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02") heat_dist_fl = df_dist %>% ggplot(aes(var2, var1, fill = value)) + geom_tile(color = "black") + scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey", midpoint = 0, limit = c(-1,1), space = "Lab", name = NULL) + scale_x_discrete(drop = FALSE) + scale_y_discrete(drop = FALSE) + geom_text(aes(var2, var1, label = value), color = "black", size = 4) + labs(x = NULL, y = NULL, title = "Distance (Filtering)") + theme_bw() + geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") + geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, face = "italic", color = txt_color), axis.text.y = element_text(size = 12, face = "italic", color = txt_color), strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14), legend.text = element_text(size = 12), plot.title = element_text(hjust = 0.5, size = 15), panel.grid.major = element_blank(), axis.ticks = element_blank(), legend.position = "none") + coord_fixed() heat_dist_fl ``` # Session information ```{r sessionInfo, message = FALSE, warning = FALSE, comment = NA} sessionInfo() ``` # References