## ----global_options, include=FALSE-------------------------------------------- knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE, fig.pos = 'h') ## ----setup, include = FALSE--------------------------------------------------- chooseCRANmirror(graphics=FALSE, ind=1) knitr::opts_chunk$set( collapse = TRUE, comment = '#>' ) options(tinytex.verbose = TRUE) ## ----batch_workflow, include = TRUE, fig.align = 'center', echo=FALSE, fig.cap='proBatch in batch correction workflow', out.width = '50%'---- knitr::include_graphics('Batch_effects_workflow_staircase.png') ## ----dependencies, eval = FALSE----------------------------------------------- # bioc_deps <- c("GO.db", "impute", "preprocessCore", "pvca","sva" ) # cran_deps <- c("corrplot", "data.table", "ggplot2", "ggfortify","lazyeval", # "lubridate", "pheatmap", "reshape2","readr", "rlang", # "tibble", "dplyr", "tidyr", "wesanderson","WGCNA") # # if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install(bioc_deps) # install.packages(cran_deps) ## ----install_proBatch, fig.show='hold', eval = FALSE-------------------------- # #install the development version from GitHub: # install.packages('devtools') # devtools::install_github('symbioticMe/proBatch', build_vignettes = TRUE) ## ----load_packages------------------------------------------------------------ require(dplyr) require(tibble) require(ggplot2) ## ----col_names---------------------------------------------------------------- feature_id_col = 'peptide_group_label' measure_col = 'Intensity' sample_id_col = 'FullRunName' essential_columns = c(feature_id_col, measure_col, sample_id_col) ## ----tech_bio_cols------------------------------------------------------------ technical_factors = c('MS_batch', 'digestion_batch') biological_factors = c('Strain', 'Diet', 'Sex') biospecimen_id_col = 'EarTag' ## ----------------------------------------------------------------------------- batch_col = 'MS_batch' ## ----load_data, fig.show='hold'----------------------------------------------- library(proBatch) data('example_proteome', 'example_sample_annotation', 'example_peptide_annotation', package = 'proBatch') ## ----date_to_order, fig.show='hold'------------------------------------------- generated_sample_annotation <- date_to_sample_order(example_sample_annotation, time_column = c('RunDate','RunTime'), new_time_column = 'generated_DateTime', dateTimeFormat = c('%b_%d', '%H:%M:%S'), new_order_col = 'generated_order', instrument_col = NULL) library(knitr) kable(generated_sample_annotation[1:5,] %>% select(c('RunDate', 'RunTime', 'order', 'generated_DateTime', 'generated_order'))) ## ----pep_annotation, fig.show='hold'------------------------------------------ generated_peptide_annotation <- create_peptide_annotation(example_proteome, feature_id_col = 'peptide_group_label', protein_col = 'Protein') ## ----reduce_proteome---------------------------------------------------------- example_proteome = example_proteome %>% select(one_of(essential_columns)) gc() ## ----long_to_matrix----------------------------------------------------------- example_matrix <- long_to_matrix(example_proteome, feature_id_col = 'peptide_group_label', measure_col = 'Intensity', sample_id_col = 'FullRunName') ## ----------------------------------------------------------------------------- log_transformed_matrix <- log_transform_dm(example_matrix, log_base = 2, offset = 1) ## ---- fig.show='hold'--------------------------------------------------------- color_list <- sample_annotation_to_colors(example_sample_annotation, factor_columns = c('MS_batch', 'digestion_batch', 'EarTag', 'Strain', 'Diet', 'Sex'), numeric_columns = c('DateTime','order')) ## ----plot_mean, fig.show='hold', fig.width=5, fig.height=2-------------------- plot_sample_mean(log_transformed_matrix, example_sample_annotation, order_col = 'order', batch_col = batch_col, color_by_batch = TRUE, ylimits = c(12, 16.5), color_scheme = color_list[[batch_col]]) ## ----plot_boxplots, fig.show='hold', fig.width=10, fig.height=5--------------- log_transformed_long <- matrix_to_long(log_transformed_matrix) batch_col = 'MS_batch' plot_boxplot(log_transformed_long, example_sample_annotation, batch_col = batch_col, color_scheme = color_list[[batch_col]]) ## ----median_normalization_log, fig.show='hold'-------------------------------- median_normalized_matrix = normalize_data_dm(log_transformed_matrix, normalize_func = 'medianCentering') ## ----median_normalization_raw, fig.show='hold'-------------------------------- median_normalized_matrix = normalize_data_dm(example_matrix, normalize_func = 'medianCentering', log_base = 2, offset = 1) ## ----quantile_norm, fig.show='hold'------------------------------------------- quantile_normalized_matrix = normalize_data_dm(log_transformed_matrix, normalize_func = 'quantile') ## ----plot_mean_normalized, fig.show='hold', fig.width=5, fig.height=2--------- plot_sample_mean(quantile_normalized_matrix, example_sample_annotation, color_by_batch = TRUE, ylimits = c(12, 16), color_scheme = color_list[[batch_col]]) ## ----plot_hierarchical, fig.show='hold', fig.width=10, fig.height=5----------- selected_annotations <- c('MS_batch', 'digestion_batch', 'Diet') #Plot clustering between samples plot_hierarchical_clustering(quantile_normalized_matrix, sample_annotation = example_sample_annotation, color_list = color_list, factors_to_plot = selected_annotations, distance = 'euclidean', agglomeration = 'complete', label_samples = FALSE) ## ----plot_heatmap, fig.show='hold', fig.width=10, fig.height=11--------------- plot_heatmap_diagnostic(quantile_normalized_matrix, example_sample_annotation, factors_to_plot = selected_annotations, cluster_cols = TRUE, color_list = color_list, show_rownames = FALSE, show_colnames = FALSE) ## ----plot_PCA, fig.show='hold', fig.width=10, fig.height=8.5------------------ pca1 = plot_PCA(quantile_normalized_matrix, example_sample_annotation, color_by = 'MS_batch', plot_title = 'MS batch', color_scheme = color_list[['MS_batch']]) pca2 = plot_PCA(quantile_normalized_matrix, example_sample_annotation, color_by = 'digestion_batch', plot_title = 'Digestion batch', color_scheme = color_list[['digestion_batch']]) pca3 = plot_PCA(quantile_normalized_matrix, example_sample_annotation, color_by = 'Diet', plot_title = 'Diet', color_scheme = color_list[['Diet']]) pca4 = plot_PCA(quantile_normalized_matrix, example_sample_annotation, color_by = 'DateTime', plot_title = 'DateTime', color_scheme = color_list[['DateTime']]) library(ggpubr) ggarrange(pca1, pca2, pca3, pca4, ncol = 2, nrow = 2) ## ----plot_PCA_spec, fig.show='hold', fig.width=5, fig.height=5---------------- pca_spec = plot_PCA(quantile_normalized_matrix, example_sample_annotation, color_by = 'digestion_batch', plot_title = 'Digestion batch') pca_spec ## ----plot_PVCA, fig.show='hold', fig.width=5, fig.height=5-------------------- plot_PVCA(quantile_normalized_matrix, example_sample_annotation, technical_factors = technical_factors, biological_factors = biological_factors) ## ----plot_spikeIn, fig.show='hold'-------------------------------------------- quantile_normalized_long <- matrix_to_long(quantile_normalized_matrix) plot_spike_in(quantile_normalized_long, example_sample_annotation, peptide_annotation = example_peptide_annotation, protein_col = 'Gene', spike_ins = 'BOVINE_A1ag', plot_title = 'Spike-in BOVINE protein peptides', color_by_batch = TRUE, color_scheme = color_list[[batch_col]]) ## ----loess_fit, fig.show='hold', fig.width=5, fig.height=2.4------------------ loess_fit_df <- adjust_batch_trend_df(quantile_normalized_long, example_sample_annotation) ## ----loess_30, fig.show='hold', fig.width=5, fig.height=2.4------------------- loess_fit_30 <- adjust_batch_trend_df(quantile_normalized_long, example_sample_annotation, span = 0.3) plot_with_fitting_curve(feature_name = '10231_QDVDVWLWQQEGSSK_2', fit_df = loess_fit_30, fit_value_col = 'fit', df_long = quantile_normalized_long, sample_annotation = example_sample_annotation, color_by_batch = TRUE, color_scheme = color_list[[batch_col]], plot_title = 'Span = 30%') ## ----loess_70, fig.show='hold', fig.width=5, fig.height=2.4------------------- loess_fit_70 <- adjust_batch_trend_df(quantile_normalized_long, example_sample_annotation, span = 0.7) plot_with_fitting_curve(feature_name = '10231_QDVDVWLWQQEGSSK_2', fit_df = loess_fit_70, fit_value_col = 'fit', df_long = quantile_normalized_long, sample_annotation = example_sample_annotation, color_by_batch = TRUE, color_scheme = color_list[[batch_col]], plot_title = 'Span = 70%') ## ----median_batch, fig.show='hold', fig.width=3, fig.height=2.4--------------- peptide_median_df <- center_feature_batch_medians_df(loess_fit_df, example_sample_annotation) plot_single_feature(feature_name = '10231_QDVDVWLWQQEGSSK_2', df_long = peptide_median_df, sample_annotation = example_sample_annotation, measure_col = 'Intensity', plot_title = 'Feature-level Median Centered') ## ----comBat, fig.show='hold'-------------------------------------------------- comBat_df <- correct_with_ComBat_df(loess_fit_df, example_sample_annotation) ## ----combat_result, fig.show='hold', fig.width=3, fig.height=2.4------------- plot_single_feature(feature_name = '10231_QDVDVWLWQQEGSSK_2', df_long = loess_fit_df, sample_annotation = example_sample_annotation, plot_title = 'Loess Fitted') plot_single_feature(feature_name = '10231_QDVDVWLWQQEGSSK_2', df_long = comBat_df, sample_annotation = example_sample_annotation, plot_title = 'ComBat corrected') ## ----batch_corr_general, fig.show='hold'-------------------------------------- batch_corrected_df <- correct_batch_effects_df(df_long = quantile_normalized_long, sample_annotation = example_sample_annotation, discrete_func = 'ComBat', continuous_func = 'loess_regression', abs_threshold = 5, pct_threshold = 0.20) batch_corrected_matrix = long_to_matrix(batch_corrected_df) ## ----setup_corr_heatmap, fig.show='hold', fig.height=5, fig.width=8----------- earTags <- c('ET1524', 'ET2078', 'ET1322', 'ET1566', 'ET1354', 'ET1420', 'ET2154', 'ET1515', 'ET1506', 'ET2577', 'ET1681', 'ET1585', 'ET1518', 'ET1906') replicate_filenames = example_sample_annotation %>% filter(MS_batch %in% c('Batch_2', 'Batch_3')) %>% filter(EarTag %in% earTags) %>% pull(!!sym('FullRunName')) ## ---- fig.show='hold', fig.width=10, fig.height=11---------------------------- p1_exp = plot_sample_corr_heatmap(log_transformed_matrix, samples_to_plot = replicate_filenames, plot_title = 'Correlation of selected samples') ## ----------------------------------------------------------------------------- breaksList <- seq(0.7, 1, by = 0.01) # color scale of pheatmap heatmap_colors = colorRampPalette( rev(RColorBrewer::brewer.pal(n = 7, name = 'RdYlBu')))(length(breaksList) + 1) ## ----corr_samples_heatmap, fig.show='hold', fig.height=2.12, fig.width=3.35---- # Plot the heatmap with annotations for the chosen samples factors_to_show = c(batch_col, biospecimen_id_col) p1 = plot_sample_corr_heatmap(log_transformed_matrix, samples_to_plot = replicate_filenames, sample_annotation = example_sample_annotation, factors_to_plot = factors_to_show, plot_title = 'Log transformed correlation matrix of \nselected replicated samples', color_list = color_list, heatmap_color = heatmap_colors, breaks = breaksList, cluster_rows= FALSE, cluster_cols=FALSE,fontsize = 4, annotation_names_col = TRUE, annotation_legend = FALSE, show_colnames = FALSE) p2 = plot_sample_corr_heatmap(batch_corrected_matrix, samples_to_plot = replicate_filenames, sample_annotation = example_sample_annotation, factors_to_plot = factors_to_show, plot_title = 'Batch Corrected \nselected replicated samples', color_list = color_list, heatmap_color = heatmap_colors, breaks = breaksList, cluster_rows= FALSE, cluster_cols=FALSE,fontsize = 4, annotation_names_col = TRUE, annotation_legend = FALSE, show_colnames = FALSE) library(gridExtra) grid.arrange(grobs = list(p1$gtable, p2$gtable), ncol=2) ## ----corr_samples_distrib, fig.show='hold', fig.width=3.2, fig.height=3.5----- sample_cor_raw <- plot_sample_corr_distribution(log_transformed_matrix, example_sample_annotation, #repeated_samples = replicate_filenames, batch_col = 'MS_batch', biospecimen_id_col = 'EarTag', plot_title = 'Correlation of samples (raw)', plot_param = 'batch_replicate') raw_corr = sample_cor_raw + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0.7,1) + xlab(NULL) sample_cor_batchCor <- plot_sample_corr_distribution(batch_corrected_matrix, example_sample_annotation, batch_col = 'MS_batch', plot_title = 'Batch corrected', plot_param = 'batch_replicate') corr_corr = sample_cor_batchCor + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0.7, 1) + xlab(NULL) library(gtable) library(grid) g2 <- ggplotGrob(raw_corr) g3 <- ggplotGrob(corr_corr) g <- cbind(g2, g3, size = 'first') grid.draw(g) ## ----correlation_of_peptides, eval = FALSE------------------------------------ # peptide_cor_raw <- plot_peptide_corr_distribution(log_transformed_matrix, # example_peptide_annotation, # protein_col = 'Gene', # plot_title = 'Peptide correlation (raw)') # # peptide_cor_batchCor <- plot_peptide_corr_distribution(batch_corrected_matrix, # example_peptide_annotation, # protein_col = 'Gene', # plot_title = 'Peptide correlation (batch corrected)') # g2 <- ggplotGrob(peptide_cor_raw+ # ylim(c(-1, 1))) # g3 <- ggplotGrob(peptide_cor_batchCor+ # ylim(c(-1, 1))) # g <- cbind(g2, g3, size = 'first') # grid.draw(g) ## ----sessionInfo, eval=TRUE--------------------------------------------------- sessionInfo() ## ----citation----------------------------------------------------------------- citation('proBatch')