Quality checking of experiment

Reading the data

Here we read gene expression data which is quantified by ‘salmon’ and processed by tximport package. The ‘salmon’ generates count data, “abundance” (TPM) and gene length values. Therefore we may rename the corresponding assay slot ‘abundance’ to ‘TPM’.

These data belongs to this studies reference!!

In this data, there are gene expression values from first molar tissue of three different developing rodent species. The tissues are also from four different developmental time points.

se <- readRDS(system.file("extdata","rat_vole_mouseSE_salmon.rds", package = "broadSeq"))
SummarizedExperiment::assayNames(se)
#> [1] "counts"      "abundance"   "avgTxLength" "vst"

Sample metadata

as.data.frame(colData(se)) %>% dplyr::count(stage,species) %>% tidyr::spread(stage,n)
#>   species Bud Cap Late Cap Bell
#> 1   Mouse   7   7        7    7
#> 2     Rat   5   5        5    5
#> 3    Vole   7   7        7    7

se$stage <- factor(se$stage,levels = c("Bud","Cap","Late Cap","Bell"))

Filtering out low expression genes

# assays(se)[["counts"]][,1] %>% gghistogram() +scale_x_log10()
assays(se)[["counts"]][,5] %>% ggpubr::ggdensity(y = "count")+
    ggplot2::geom_vline(xintercept = 10)+ggplot2::scale_x_log10()


keep <- (assays(se)[["counts"]] >= 3) %>% rowSums() >= 5 
# smallest Group Size is 5
table(keep)
#> keep
#> FALSE  TRUE 
#>   689  5089

Normalization

edgeR provides two different methods ; CPM and TMM. But it does not use normalized values for Differential Expression.

It also does not normalizes count values rather it normalizes the library sizes using “TMM” method in ‘normLibSizes’. Either use or not use normLibSizes(). edgeR::cpm() will generate normalized expression values.

CPM

se <- broadSeq::normalizeEdgerCPM(se ,method = "none",cpm.log = TRUE )
## The normalized values are added with the assay name "logCPM"
SummarizedExperiment::assayNames(se)
#> [1] "counts"      "abundance"   "avgTxLength" "vst"         "logCPM"

TMM

se <- broadSeq::normalizeEdgerCPM(se , method = "TMM", cpm.log = FALSE )
## The normalized values are added with the assay name "TMM"
SummarizedExperiment::assayNames(se)
#> [1] "counts"      "abundance"   "avgTxLength" "vst"         "logCPM"     
#> [6] "TMM"

access

assays(se)[["counts"]][1:5,1:5]
#>       ME16-E3M1L ME16-E5M1L ME16-E6M1R ME16-E6M1L ME16-E7M1R
#> Axin2       1080       2844       3129       2747       1850
#> Mmp14       2031       8334       8328       8626       6147
#> Timp1         60         96        209        265         66
#> Pax9        3063       6011       7177       6438       3794
#> Irx2         519       1237       1160       1056        802
assays(se)[["TMM"]][1:5,1:5]
#>       ME16-E3M1L ME16-E5M1L ME16-E6M1R ME16-E6M1L ME16-E7M1R
#> Axin2  266.12685  485.34661  496.17821  460.53011  432.54157
#> Mmp14  500.46634 1422.24988 1320.60470 1446.13495 1437.20705
#> Timp1   14.78483   16.38301   33.14198   44.42682   15.43121
#> Pax9   754.76533 1025.81522 1138.08597 1079.32029  887.06093
#> Irx2   127.88874  211.10188  183.94590  177.03669  187.51262
assays(se)[["logCPM"]][1:5,1:5]
#>       ME16-E3M1L ME16-E5M1L ME16-E6M1R ME16-E6M1L ME16-E7M1R
#> Axin2   8.031398   8.883492   8.916061   8.810157   8.733173
#> Mmp14   8.941266  10.433573  10.327398  10.459897  10.464346
#> Timp1   3.907562   4.037786   5.032632   5.451326   3.969329
#> Pax9    9.533527   9.962371  10.112899  10.037992   9.768499
#> Irx2    6.977147   7.684397   7.487015   7.433479   7.529533

DESeq2 Transformation

DESeq2 provides three different transformations

VST

variance stabilizing transformation (VST)

se <- broadSeq::transformDESeq2(se,method = "vst"  )
#> using 'avgTxLength' from assays(dds), correcting for library size

Normalized counts transformation

se <- broadSeq::transformDESeq2(se, method = "normTransform"  )
#> Warning in broadSeq::transformDESeq2(se, method = "normTransform"): For length correction assayname must match with avgTxLength
#> 
#> using 'avgTxLength' from assays(dds), correcting for library size

rlog

regularized log

se <- broadSeq::transformDESeq2(se, method = "rlog")
SummarizedExperiment::assayNames(se)
#> [1] "counts"        "abundance"     "avgTxLength"   "vst"          
#> [5] "logCPM"        "TMM"           "normTransform"

Comparision

Boxplot of different transforms for each sample.

p <- broadSeq::sampleAssay_plot(se[, se$species=="Mouse" ], 
                           assayName = "counts", fill = "stage", 
                           yscale = "log2")+ rremove("x.text")

p1 <- broadSeq::sampleAssay_plot(se[, se$species=="Mouse"], 
                           assayName = "vst", fill = "stage")+ rremove("x.text")

p2 <- broadSeq::sampleAssay_plot(se[, se$species=="Mouse"], 
                           assayName = "TMM", fill = "stage", 
                           yscale = "log10")+ rremove("x.text")

p3 <- broadSeq::sampleAssay_plot(se[, se$species=="Mouse"], 
                           assayName = "logCPM", fill = "stage")+ rremove("x.text")


ggarrange(p,p1,p2,p3, common.legend = TRUE, labels = c("A","B","C"))

Plot standard deviations versus means expression

if (requireNamespace("vsn", quietly = TRUE)) {
    library("vsn")
    x <- meanSdPlot(
        log2(assays(se[, se$species == "Rat"])[["counts"]]+1),
        plot = FALSE)
    print(x$gg +ggtitle(label = "log2(n+1) "))
    
    x <- meanSdPlot(
        assays(se[, se$species == "Rat"])[["vst"]],
        plot = FALSE)
    
    print(x$gg +ggtitle(label = "Vst"))
    
    x <- meanSdPlot(
        assays(se[, se$species == "Rat"])[["logCPM"]],
        plot = FALSE)
    print(x$gg + ggtitle(label = "logCPM"))
}    

Visualization of gene Expression

## Multiple assay of a single gene
broadSeq::assay_plot(se, feature = c("Shh"), 
           assayNames = c("counts","logCPM","vst","TMM"),
           x = "stage", fill="species", add="dotplot", palette = "npg")


## Expression of multiple genes from a single assay
broadSeq::genes_plot(se, 
                     features = c("Shh","Edar"), 
                     facet.by = "symbol",
                     x = "stage", assayName = "vst", fill="species", palette = "jco")

Pre-defined or custom color palette based on journals

Scientific journal palettes from ggsci R package, e.g.: “npg”, “aaas”, “lancet”, “jco”, “ucscgb”, “uchicago”, “simpsons” and “rickandmorty” can be passed for coloring or filling by groups from metadata.

jco <- broadSeq::genes_plot(se[,se$species == "Mouse"], 
                     features = c("Shh"), facet.by = "symbol", assayName =  "logCPM",
                     x = "stage",  fill="stage", add="dotplot", xlab = "", 
                     title = "Journal of Clinical Oncology", palette = "jco") 

npg <- broadSeq::genes_plot(se[,se$species == "Mouse"], 
                     features = c("Shh"), facet.by = "symbol",assayName =  "logCPM",
                     x = "stage", fill="stage", add="dotplot", xlab = "",
                     title = "Nature Publishing Group", palette = "npg") 

aaas <- broadSeq::genes_plot(se[,se$species == "Mouse"], 
                     features = c("Shh"), facet.by = "symbol", assayName = "logCPM",
                     x = "stage", fill="stage", add="dotplot", xlab = "",
                     title = "Science", palette = "aaas")

nejm <- broadSeq::genes_plot(se[,se$species == "Mouse"], 
                     features = c("Shh"), facet.by = "symbol", assayName = "logCPM",
                     x = "stage", fill="stage", add="dotplot",  xlab = "",
                     title = "New England Journal of Medicine",palette = "nejm")

# ggarrange(jco,npg,aaas,nejm, 
#           common.legend = TRUE,legend = "none",
#           labels = c("A","B","C","D"))

ggarrange(jco+ggpubr::rotate_x_text(), npg+ggpubr::rotate_x_text(),
          aaas+ggpubr::rotate_x_text(),nejm+ggpubr::rotate_x_text(),
          nrow = 1, common.legend = TRUE,legend = "none",
          labels = c("A","B","C","D")) %>% 
    annotate_figure( top = text_grob("Color palette")) 

QC with Clustering

## Number of top variable genes
ntop=500
## How it matters?

MDS plot

Classical multidimensional scaling is based on measuring the distance between the samples.

Popular function plotMDS from limma does not work with SummarizedExperiment object. Here broadSeq provides this function through package ´cmdscale {stats}´.

[Classical multidimensional scaling (MDS) of a data matrix. Also known as principal coordinates analysis (Gower, 1966). ]

broadSeq::plot_MDS(se, scaledAssay = "vst", ntop=500, 
                   color = "species", shape = "stage", 
                   ellipse=TRUE, legend = "bottom")
#> Only using 500 most variable genes.

MDS is cool but it is not possible to know/visualize top variable genes with their meta data which is stored in se object

head(rowData(se))
#> DataFrame with 6 rows and 8 columns
#>             mouse_gene_id      symbol       Class chromosome_name
#>               <character> <character>    <factor>     <character>
#> Axin2  ENSMUSG00000000142       Axin2 Dispensable              11
#> Mmp14  ENSMUSG00000000957       Mmp14 Tissue                   14
#> Timp1  ENSMUSG00000001131       Timp1 Dispensable               X
#> Pax9   ENSMUSG00000001497        Pax9 Progression              12
#> Irx2   ENSMUSG00000001504        Irx2 Dispensable              13
#> Col1a1 ENSMUSG00000001506      Col1a1 Dispensable              11
#>        start_position   gene_biotype        vole_gene_id       Rnor_gene_id
#>             <integer>    <character>         <character>        <character>
#> Axin2       108811175 protein_coding Mglareolus_00009775 ENSRNOG00000055010
#> Mmp14        54669069 protein_coding Mglareolus_00038784 ENSRNOG00000010947
#> Timp1        20736405 protein_coding Mglareolus_00039838 ENSRNOG00000010208
#> Pax9         56738552 protein_coding Mglareolus_00000242 ENSRNOG00000008826
#> Irx2         72776939 protein_coding Mglareolus_00017335 ENSRNOG00000012742
#> Col1a1       94827050 protein_coding Mglareolus_00042400 ENSRNOG00000003897

Other methods can help to visualize gene information along with clustering information.

Hierarchical clustering and Heatmap

p_vst <- broadSeq::plotHeatmapCluster(
    se,
    scaledAssay = "vst",
    annotation_col = c("species", "stage"),
    annotation_row = c("Class","gene_biotype"),
    ntop = 30, show_geneAs = "symbol",
    cluster_cols = TRUE, cluster_rows = FALSE,
    show_rownames = TRUE, show_colnames = FALSE,
    main = "Top 30 variable gene vst"
)
#> Only using 30 most variable genes.

PCA plot

prcompTidy

Perform Principal Components Analysis with function broadSeq::prcompTidy() which returns a list of four data.frame objects:

  • pc_scores,

  • eigen_values,

  • loadings (eigen vectors) and

  • the original data.

    Compute PCA using any assay

computedPCA_logCPM <- broadSeq::prcompTidy(se, scaledAssay = "logCPM", ntop = 500)
#> Only using 500 most variable genes.
## PCA based on vst values
computedPCA_vst <- broadSeq::prcompTidy(se, scaledAssay = "vst", ntop = 500)
#> Only using 500 most variable genes.

Plot

logCPM
plotAnyPC(computedPCA = computedPCA_logCPM,
          x = 1, y = 2, color = "species", shape = "stage",
          legend = "bottom")

VST
pca_vst <- plotAnyPC(computedPCA = computedPCA_vst,
            x = 1, y = 2,  color = "species", shape = "stage", 
            legend = "bottom") 
pca_vst

Other PCs
computedPCA_vst$eigen_values %>%
        dplyr::filter(var_exp >= 2) %>%
    ggbarplot(x="PC",y="var_exp", label = TRUE, label.pos = "out")

It can be checked if there are other PCs to explain considerable variance. PC3 can be useful to see variance due to different developmental time points.

pca_vst_2_3 <-plotAnyPC(computedPCA = computedPCA_vst,
                x = 2, y = 3,  
                color = "species", shape = "stage", legend = "bottom")
# pca_vst_2_3

PC3 captures beautifully the variance in gene expression due to developmental stages.

Gene loading

computedPCA_vst %>% broadSeq::getFeatureLoadRanking(keep = c("symbol","Class")) %>% head()
#>    symbol        Class     loading  PC Rank
#> 1    Rpl4        Other -0.10384659 PC1    1
#> 2 Hsp90b1        Other -0.09727451 PC1    2
#> 3    Atrx Dev. process -0.09419071 PC1    3
#> 4   Rpl19        Other -0.09347617 PC1    4
#> 5     Ncl        Other -0.09181188 PC1    5
#> 6    Rpl3        Other -0.08743885 PC1    6

#  Top 5 genes of PC2

computedPCA_vst$loadings %>% top_n(5,abs(PC2)  ) %>% dplyr::select(gene,PC2)
#>                  gene        PC2
#> Aadacl2fm3 Aadacl2fm3  0.1635445
#> Rack1           Rack1 -0.1452180
#> Rpl11           Rpl11 -0.1474743
#> Rpl8             Rpl8 -0.1511175
#> Rplp0           Rplp0 -0.1586286

pca_vst_loading <- computedPCA_vst %>% 
    broadSeq::getFeatureLoadRanking(keep = c("symbol","Class"), topN = 50, pcs=1:10) %>% 
    dplyr::count(Class, PC) %>%
    ggbarplot(
        x = "PC", y = "n", fill = "Class",
        legend = "bottom", palette = c("red","blue","orange","purple","white","grey")
    ) 
# pca_vst_loading

Biplot

# By default it plots top 2 genes from each PC axis
pca_vst_bi <- broadSeq::biplotAnyPC(computedPCA = computedPCA_vst, 
            x = 1, y = 2, genesLabel = "symbol", 
            color = "species", shape = "stage", 
            legend = "bottom")
# pca_vst_bi
ggarrange(
    ggarrange(pca_vst_bi+ggtitle(label =  ""),
          pca_vst_2_3+ggtitle(label =  ""), common.legend = TRUE),
    pca_vst_loading, nrow = 2)

To plot user defined genes

Now plotting top 5 genes from PC3

# Top 5 genes of PC3
biplotAnyPC(computedPCA = computedPCA_vst,x = 2, y = 3, 
            color = "species", shape = "stage",
            genes= computedPCA_vst$loadings %>% 
                top_n(5,abs(PC3)) %>% pull(gene),
            genesLabel = "symbol")


## Plot progression gene "Shh" 
biplotAnyPC(computedPCA = computedPCA_vst,x = 2, y = 3, 
            color = "species", shape = "stage",
            genes=c("Shh"),
            genesLabel = "symbol")

sessionInfo

#> R Under development (unstable) (2024-03-18 r86148)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.19-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] vsn_3.71.1                  broadSeq_0.99.1            
#>  [3] SummarizedExperiment_1.33.3 Biobase_2.63.1             
#>  [5] GenomicRanges_1.55.4        GenomeInfoDb_1.39.13       
#>  [7] IRanges_2.37.1              S4Vectors_0.41.6           
#>  [9] BiocGenerics_0.49.1         MatrixGenerics_1.15.0      
#> [11] matrixStats_1.2.0           ggpubr_0.6.0               
#> [13] ggplot2_3.5.0               dplyr_1.1.4                
#> 
#> loaded via a namespace (and not attached):
#>   [1] splines_4.4.0           ggplotify_0.1.2         tibble_3.2.1           
#>   [4] polyclip_1.10-6         preprocessCore_1.65.0   XML_3.99-0.16.1        
#>   [7] lifecycle_1.0.4         rstatix_0.7.2           edgeR_4.1.21           
#>  [10] doParallel_1.0.17       lattice_0.22-6          MASS_7.3-60.2          
#>  [13] backports_1.4.1         magrittr_2.0.3          limma_3.59.6           
#>  [16] sass_0.4.9              rmarkdown_2.26          jquerylib_0.1.4        
#>  [19] yaml_2.3.8              cowplot_1.1.3           DBI_1.2.2              
#>  [22] RColorBrewer_1.1-3      abind_1.4-5             zlibbioc_1.49.3        
#>  [25] Rtsne_0.17              purrr_1.0.2             ggraph_2.2.1           
#>  [28] yulab.utils_0.1.4       tweenr_2.0.3            circlize_0.4.16        
#>  [31] seriation_1.5.4         GenomeInfoDbData_1.2.12 enrichplot_1.23.1      
#>  [34] ggrepel_0.9.5           tidytree_0.4.6          genefilter_1.85.1      
#>  [37] pheatmap_1.0.12         annotate_1.81.2         codetools_0.2-20       
#>  [40] DelayedArray_0.29.9     DOSE_3.29.2             ggforce_0.4.2          
#>  [43] tidyselect_1.2.1        shape_1.4.6.1           aplot_0.2.2            
#>  [46] UCSC.utils_0.99.5       farver_2.1.1            viridis_0.6.5          
#>  [49] TSP_1.2-4               jsonlite_1.8.8          GetoptLong_1.0.5       
#>  [52] tidygraph_1.3.1         randomcoloR_1.1.0.1     survival_3.5-8         
#>  [55] iterators_1.0.14        foreach_1.5.2           tools_4.4.0            
#>  [58] treeio_1.27.0           sechm_1.11.0            Rcpp_1.0.12            
#>  [61] glue_1.7.0              gridExtra_2.3           SparseArray_1.3.4      
#>  [64] xfun_0.43               DESeq2_1.43.4           qvalue_2.35.0          
#>  [67] ca_0.71.1               withr_3.0.0             BiocManager_1.30.22    
#>  [70] fastmap_1.1.1           fansi_1.0.6             digest_0.6.35          
#>  [73] R6_2.5.1                gridGraphics_0.5-1      colorspace_2.1-0       
#>  [76] GO.db_3.19.0            RSQLite_2.3.6           ggsci_3.0.3            
#>  [79] hexbin_1.28.3           utf8_1.2.4              tidyr_1.3.1            
#>  [82] generics_0.1.3          data.table_1.15.4       graphlayouts_1.1.1     
#>  [85] httr_1.4.7              S4Arrays_1.3.6          scatterpie_0.2.2       
#>  [88] pkgconfig_2.0.3         gtable_0.3.4            registry_0.5-1         
#>  [91] blob_1.2.4              ComplexHeatmap_2.19.0   XVector_0.43.1         
#>  [94] clusterProfiler_4.11.0  shadowtext_0.1.3        htmltools_0.5.8.1      
#>  [97] carData_3.0-5           fgsea_1.29.0            clue_0.3-65            
#> [100] scales_1.3.0            png_0.1-8               ggfun_0.1.4            
#> [103] knitr_1.46              reshape2_1.4.4          rjson_0.2.21           
#> [106] nlme_3.1-164            curl_5.2.1              cachem_1.0.8           
#> [109] GlobalOptions_0.1.2     stringr_1.5.1           parallel_4.4.0         
#> [112] HDO.db_0.99.1           AnnotationDbi_1.65.2    pillar_1.9.0           
#> [115] grid_4.4.0              vctrs_0.6.5             car_3.1-2              
#> [118] xtable_1.8-4            cluster_2.1.6           evaluate_0.23          
#> [121] cli_3.6.2               locfit_1.5-9.9          compiler_4.4.0         
#> [124] rlang_1.1.3             crayon_1.5.2            ggsignif_0.6.4         
#> [127] labeling_0.4.3          forcats_1.0.0           affy_1.81.0            
#> [130] plyr_1.8.9              fs_1.6.3                stringi_1.8.3          
#> [133] viridisLite_0.4.2       BiocParallel_1.37.1     munsell_0.5.1          
#> [136] Biostrings_2.71.5       lazyeval_0.2.2          V8_4.4.2               
#> [139] GOSemSim_2.29.1         Matrix_1.7-0            patchwork_1.2.0        
#> [142] bit64_4.0.5             KEGGREST_1.43.0         statmod_1.5.0          
#> [145] highr_0.10              igraph_2.0.3            broom_1.0.5            
#> [148] memoise_2.0.1           affyio_1.73.0           bslib_0.7.0            
#> [151] ggtree_3.11.1           fastmatch_1.1-4         bit_4.0.5              
#> [154] ape_5.7-1               gson_0.1.0