--- title: "Introduction to excluderanges" author: - name: Mikhail Dozmorov affiliation: - Virginia Commonwealth University email: mikhail.dozmorov@gmail.com output: BiocStyle::html_document: self_contained: yes toc: true toc_float: true toc_depth: 2 code_folding: show date: "`r Sys.Date()`" vignette: > %\VignetteIndexEntry{excluderanges} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} bibliography: references.bib editor_options: chunk_output_type: console --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) library(BiocStyle) ``` # Exclude ranges Genomic ranges of problematic genomic regions that should be avoided when working with genomic data. For human, mouse, and selected model organisms. TL;DR - For human hg38 genome assembly, [Anshul](https://twitter.com/anshulkundaje) [recommends](https://twitter.com/anshulkundaje/status/1263546023151992832?s=20) [ENCFF356LFX exclusion list regions](https://www.encodeproject.org/files/ENCFF356LFX/). BED files of exclusion regions are available on the [ENCODE project](https://www.encodeproject.org/search/?searchTerm=exclusion+list) website [@Amemiya:2019aa]. Human (hg19, hg38) and mouse (mm9, mm10) exclusion regions are available. However, exclusion lists generated by multiple labs often create uncertainty what to use. The purpose of this package is to provide a unified place for informed retrieval of exclusion regions. Naming convention: `..`, e.g., `hg19.Birney.wgEncodeDacMapabilityConsensusExcludable`. See [make-data.R](../inst/scripts/make-data.R) how to create the excluderanges GRanges objects. ## Install `excluderanges` ```{r "install", eval = FALSE} if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } # Install the development version of Bioconductor (need 3.14 and above) # BiocManager::install(version = "devel") # Check that you have a valid Bioconductor installation # BiocManager::valid() # Install the package BiocManager::install("excluderanges", version = "devel") # BiocManager::install("mdozmorov/excluderanges") ``` ## Use excluderanges Get an overview of what's available ```{r AnnotationHub} suppressMessages(library(AnnotationHub)) ah <- AnnotationHub() query_data <- query(ah, "excluderanges") query_data ``` hg38 excluderanges coordinates recommended by Anshul ```{r hg38excluderanges} # Check titles # as.data.frame(mcols(query_data[1:10])["title"]) excludeGR.hg38.Kundaje.1 <- query_data[["AH95917"]] excludeGR.hg38.Kundaje.1 ``` Save the data in a BED file, if needed. ```{r eval=FALSE} rtracklayer::export(excludeGR.hg38.Kundaje.1, "hg38.Kundaje.GRCh38_unified_Excludable.bed", format = "bed") ``` We can load other excludable regions for the hg38 genome assembly and compare them. ```{r allhg38excluderanges} query_data <- query(ah, c("excluderanges", "hg38", "Exclusion regions")) query_data excludeGR.hg38.Bernstein <- query_data[["AH95915"]] excludeGR.hg38.Kundaje.2 <- query_data[["AH95916"]] excludeGR.hg38.Reddy <- query_data[["AH95918"]] excludeGR.hg38.Wold <- query_data[["AH95919"]] excludeGR.hg38.Yeo <- query_data[["AH95920"]] ``` Compare the number of excludable regions. ```{r excluderanges_hg38_count, fig.width=6.5, fig.height=2} library(ggplot2) mtx_to_plot <- data.frame(Count = c(length(excludeGR.hg38.Bernstein), length(excludeGR.hg38.Kundaje.1), length(excludeGR.hg38.Kundaje.2), length(excludeGR.hg38.Reddy), length(excludeGR.hg38.Wold), length(excludeGR.hg38.Yeo)), Source = c("Bernstein.Mint_Excludable_GRCh38", "Kundaje.GRCh38_unified_Excludable", "Kundaje.GRCh38.Excludable", "Reddy.wgEncodeDacMapabilityConsensusExcludable", "Wold.hg38mitoExcludable", "Yeo.eCLIP_Excludableregions.hg38liftover.bed")) # Order Source by the number of regions mtx_to_plot$Source <- factor(mtx_to_plot$Source, levels = mtx_to_plot$Source[order(mtx_to_plot$Count)]) ggplot(mtx_to_plot, aes(x = Source, y = Count, fill = Source)) + geom_bar(stat = "identity") + coord_flip() + theme_bw() + theme(legend.position = "none") # ggsave("man/figures/excluderanges_hg38_count.png", width = 5.5, height = 2) ``` ```{r echo=FALSE, eval=FALSE} knitr::include_graphics('man/figures/excluderanges_hg38_count.png') ``` Compare the width of excludable regions. log2 scale because of heavy right tail distributions. ```{r excluderanges_hg38_width, fig.width=6.5, fig.height=2} library(ggridges) mtx_to_plot <- data.frame(Width = c(width(excludeGR.hg38.Bernstein), width(excludeGR.hg38.Kundaje.1), width(excludeGR.hg38.Kundaje.2), width(excludeGR.hg38.Reddy), width(excludeGR.hg38.Wold), width(excludeGR.hg38.Yeo)), Source = c(rep("Bernstein.Mint_Excludable_GRCh38", length(excludeGR.hg38.Bernstein)), rep("Kundaje.GRCh38_unified_Excludable", length(excludeGR.hg38.Kundaje.1)), rep("Kundaje.GRCh38.Excludable", length(excludeGR.hg38.Kundaje.2)), rep("Reddy.wgEncodeDacMapabilityConsensusExcludable", length(excludeGR.hg38.Reddy)), rep("Wold.hg38mitoExcludable", length(excludeGR.hg38.Wold)), rep("Yeo.eCLIP_Excludableregions.hg38liftover.bed", length(excludeGR.hg38.Yeo)))) ggplot(mtx_to_plot, aes(x = log2(Width), y = Source, fill = Source)) + geom_density_ridges() + theme_bw() + theme(legend.position = "none") # ggsave("man/figures/excluderanges_hg38_width.png", width = 5.5, height = 2) ``` ```{r echo=FALSE, eval=FALSE} knitr::include_graphics('man/figures/excluderanges_hg38_width.png') ``` We can investigate the total width of each set of excludable ranges. ```{r excluderanges_hg38_sumwidth, fig.width=6.5, fig.height=2} mtx_to_plot <- data.frame(TotalWidth = c(sum(width(excludeGR.hg38.Bernstein)), sum(width(excludeGR.hg38.Kundaje.1)), sum(width(excludeGR.hg38.Kundaje.2)), sum(width(excludeGR.hg38.Reddy)), sum(width(excludeGR.hg38.Wold)), sum(width(excludeGR.hg38.Yeo))), Source = c("Bernstein.Mint_Excludable_GRCh38", "Kundaje.GRCh38_unified_Excludable", "Kundaje.GRCh38.Excludable", "Reddy.wgEncodeDacMapabilityConsensusExcludable", "Wold.hg38mitoExcludable", "Yeo.eCLIP_Excludableregions.hg38liftover")) ggplot(mtx_to_plot, aes(x = TotalWidth, y = Source, fill = Source)) + geom_bar(stat="identity") + scale_x_log10() + scale_y_discrete(label=abbreviate) + xlab("log10 total width") # ggsave("man/figures/excluderanges_hg38_sumwidth.png", width = 6.5, height = 2) ``` ```{r echo=FALSE, eval=FALSE} knitr::include_graphics('man/figures/excluderanges_hg38_sumwidth.png') ``` We can compare Jaccard overlap between those sets of excludable regions. ```{r excluderanges_hg38_jaccard, warning=FALSE, fig.width=6.5, fig.height=6} library(pheatmap) library(stringr) # Jaccard calculations jaccard <- function(gr_a, gr_b) { intersects <- GenomicRanges::intersect(gr_a, gr_b, ignore.strand = TRUE) intersection <- sum(width(intersects)) union <- sum(width(GenomicRanges::union(gr_a, gr_b, ignore.strand = TRUE))) DataFrame(intersection, union, jaccard = intersection/union, n_intersections = length(intersects)) } # List and names of all excludable regions all_excludeGR_list <- list(excludeGR.hg38.Bernstein, excludeGR.hg38.Kundaje.1, excludeGR.hg38.Kundaje.2, excludeGR.hg38.Reddy, excludeGR.hg38.Wold, excludeGR.hg38.Yeo) all_excludeGR_name <- c("Bernstein.Mint_Excludable_GRCh38", "Kundaje.GRCh38_unified_Excludable", "Kundaje.GRCh38.Excludable", "Reddy.wgEncodeDacMapabilityConsensusExcludable", "Wold.hg38mitoExcludable", "Yeo.eCLIP_Excludableregions.hg38liftover") # Correlation matrix, empty mtx_to_plot <- matrix(data = 0, nrow = length(all_excludeGR_list), ncol = length(all_excludeGR_list)) # Fill it in for (i in 1:length(all_excludeGR_list)) { for (j in 1:length(all_excludeGR_list)) { # If diagonal, set to zero if (i == j) mtx_to_plot[i, j] <- 0 # Process only one half, the other is symmetric if (i > j) { mtx_to_plot[i, j] <- mtx_to_plot[j, i] <- jaccard(all_excludeGR_list[[i]], all_excludeGR_list[[j]])[["jaccard"]] } } } # Trim row/colnames rownames(mtx_to_plot) <- colnames(mtx_to_plot) <- str_trunc(all_excludeGR_name, width = 25) # Save the plot # png("man/figures/excluderanges_hg38_jaccard.png", width = 1000, height = 900, res = 200) pheatmap(data.matrix(mtx_to_plot)) # dev.off() ``` ```{r echo=FALSE, eval=FALSE} knitr::include_graphics('man/figures/excluderanges_hg38_jaccard.png') ``` Note that some excludable ranges objects contain six columns, implying there may be some interesting metadata. Let's explore one. ```{r excluderanges_hg38_Reddy_metadata, fig.width=6.5, fig.height=3} mcols(excludeGR.hg38.Reddy) mtx_to_plot <- as.data.frame(table(mcols(excludeGR.hg38.Reddy)[["name"]])) colnames(mtx_to_plot) <- c("Type", "Number") mtx_to_plot <- mtx_to_plot[order(mtx_to_plot$Number), ] mtx_to_plot$Type <- factor(mtx_to_plot$Type, levels = mtx_to_plot$Type) ggplot(mtx_to_plot, aes(x = Number, y = Type, fill = Type)) + geom_bar(stat="identity") + theme_bw() + theme(legend.position = "none") # ggsave("man/figures/excluderanges_hg38_Reddy_metadata.png", width = 5, height = 2.5) ``` ```{r echo=FALSE, eval=FALSE} knitr::include_graphics('man/figures/excluderanges_hg38_Reddy_metadata.png') ``` One may decide to combine the excludable ranges from all labs, although from previous results we may decide to follow Anshul's [advice](https://twitter.com/anshulkundaje/status/1263546023151992832?s=20) advice about the [ENCFF356LFX exclusion list regions](https://www.encodeproject.org/files/ENCFF356LFX/) and use the `excludeGR.hg38.Kundaje.1` object. ```{r combinedexcluderanges} excludeGR.hg38.all <- reduce(c(excludeGR.hg38.Bernstein, excludeGR.hg38.Kundaje.1, excludeGR.hg38.Kundaje.2, excludeGR.hg38.Reddy, excludeGR.hg38.Wold, excludeGR.hg38.Yeo)) # Keep only standard chromosomes excludeGR.hg38.all <- keepStandardChromosomes(excludeGR.hg38.all, pruning.mode = "coarse") print(length(excludeGR.hg38.all)) summary(width(excludeGR.hg38.all)) ``` ## Centromeres, telomeres, etc. Besides the ENCODE-produced excludable regions, we may want to exclude centromeres, telomeres, and other gap locations. The "Gap Locations" track for Homo Sapiens is available for the GRcH37/hg19 genome assembly as a [UCSC 'gap' table](http://genome.ucsc.edu/cgi-bin/hgTables?db=hg19&hgta_group=map&hgta_track=gap&hgta_table=gap&hgta_doSchema=describe+table+schema). It can be retrieved from `r BiocStyle::Biocpkg("AnnotationHub")`, but lacks the metadata columns needed to decide the type of gaps. ```{r eval=FALSE} # Search for the gap track # ahData <- query(ah, c("gap", "Homo sapiens", "hg19")) # ahData[ahData$title == "Gap"] gaps <- ahData[["AH6444"]] ``` The [UCSC 'gap' table](http://genome.ucsc.edu/cgi-bin/hgTables?db=hg19&hgta_group=map&hgta_track=gap&hgta_table=gap&hgta_doSchema=describe+table+schema) provides better granularity about the types of gaps available. E.g., for human, hg19, we have the following types and the number of gaps. ```{r echo=FALSE, out.height="70%", out.width="70%"} knitr::include_graphics('../man/figures/excluderanges_hg19_gaps_number.png') ``` Those objects are provided as individual GRanges. Naming convention: `.UCSC.`, e.g., `hg19.UCSC.gap_centromere`. We can similarly load any gap type object. ```{r gapshg19} query_data <- query(ah, c("excluderanges", "UCSC", "Homo Sapiens", "hg19")) query_data gapsGR_hg19_centromere <- query_data[["AH95927"]] gapsGR_hg19_centromere ``` ### Centromeres for the hg38 genome assembly Note that the UCSC 'gap' table for the hg38 human genome assembly does not contain genomic coordinates for the "centromere" gap type. These can be obtained from the `r BiocStyle::Biocpkg("rCGH")` package as follows: ```{r gapshg38} suppressPackageStartupMessages(library(rCGH)) suppressPackageStartupMessages(library(GenomicRanges)) # hg38 # data.frame # Adjust chromosome names hg38$chrom[hg38$chrom == 23] <- "X" hg38$chrom[hg38$chrom == 24] <- "Y" hg38$chrom <- paste0("chr", hg38$chrom) # Make GRanges object hg38.UCSC.centromere <- makeGRangesFromDataFrame(hg38, seqnames.field = "chrom", start.field = "centromerStart", end.field = "centromerEnd") # Assign seqinfo data seqlengths(hg38.UCSC.centromere) <- hg38$length genome(hg38.UCSC.centromere) <- "hg38" # Resulting object hg38.UCSC.centromere ``` The `r BiocStyle::Biocpkg("rCGH")` package also contains data for the `hg19` and `hg18` genomes. The `hg19` centromere data is equivalent to the `hg19.UCSC.centromere` object provided in our `r BiocStyle::Biocpkg("excluderanges")` package. ## Source data for the excludable regions ```{r echo=FALSE} mtx <- read.csv("../inst/extdata/table_excluderanges.csv") knitr::kable(mtx) ``` ```{r echo=FALSE} mtx <- read.csv("../inst/extdata/table_gap.csv") knitr::kable(mtx) ``` Download all data from the [Google Drive folder](https://drive.google.com/drive/folders/1MFn6LPZD1zZRQz7biR0Fyl-mzkx4CIxS?usp=sharing) ## `R` session information. ```{r reproduce3, echo=FALSE} ## Session info sessionInfo() ``` # References