## ----style, echo = FALSE, results = 'asis'--------------------------------- BiocStyle::markdown() ## ----setup, echo=FALSE----------------------------------------------------- knitr::opts_chunk$set( eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")), cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE")) ) suppressPackageStartupMessages({ library(Biostrings) library(GenomicRanges) }) ## ----setup-objects--------------------------------------------------------- library(Biostrings) library(GenomicRanges) ## ----Biostrings, message=FALSE--------------------------------------------- library(Biostrings) # Biological sequences data(phiX174Phage) # sample data, see ?phiX174Phage phiX174Phage m <- consensusMatrix(phiX174Phage)[1:4,] # nucl. x position counts polymorphic <- which(colSums(m != 0) > 1) m[, polymorphic] ## ----methods, eval=FALSE--------------------------------------------------- # methods(class=class(phiX174Phage)) # 'DNAStringSet' methods ## ----phiX------------------------------------------------------------------ library(Biostrings) data(phiX174Phage) ## ----consensusMatrix------------------------------------------------------- m <- consensusMatrix(phiX174Phage)[1:4,] polymorphic <- which(colSums(m != 0) > 1) mapply(substr, polymorphic, polymorphic, MoreArgs=list(x=phiX174Phage)) ## ----iranges--------------------------------------------------------------- library(IRanges) ir <- IRanges(start=c(10, 20, 30), width=5) ir ## ----iranges-flank--------------------------------------------------------- flank(ir, 3) ## ----iranges-class--------------------------------------------------------- class(ir) getClass(class(ir)) ## ----iranges-flank-method, eval=FALSE-------------------------------------- # ?"flank,Ranges-method" ## ----granges--------------------------------------------------------------- library(GenomicRanges) gr <- GRanges(c("chr1", "chr1", "chr2"), ir, strand=c("+", "-", "+")) gr ## ----granges-flank--------------------------------------------------------- flank(gr, 3) ## ----granges-class--------------------------------------------------------- class(gr) getClass(class(gr)) ## ----granges-flank-method, eval=FALSE-------------------------------------- # ?"flank,GenomicRanges-method" ## ----granges-methods, eval=FALSE------------------------------------------- # showMethods(class="GRanges", where=search()) ## ----granges-man-and-vignettes, eval=FALSE--------------------------------- # help(package="GenomicRanges") # vignette(package="GenomicRanges") # vignette(package="GenomicRanges", "GenomicRangesHOWTOs") ## ----BSgenome-require, message=FALSE--------------------------------------- library(BSgenome.Hsapiens.UCSC.hg38) chr14_range = GRanges("chr14", IRanges(1, seqlengths(Hsapiens)["chr14"])) chr14_dna <- getSeq(Hsapiens, chr14_range) letterFrequency(chr14_dna, "GC", as.prob=TRUE) ## -------------------------------------------------------------------------- library(Biostrings) url <- "ftp://ftp.ensembl.org/pub/release-92/fasta/mus_musculus/cds/Mus_musculus.GRCm38.cds.all.fa.gz" fl <- BiocFileCache::bfcrpath(rnames = url) cds <- rtracklayer::import(fl, "fasta") ## -------------------------------------------------------------------------- pred1 <- width(cds) %% 3 == 0 table(pred1) pred2 <- narrow(cds, 1, 3) == "ATG" stops <- c("TAA", "TAG", "TGA") pred3 <- narrow(cds, width(cds) - 2, width(cds)) %in% stops table(pred1 & pred2 & pred3) cds <- cds[ pred1 & pred2 & pred3 ] ## -------------------------------------------------------------------------- hist(log10(width(cds))) cds[ which.max(width(cds)) ] names(cds)[ which.max(width(cds)) ] ## -------------------------------------------------------------------------- gc <- letterFrequency(cds, "GC", as.prob=TRUE) head(gc) hist(gc) plot( log10(width(cds)), gc, pch=".") ## -------------------------------------------------------------------------- AMINO_ACID_CODE aa <- translate(cds) codon_use <- letterFrequency(aa, names(AMINO_ACID_CODE)) head(codon_use) ## -------------------------------------------------------------------------- mcols(cds) <- DataFrame( GC = gc[,"G|C"] ) mcols(cds, use.names = FALSE) mcols(cds[1:3], use.names = FALSE) ## -------------------------------------------------------------------------- library(airway) data(airway) airway ## -------------------------------------------------------------------------- colData(airway) airway[ , airway$dex == "untrt"] ## -------------------------------------------------------------------------- colSums(assay(airway)) ## ---- message=FALSE-------------------------------------------------------- url <- "https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/manno_mouse.rds" fl <- BiocFileCache::bfcrpath(rnames = url) sce <- readRDS(fl) ## ----ranges, message=FALSE------------------------------------------------- library(GenomicRanges) gr <- GRanges(c("chr1:10-14:+", "chr1:20-24:+", "chr1:22-26:+")) shift(gr, 1) # 1-based coordinates! range(gr) # intra-range reduce(gr) # inter-range coverage(gr) setdiff(range(gr), gr) # 'introns' ## -------------------------------------------------------------------------- genes <- GRanges(c("chr1:30-40:+", "chr1:60-70:-")) snps <- GRanges(c("chr1:35", "chr1:60", "chr1:45")) countOverlaps(snps, genes) > 0 ## -------------------------------------------------------------------------- reg <- GRanges(c("chr1:50-55", "chr1:75-80")) nearest(reg, genes) precede(reg, genes) ## -------------------------------------------------------------------------- reads <- GRanges(c("chr1:10-19", "chr1:15-24", "chr1:30-41")) coverage(reads, width = 100) as(coverage(reads, width = 100), "GRanges") ## ----bam-require----------------------------------------------------------- library(GenomicRanges) library(GenomicAlignments) library(Rsamtools) ## our 'region of interest' roi <- GRanges("chr14", IRanges(19653773, width=1)) ## sample data library('RNAseqData.HNRNPC.bam.chr14') bf <- BamFile(RNAseqData.HNRNPC.bam.chr14_BAMFILES[[1]], asMates=TRUE) ## alignments, junctions, overlapping our roi paln <- readGAlignmentsList(bf) j <- summarizeJunctions(paln, with.revmap=TRUE) j_overlap <- j[j %over% roi] ## supporting reads paln[j_overlap$revmap[[1]]] ## ----vcf, message=FALSE---------------------------------------------------- ## input variants library(VariantAnnotation) fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation") vcf <- readVcf(fl, "hg19") seqlevels(vcf) <- "chr22" ## known gene model library(TxDb.Hsapiens.UCSC.hg19.knownGene) coding <- locateVariants(rowRanges(vcf), TxDb.Hsapiens.UCSC.hg19.knownGene, CodingVariants()) head(coding) ## ----summarizeOverlaps-roi, message=FALSE---------------------------------- library(TxDb.Hsapiens.UCSC.hg19.knownGene) exByGn <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "gene") ## only chromosome 14 seqlevels(exByGn, pruning.mode="coarse") = "chr14" ## ----summarizeOverlaps-bam, message=FALSE---------------------------------- library(RNAseqData.HNRNPC.bam.chr14) length(RNAseqData.HNRNPC.bam.chr14_BAMFILES) ## ----summarizeOverlaps----------------------------------------------------- ## next 2 lines optional; non-Windows library(BiocParallel) register(MulticoreParam(workers=detectCores())) olaps <- summarizeOverlaps(exByGn, RNAseqData.HNRNPC.bam.chr14_BAMFILES) ## ----summarizeOverlaps-explore--------------------------------------------- olaps head(assay(olaps)) colSums(assay(olaps)) # library sizes plot(sum(width(olaps)), rowMeans(assay(olaps)), log="xy") ## ----summarizeOverlaps-gc-------------------------------------------------- library(BSgenome.Hsapiens.UCSC.hg19) sequences <- getSeq(BSgenome.Hsapiens.UCSC.hg19, rowRanges(olaps)) gcPerExon <- letterFrequency(unlist(sequences), "GC") gc <- relist(as.vector(gcPerExon), sequences) gc_percent <- sum(gc) / sum(width(olaps)) plot(gc_percent, rowMeans(assay(olaps)), log="y") ## -------------------------------------------------------------------------- sessionInfo()