## ----style, echo = FALSE, results = 'asis'---------------------------------
BiocStyle::markdown()

## ----setup, echo=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
    eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")),
    cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE"))
)
suppressPackageStartupMessages({
    library(Biostrings)
    library(GenomicRanges)
})

## ----setup-objects---------------------------------------------------------
library(Biostrings)
library(GenomicRanges)

## ----Biostrings, message=FALSE---------------------------------------------
library(Biostrings)                     # Biological sequences
data(phiX174Phage)                      # sample data, see ?phiX174Phage
phiX174Phage
m <- consensusMatrix(phiX174Phage)[1:4,] # nucl. x position counts
polymorphic <- which(colSums(m != 0) > 1)
m[, polymorphic]

## ----methods, eval=FALSE---------------------------------------------------
#  methods(class=class(phiX174Phage))      # 'DNAStringSet' methods

## ----phiX------------------------------------------------------------------
library(Biostrings)
data(phiX174Phage)

## ----consensusMatrix-------------------------------------------------------
m <- consensusMatrix(phiX174Phage)[1:4,]
polymorphic <- which(colSums(m != 0) > 1)
mapply(substr, polymorphic, polymorphic, MoreArgs=list(x=phiX174Phage))

## ----iranges---------------------------------------------------------------
library(IRanges)
ir <- IRanges(start=c(10, 20, 30), width=5)
ir

## ----iranges-flank---------------------------------------------------------
flank(ir, 3)

## ----iranges-class---------------------------------------------------------
class(ir)
getClass(class(ir))

## ----iranges-flank-method, eval=FALSE--------------------------------------
#  ?"flank,Ranges-method"

## ----granges---------------------------------------------------------------
library(GenomicRanges)
gr <- GRanges(c("chr1", "chr1", "chr2"), ir, strand=c("+", "-", "+"))
gr

## ----granges-flank---------------------------------------------------------
flank(gr, 3)

## ----granges-class---------------------------------------------------------
class(gr)
getClass(class(gr))

## ----granges-flank-method, eval=FALSE--------------------------------------
#  ?"flank,GenomicRanges-method"

## ----granges-methods, eval=FALSE-------------------------------------------
#  showMethods(class="GRanges", where=search())

## ----granges-man-and-vignettes, eval=FALSE---------------------------------
#  help(package="GenomicRanges")
#  vignette(package="GenomicRanges")
#  vignette(package="GenomicRanges", "GenomicRangesHOWTOs")

## ----BSgenome-require, message=FALSE---------------------------------------
library(BSgenome.Hsapiens.UCSC.hg38)
chr14_range = GRanges("chr14", IRanges(1, seqlengths(Hsapiens)["chr14"]))
chr14_dna <- getSeq(Hsapiens, chr14_range)
letterFrequency(chr14_dna, "GC", as.prob=TRUE)

## --------------------------------------------------------------------------
library(Biostrings)
url <- "ftp://ftp.ensembl.org/pub/release-92/fasta/mus_musculus/cds/Mus_musculus.GRCm38.cds.all.fa.gz"
fl <- BiocFileCache::bfcrpath(rnames = url)
cds <- rtracklayer::import(fl, "fasta")

## --------------------------------------------------------------------------
pred1 <- width(cds) %% 3 == 0
table(pred1)
pred2 <- narrow(cds, 1, 3) == "ATG"
stops <- c("TAA", "TAG", "TGA")
pred3 <- narrow(cds, width(cds) - 2, width(cds)) %in% stops
table(pred1 & pred2 & pred3)
cds <- cds[ pred1 & pred2 & pred3 ]

## --------------------------------------------------------------------------
hist(log10(width(cds)))
cds[ which.max(width(cds)) ]
names(cds)[ which.max(width(cds)) ]

## --------------------------------------------------------------------------
gc <- letterFrequency(cds, "GC", as.prob=TRUE)
head(gc)
hist(gc)
plot( log10(width(cds)), gc, pch=".")

## --------------------------------------------------------------------------
AMINO_ACID_CODE
aa <- translate(cds)
codon_use <- letterFrequency(aa, names(AMINO_ACID_CODE))
head(codon_use)

## --------------------------------------------------------------------------
mcols(cds) <- DataFrame(
    GC = gc[,"G|C"]
)
mcols(cds, use.names = FALSE)
mcols(cds[1:3], use.names = FALSE)

## --------------------------------------------------------------------------
library(airway)
data(airway)
airway

## --------------------------------------------------------------------------
colData(airway)
airway[ , airway$dex == "untrt"]

## --------------------------------------------------------------------------
colSums(assay(airway))

## ---- message=FALSE--------------------------------------------------------
url <- "https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/manno_mouse.rds"
fl <- BiocFileCache::bfcrpath(rnames = url)
sce <- readRDS(fl)

## ----ranges, message=FALSE-------------------------------------------------
library(GenomicRanges)
gr <- GRanges(c("chr1:10-14:+", "chr1:20-24:+", "chr1:22-26:+"))
shift(gr, 1)                            # 1-based coordinates!
range(gr)                               # intra-range
reduce(gr)                              # inter-range
coverage(gr)
setdiff(range(gr), gr)                  # 'introns'

## --------------------------------------------------------------------------
genes <- GRanges(c("chr1:30-40:+", "chr1:60-70:-"))
snps <- GRanges(c("chr1:35", "chr1:60", "chr1:45"))
countOverlaps(snps, genes) > 0

## --------------------------------------------------------------------------
reg <- GRanges(c("chr1:50-55", "chr1:75-80"))
nearest(reg, genes)
precede(reg, genes)

## --------------------------------------------------------------------------
reads <- GRanges(c("chr1:10-19", "chr1:15-24", "chr1:30-41"))
coverage(reads, width = 100)
as(coverage(reads, width = 100), "GRanges")

## ----bam-require-----------------------------------------------------------
library(GenomicRanges)
library(GenomicAlignments)
library(Rsamtools)

## our 'region of interest'
roi <- GRanges("chr14", IRanges(19653773, width=1)) 
## sample data
library('RNAseqData.HNRNPC.bam.chr14')
bf <- BamFile(RNAseqData.HNRNPC.bam.chr14_BAMFILES[[1]], asMates=TRUE)
## alignments, junctions, overlapping our roi
paln <- readGAlignmentsList(bf)
j <- summarizeJunctions(paln, with.revmap=TRUE)
j_overlap <- j[j %over% roi]

## supporting reads
paln[j_overlap$revmap[[1]]]

## ----vcf, message=FALSE----------------------------------------------------
## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
    TxDb.Hsapiens.UCSC.hg19.knownGene,
    CodingVariants())
head(coding)

## ----summarizeOverlaps-roi, message=FALSE----------------------------------
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
exByGn <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "gene")
## only chromosome 14
seqlevels(exByGn, pruning.mode="coarse") = "chr14"

## ----summarizeOverlaps-bam, message=FALSE----------------------------------
library(RNAseqData.HNRNPC.bam.chr14)
length(RNAseqData.HNRNPC.bam.chr14_BAMFILES)

## ----summarizeOverlaps-----------------------------------------------------
## next 2 lines optional; non-Windows
library(BiocParallel)
register(MulticoreParam(workers=detectCores()))
olaps <- summarizeOverlaps(exByGn, RNAseqData.HNRNPC.bam.chr14_BAMFILES)

## ----summarizeOverlaps-explore---------------------------------------------
olaps
head(assay(olaps))
colSums(assay(olaps))                # library sizes
plot(sum(width(olaps)), rowMeans(assay(olaps)), log="xy")

## ----summarizeOverlaps-gc--------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg19)
sequences <- getSeq(BSgenome.Hsapiens.UCSC.hg19, rowRanges(olaps))
gcPerExon <- letterFrequency(unlist(sequences), "GC")
gc <- relist(as.vector(gcPerExon), sequences)
gc_percent <- sum(gc) / sum(width(olaps))
plot(gc_percent, rowMeans(assay(olaps)), log="y")

## --------------------------------------------------------------------------
sessionInfo()