## A typical chipseq workflow library(chipseq) library(lattice) library(latticeExtra) library(geneplotter) library(Matrix) library(IRanges) load("data/alignedLocs.rda") alignedLocs ## Each component represents data from one lane alignedLocs$sample1 ## Internal structure: stored as a list str(head(alignedLocs$sample1)) ## Mouse chromosome lengths library("BSgenome.Mmusculus.UCSC.mm9") mouse.chromlens <- seqlengths(Mmusculus) ## mouse.chromlens <- ## if (require(BSgenome.Mmusculus.UCSC.mm9)) seqlengths(Mmusculus) else { ## structure(c(197195432L, 181748087L, 159599783L, 155630120L, ## 152537259L, 149517037L, 152524553L, 131738871L, ## 124076172L, 129993255L, 121843856L, 121257530L, ## 120284312L, 125194864L, 103494974L, 98319150L, ## 95272651L, 90772031L, 61342430L, 166650296L, ## 15902555L, 16299L, 1231697L, 41899L, 160594L, 357350L, ## 362490L, 849593L, 449403L, 400311L, 3994L, 628739L, ## 1785075L, 58682461L, 5900358L), ## .Names = c("chr1", "chr2", "chr3", "chr4", "chr5", ## "chr6", "chr7", "chr8", "chr9", "chr10", ## "chr11", "chr12", "chr13", "chr14", "chr15", ## "chr16", "chr17", "chr18", "chr19", "chrX", ## "chrY", "chrM", "chr1_random", "chr3_random", ## "chr4_random", "chr5_random", "chr7_random", ## "chr8_random", "chr9_random", "chr13_random", ## "chr16_random", "chr17_random", ## "chrX_random", "chrY_random", ## "chrUn_random")) ## } ## Extending the short reads as an approximation to the segment ## sequenced ext <- extendReads(alignedLocs$sample1$chr5, readLen = 35, seqLen = 200) head(ext) ## Coverage: how many extended reads cover each coordinate of the genome cov <- coverage(ext, start = 1, end = mouse.chromlens["chr5"]) cov ## Islands: contiguous segments of non-zero coverage islands <- slice(cov, lower = 1) head(islands) viewSums(head(islands)) viewMaxs(head(islands)) nread.tab <- table(viewSums(islands) / 200) depth.tab <- table(viewMaxs(islands)) head(nread.tab, 10) head(depth.tab, 10) ## Do this for all lanes/chromosomes islandReadSummary <- function(x) { g <- extendReads(x) s <- slice(coverage(g, 1, max(end(g))), lower = 1) tab <- table(viewSums(s) / 200) ans <- data.frame(nread = as.numeric(names(tab)), count = as.numeric(tab)) ans } head(islandReadSummary(alignedLocs$sample1$chr5)) nread.islands <- summarizeReads(alignedLocs, summary.fun = islandReadSummary) ## distribution of reads per island xyplot(log(count) ~ nread | lane, nread.islands, subset = (chromosome == "chr5" & nread <= 40), layout = c(1, 3), pch = 16, type = c("p", "g")) xyplot(log(count) ~ nread | lane, nread.islands, subset = (chromosome == "chr5" & nread <= 40), layout = c(1, 3), pch = 16, type = c("p", "g"), panel = function(x, y, ...) { panel.lmline(x[1:2], y[1:2]) panel.xyplot(x, y, ...) }) ## Exercise: Create a similar plot for all chromosomes in a lane. Do ## you see the same pattern across chromosomes? xyplot(log(count) ~ nread | chromosome, nread.islands, subset = (lane == "sample1" & nread <= 20), aspect = 0.5, pch = 16, type = c("p", "g"), as.table = TRUE, panel = function(x, y, ...) { panel.lmline(x[1:2], y[1:2]) panel.xyplot(x, y, ...) }) if (FALSE) { xyplot(log(count) ~ nread | chromosome + lane, nread.islands, subset = (chromosome %in% c("chr1", "chr5", "chr18", "chrX", "chrY") & nread <= 20), aspect = 0.5, pch = 16, type = c("p", "g"), as.table = TRUE, panel = function(x, y, ...) { if (length(x) > 1) panel.lmline(x[1:2], y[1:2]) panel.xyplot(x, y, ...) }) xyplot(log(count) ~ nread | lane, subset(nread.islands, (chromosome %in% c("chr1", "chr5", "chr18", "chrX", "chrY") & nread <= 20)), groups = chromosome[drop = TRUE], auto.key = list(columns = 3), par.settings = simpleTheme(pch = 16), aspect = 0.5, type = c("o", "g"), as.table = TRUE) alignedLocs$sample2$chrY } ## proportions nread.table <- xtabs(count ~ nread + lane + chromosome, nread.islands) nread.props <- as.data.frame.table(prop.table(nread.table, margin = c(2, 3)), responseName="Proportion") dotplot(sqrt(Proportion) ~ chromosome | lane, data = nread.props, groups = nread, subset = (as.numeric(as.character(nread)) <= 4), layout = c(1, 3), strip = FALSE, strip.left = TRUE, type = "o", pch = 16) ## do the singleton islands tell us anything useful? ## one hypothesis is that they represent a random sample of the genome ## (perhaps indicating chromatin availability). singletons <- as.data.frame(t(nread.table[1, , 1:19])) singletons$chrom <- factor(rownames(singletons), levels = rownames(singletons)) singletons dotplot(reorder(chrom, sample1/control) ~ sample1/control + sample2/control, data = singletons, par.settings = simpleTheme(pch = 16), auto.key = TRUE) dotplot(reorder(chrom, (sample2 - sample1)/control) ~ sample1/control + sample2/control, data = singletons, par.settings = simpleTheme(pch = 16), auto.key = TRUE) with(singletons, xyplot(sample2/control ~ sample1/control, labels = as.character(chrom), pch = 16, type = c("p", "g"), panel = panel.text)) ## similar plot for depth islandDepthSummary <- function(x) { g <- extendReads(x) s <- slice(coverage(g, 1, max(end(g))), lower = 1) tab <- table(viewMaxs(s)) ans <- data.frame(depth = as.numeric(names(tab)), count = as.numeric(tab)) ans } depth.islands <- summarizeReads(alignedLocs, summary.fun = islandDepthSummary) xyplot(log(count) ~ depth | lane, depth.islands, subset = (chromosome == "chr5" & depth <= 20), layout = c(1, 3), pch = 16, type = c("p", "g"), as.table = TRUE, panel = function(x, y, ...) { lambda <- 2 * exp(y[2]) / exp(y[1]) null.est <- function(xx) { xx * log(lambda) - lambda - lgamma(xx + 1) } log.N.hat <- null.est(1) - y[1] panel.lines(1:10, -log.N.hat + null.est(1:10)) panel.xyplot(x, y, ...) }) if (FALSE) { xyplot(log(count) ~ depth | chromosome + lane, depth.islands, subset = (chromosome %in% c("chr1", "chr5", "chr18", "chrX", "chrY") & depth <= 20), aspect = 0.5, pch = 16, type = c("p", "g"), as.table = TRUE) } ## peaks: back to chr5 example. How are reads distributed in positive ## and negative strands? peaks <- slice(cov, lower = 8) peaks peak.depths <- viewMaxs(peaks) cov.pos <- coverage(extendReads(alignedLocs$sample1$chr5, strand = "+"), start = 1, end = mouse.chromlens["chr5"]) cov.neg <- coverage(extendReads(alignedLocs$sample1$chr5, strand = "-"), start = 1, end = mouse.chromlens["chr5"]) peaks.pos <- copyIRanges(peaks, cov.pos) peaks.neg <- copyIRanges(peaks, cov.neg) which(peak.depths >= 40) plotPeak(peaks.pos[6], peaks.neg[6]) plotPeak(peaks.pos[96], peaks.neg[96]) plotPeak(peaks.pos[126], peaks.neg[126]) plotPeak(peaks.pos[173], peaks.neg[173]) ## differential peaks: which peaks are different in sample 1 and ## sample 2? ## Strategy: combine lanes, find peaks in combined data, and compare ## contributions extRanges <- lapply(alignedLocs, extendReads) peakSummary <- diffPeakSummary(extRanges$sample1, extRanges$sample2, chrom.lens = mouse.chromlens, lower = 10) xyplot(asinh(sums2) ~ asinh(sums1) | chromosome, data = peakSummary, subset = (chromosome %in% c("chr1", "chr2")), panel = function(x, y, ...) { panel.xyplot(x, y, ...) panel.abline(median(y - x), 1) }, type = c("p", "g", "r"), col.line = "black", aspect = "iso") peakSummary <- within(peakSummary, { diffs <- asinh(sums2) - asinh(sums1) resids <- (diffs - median(diffs)) / mad(diffs) up <- resids > 2 down <- resids < -2 }) head(peakSummary) ## peak context data(geneMouse) gregions <- genomic_regions(genes = geneMouse, proximal = 2000) gregions$gene <- as.character(gregions$gene) ans <- contextDistribution(peakSummary, gregions) head(ans) sumtab <- as.data.frame(rbind(total = xtabs(total ~ type, ans), promoter = xtabs(promoter ~ type, ans), threeprime = xtabs(threeprime ~ type, ans), upstream = xtabs(upstream ~ type, ans), downstream = xtabs(downstream ~ type, ans), gene = xtabs(gene ~ type, ans))) cbind(sumtab, ratio = round(sumtab[, "down"] / sumtab[, "up"], 3)) sessionInfo()