## A typical chipseq workflow

library(chipseq)
library(lattice)
library(latticeExtra)
library(geneplotter)
library(Matrix)
library(IRanges)

load("data/alignedLocs.rda")

alignedLocs


## Each component represents data from one lane

alignedLocs$sample1


## Internal structure: stored as a list

str(head(alignedLocs$sample1))


## Mouse chromosome lengths


library("BSgenome.Mmusculus.UCSC.mm9")
mouse.chromlens <- seqlengths(Mmusculus)


## mouse.chromlens <- 
##     if (require(BSgenome.Mmusculus.UCSC.mm9)) seqlengths(Mmusculus) else {
##         structure(c(197195432L, 181748087L, 159599783L, 155630120L,
##                     152537259L, 149517037L, 152524553L, 131738871L,
##                     124076172L, 129993255L, 121843856L, 121257530L,
##                     120284312L, 125194864L, 103494974L, 98319150L,
##                     95272651L, 90772031L, 61342430L, 166650296L,
##                     15902555L, 16299L, 1231697L, 41899L, 160594L, 357350L,
##                     362490L, 849593L, 449403L, 400311L, 3994L, 628739L,
##                     1785075L, 58682461L, 5900358L),
##                   .Names = c("chr1", "chr2", "chr3", "chr4", "chr5",
##                              "chr6", "chr7", "chr8", "chr9", "chr10",
##                              "chr11", "chr12", "chr13", "chr14", "chr15",
##                              "chr16", "chr17", "chr18", "chr19", "chrX",
##                              "chrY", "chrM", "chr1_random", "chr3_random",
##                              "chr4_random", "chr5_random", "chr7_random",
##                              "chr8_random", "chr9_random", "chr13_random",
##                              "chr16_random", "chr17_random",
##                              "chrX_random", "chrY_random",
##                              "chrUn_random"))
##     }


## Extending the short reads as an approximation to the segment
## sequenced

ext <- extendReads(alignedLocs$sample1$chr5, readLen = 35, seqLen = 200)
head(ext)

## Coverage: how many extended reads cover each coordinate of the genome 

cov <- coverage(ext, start = 1, end = mouse.chromlens["chr5"])
cov

## Islands: contiguous segments of non-zero coverage

islands <- slice(cov, lower = 1)
head(islands)

viewSums(head(islands))
viewMaxs(head(islands))

nread.tab <- table(viewSums(islands) / 200)
depth.tab <- table(viewMaxs(islands))

head(nread.tab, 10)
head(depth.tab, 10)

## Do this for all lanes/chromosomes

islandReadSummary <- function(x)
{
    g <- extendReads(x)
    s <- slice(coverage(g, 1, max(end(g))), lower = 1)
    tab <- table(viewSums(s) / 200)
    ans <- data.frame(nread = as.numeric(names(tab)), count = as.numeric(tab))
    ans
}

head(islandReadSummary(alignedLocs$sample1$chr5))

nread.islands <- summarizeReads(alignedLocs, summary.fun = islandReadSummary)

## distribution of reads per island

xyplot(log(count) ~ nread | lane, nread.islands, 
       subset = (chromosome == "chr5" & nread <= 40), 
       layout = c(1, 3), pch = 16, type = c("p", "g"))

xyplot(log(count) ~ nread | lane, nread.islands, 
       subset = (chromosome == "chr5" & nread <= 40), 
       layout = c(1, 3), pch = 16, type = c("p", "g"),
       panel = function(x, y, ...) {
           panel.lmline(x[1:2], y[1:2])
           panel.xyplot(x, y, ...)
       })

## Exercise: Create a similar plot for all chromosomes in a lane.  Do
## you see the same pattern across chromosomes?

xyplot(log(count) ~ nread | chromosome, nread.islands, 
       subset = (lane == "sample1" & nread <= 20), 
       aspect = 0.5, pch = 16, type = c("p", "g"), as.table = TRUE,
       panel = function(x, y, ...) {
           panel.lmline(x[1:2], y[1:2])
           panel.xyplot(x, y, ...)
       })

if (FALSE)
{
    
xyplot(log(count) ~ nread | chromosome + lane, nread.islands, 
       subset = (chromosome %in% c("chr1", "chr5", "chr18", "chrX", "chrY") & nread <= 20), 
       aspect = 0.5, pch = 16, type = c("p", "g"), as.table = TRUE,
       panel = function(x, y, ...) {
           if (length(x) > 1) panel.lmline(x[1:2], y[1:2])
           panel.xyplot(x, y, ...)
       })

xyplot(log(count) ~ nread | lane,
       subset(nread.islands, (chromosome %in% c("chr1", "chr5", "chr18", "chrX", "chrY") & nread <= 20)), 
       groups = chromosome[drop = TRUE],
       auto.key = list(columns = 3), par.settings = simpleTheme(pch = 16),
       aspect = 0.5, type = c("o", "g"), as.table = TRUE)

alignedLocs$sample2$chrY

}


## proportions 

nread.table <- xtabs(count ~ nread + lane + chromosome, nread.islands)
nread.props <- as.data.frame.table(prop.table(nread.table, margin = c(2, 3)), responseName="Proportion")

dotplot(sqrt(Proportion) ~ chromosome | lane, data = nread.props,
        groups = nread, subset = (as.numeric(as.character(nread)) <= 4),
        layout = c(1, 3), strip = FALSE, strip.left = TRUE, type = "o", pch = 16) 


## do the singleton islands tell us anything useful?

## one hypothesis is that they represent a random sample of the genome
## (perhaps indicating chromatin availability).

singletons <- as.data.frame(t(nread.table[1, , 1:19]))
singletons$chrom <- factor(rownames(singletons), levels = rownames(singletons))
singletons

dotplot(reorder(chrom, sample1/control) ~ sample1/control + sample2/control,
        data = singletons,
        par.settings = simpleTheme(pch = 16), auto.key = TRUE)

dotplot(reorder(chrom, (sample2 - sample1)/control) ~ sample1/control + sample2/control,
        data = singletons,
        par.settings = simpleTheme(pch = 16), auto.key = TRUE)

with(singletons,
     xyplot(sample2/control ~ sample1/control,
            labels = as.character(chrom),
            pch = 16, type = c("p", "g"),
            panel = panel.text))


## similar plot for depth

islandDepthSummary <- function(x)
{
    g <- extendReads(x)
    s <- slice(coverage(g, 1, max(end(g))), lower = 1)
    tab <- table(viewMaxs(s))
    ans <- data.frame(depth = as.numeric(names(tab)), count = as.numeric(tab))
    ans
}

depth.islands <- summarizeReads(alignedLocs, summary.fun = islandDepthSummary)

xyplot(log(count) ~ depth | lane, depth.islands, 
       subset = (chromosome == "chr5" & depth <= 20), 
       layout = c(1, 3), pch = 16, type = c("p", "g"), as.table = TRUE,
       panel = function(x, y, ...) {
           lambda <- 2 * exp(y[2]) / exp(y[1])
           null.est <- function(xx) {
               xx * log(lambda) - lambda - lgamma(xx + 1)
           }
           log.N.hat <- null.est(1) - y[1]
           panel.lines(1:10, -log.N.hat + null.est(1:10))
           panel.xyplot(x, y, ...)
       })


if (FALSE)
{

xyplot(log(count) ~ depth | chromosome + lane, depth.islands, 
       subset = (chromosome %in% c("chr1", "chr5", "chr18", "chrX", "chrY") & depth <= 20), 
       aspect = 0.5, pch = 16, type = c("p", "g"), as.table = TRUE)

}


## peaks: back to chr5 example.  How are reads distributed in positive
## and negative strands?

peaks <- slice(cov, lower = 8)
peaks

peak.depths <- viewMaxs(peaks)

cov.pos <- coverage(extendReads(alignedLocs$sample1$chr5, strand = "+"), start = 1, end = mouse.chromlens["chr5"])
cov.neg <- coverage(extendReads(alignedLocs$sample1$chr5, strand = "-"), start = 1, end = mouse.chromlens["chr5"])

peaks.pos <- copyIRanges(peaks, cov.pos)
peaks.neg <- copyIRanges(peaks, cov.neg)

which(peak.depths >= 40)


plotPeak(peaks.pos[6], peaks.neg[6])
plotPeak(peaks.pos[96], peaks.neg[96])
plotPeak(peaks.pos[126], peaks.neg[126])
plotPeak(peaks.pos[173], peaks.neg[173])


## differential peaks: which peaks are different in sample 1 and
## sample 2?

## Strategy: combine lanes, find peaks in combined data, and compare
## contributions

extRanges <- lapply(alignedLocs, extendReads)


peakSummary <-
    diffPeakSummary(extRanges$sample1, extRanges$sample2,
                    chrom.lens = mouse.chromlens, lower = 10)

xyplot(asinh(sums2) ~ asinh(sums1) | chromosome,
       data = peakSummary, 
       subset = (chromosome %in% c("chr1", "chr2")),
       panel = function(x, y, ...) {
           panel.xyplot(x, y, ...)
           panel.abline(median(y - x), 1)
       },
       type = c("p", "g", "r"), col.line = "black", aspect = "iso")

peakSummary <- 
    within(peakSummary,
       {
           diffs <- asinh(sums2) - asinh(sums1)
           resids <-  (diffs - median(diffs)) / mad(diffs)
           up <- resids > 2
           down <- resids < -2
       })
head(peakSummary)

## peak context

data(geneMouse)

gregions <- genomic_regions(genes = geneMouse, proximal = 2000)
gregions$gene <- as.character(gregions$gene)

ans <- contextDistribution(peakSummary, gregions)
head(ans)


sumtab <- 
    as.data.frame(rbind(total = xtabs(total ~ type, ans),
                        promoter = xtabs(promoter ~ type, ans),
                        threeprime = xtabs(threeprime ~ type, ans),
                        upstream = xtabs(upstream ~ type, ans),
                        downstream = xtabs(downstream ~ type, ans),
                        gene = xtabs(gene ~ type, ans)))

cbind(sumtab, ratio = round(sumtab[, "down"] /  sumtab[, "up"], 3))

sessionInfo()