###################################################
### chunk number 1: setup
###################################################
options(width=72)
olocale <- Sys.setlocale(locale="C")


###################################################
### chunk number 2: PackageLoads
###################################################
library("ShortRead")
library("BSgenome.Mmusculus.UCSC.mm9")


###################################################
### chunk number 3: PackageLoads
###################################################
packageDescription("Biostrings")$Version
packageDescription("IRanges")$Version


###################################################
### chunk number 4: setwd eval=FALSE
###################################################
## setwd(file.path("path", "to", "data"))


###################################################
### chunk number 5: setwd
###################################################
# Change this path to the appropriate location
setwd("~/Documents/workspace/bioC/Courses/Seattle-Nov-2008/labs")


###################################################
### chunk number 6: FindTopReads eval=FALSE
###################################################
## # Code that was used to create the top reads
## # This can be modified for use with your own experiments
## sp <-
##   list("experiment1" = SolexaPath(file.path("path", "to", "experiment1")),
##        "experiment2" = SolexaPath(file.path("path", "to", "experiment2")))
## patSeq <- paste("s_", 1:8, "_.*_seq.txt", sep = "")
## names(patSeq) <- paste("lane", 1:8, sep = "")
## topReads <-
##   lapply(seq_len(length(sp)),
##          function(i) {
##            print(experimentPath(sp[[i]]))
##            lapply(seq_len(length(patSeq)),
##                   function(j, n = 1000) {
##                       cat("Reading", patSeq[[j]], "...")
##                       x <-
##                         tables(readXStringColumns(baseCallPath(sp[[i]]),
##                                                   pattern = patSeq[[j]],
##                                                   colClasses =
##                                                   c(rep(list(NULL), 4),
##                                                   list("DNAString")))[[1]],
##                                n = n)[["top"]]
##                       names(x) <- chartr("-", "N", names(x))
##                       cat("done.\n")
##                       x
##                   })
##          })
## names(topReads) <- names(sp)
## for (i in seq_len(length(sp))) {
##     names(topReads[[i]]) <- names(patSeq)
## }


###################################################
### chunk number 7: LoadTopReads
###################################################
load(file.path("data", "topReads.rda"))


###################################################
### chunk number 8: TopReadsMetadata
###################################################
class(topReads)
names(topReads)


###################################################
### chunk number 9: ElementsTopReadsMetadata
###################################################
sapply(topReads, class)
sapply(topReads, names)


###################################################
### chunk number 10: ClassElementsTopReadsElements
###################################################
sapply(topReads, sapply, class)


###################################################
### chunk number 11: TopElement
###################################################
lapply(topReads, sapply, "[", 1)


###################################################
### chunk number 12: FindPolyNs
###################################################
lane1.1TopReads <- topReads[["experiment1"]][["lane1"]]
alphabetCounts <- alphabetFrequency(DNAStringSet(names(lane1.1TopReads)), baseOnly = TRUE)
lane1.1MaxLetter <-
  pmax(alphabetCounts[,"A"], alphabetCounts[,"C"], alphabetCounts[,"G"], alphabetCounts[,"T"])
DNAStringSet(names(lane1.1TopReads[lane1.1MaxLetter >= 34]))


###################################################
### chunk number 13: CreateAdapter
###################################################
adapter <- DNAString("GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG")


###################################################
### chunk number 14: MatchAdapterToGenome
###################################################
bsParams <- new("BSParams", X = Mmusculus, FUN = countPattern, simplify = TRUE)
bsapply(bsParams, pattern = adapter)


###################################################
### chunk number 15: AdapterReads
###################################################
uniqueReads <-
  DNAStringSet(sort(unique(unname(unlist(lapply(topReads, lapply, names))))))
whichAdapters <-
  which(vcountPattern(adapter, uniqueReads, max.mismatch = 4, with.indels = TRUE) > 0)
adapterReads <- uniqueReads[whichAdapters]
adapterReads


###################################################
### chunk number 16: TopAdapterReads
###################################################
topAdapterReads <-
  lapply(topReads, lapply,
         function(x) x[intersect(names(x), as.character(adapterReads))])
sapply(topAdapterReads, sapply, length)
sapply(topAdapterReads, sapply, sum)


###################################################
### chunk number 17: TopAdapterReads1-1
###################################################
lane1.1AdapterCounts <- topAdapterReads[["experiment1"]][["lane1"]]
lane1.1AdapterReads <- DNAStringSet(names(lane1.1AdapterCounts))
lane1.1AdapterReads


###################################################
### chunk number 18: TopAdapterAligns1-1
###################################################
lane1.1AdapterAligns <-
  pairwiseAlignment(lane1.1AdapterReads, adapter, type = "patternOverlap")
summary(lane1.1AdapterAligns, weight = lane1.1AdapterCounts)


###################################################
### chunk number 19: TopReads2-1
###################################################
lane2.1TopCounts <- topReads[["experiment2"]][["lane1"]]
lane2.1TopReads <- DNAStringSet(names(lane2.1TopCounts))
lane2.1TopReads


###################################################
### chunk number 20: TopReadsClust2-1
###################################################
lane2.1Clust <- hclust(stringDist(lane2.1TopReads), method = "single")
plot(lane2.1Clust)
lane2.1Groups <- cutree(lane2.1Clust, h = 2)
head(sort(table(lane2.1Groups), decreasing = TRUE))


###################################################
### chunk number 21: UnknownSeqs1
###################################################
reverseComplement(lane2.1TopReads[lane2.1Groups == 9])
lane2.1TopReads[lane2.1Groups == 8]
unknownSeqs <-
  DNAStringSet(
    intersect(as.character(reverseComplement(lane2.1TopReads[lane2.1Groups == 9])),
              as.character(lane2.1TopReads[lane2.1Groups == 8])))
unknownSeqs


###################################################
### chunk number 22: UnknownSeqs2
###################################################
unknownCounts <-
  lane2.1TopCounts[as.character(unknownSeqs)] + 
    lane2.1TopCounts[as.character(reverseComplement(unknownSeqs))]
unknownSeqs <- unknownSeqs[order(unknownCounts, decreasing = TRUE)]
unknownCounts <- unknownCounts[order(unknownCounts, decreasing = TRUE)]
head(unknownCounts)


###################################################
### chunk number 23: StarterSeq
###################################################
submat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -Inf)
whichStarter <-
  which.max(apply(as.matrix(stringDist(unknownSeqs,
                                       method = "substitutionMatrix",
                                       substitutionMatrix = submat,
                                       gapExtension = -Inf,
                                       type = "overlap")),
                  1, function(x) sum(x >= 24)))
starterSeq <- unknownSeqs[[whichStarter]]
starterSeq


###################################################
### chunk number 24: StarterSeqAlign
###################################################
starterAlign <-
  pairwiseAlignment(unknownSeqs, starterSeq, substitutionMatrix = submat,
                    gapExtension = -Inf, type = "overlap")


###################################################
### chunk number 25: UnanimousChars
###################################################
unanimousChars <- function (x) 
{
    letters <- c("A", "C", "G", "T")
    mat <- consensusMatrix(x)[letters, , drop = FALSE]
    paste(apply(mat, 2,
                function(y) {z <- which(y != 0); ifelse(length(z) == 1, letters[z], "?")}),
          collapse = "")
}


###################################################
### chunk number 26: UnknownPrefix
###################################################
whichInPrefix <- (score(starterAlign) >= 10 & start(subject(starterAlign)) == 1 & start(pattern(starterAlign)) != 1)
prefix <- narrow(unknownSeqs[whichInPrefix], 1, start(pattern(starterAlign[whichInPrefix])) - 1)
prefix <-
 DNAStringSet(paste(sapply(max(nchar(prefix)) - nchar(prefix), polyn, nucleotides="-"), as.character(prefix), sep = ""))
consensusMatrix(prefix, baseOnly = TRUE)
unanimousChars(prefix)


###################################################
### chunk number 27: UnknownSuffix
###################################################
whichInSuffix <- (score(starterAlign) >= 10 & end(subject(starterAlign)) == 36 & end(pattern(starterAlign)) != 36)
suffix <- narrow(unknownSeqs[whichInSuffix], end(pattern(starterAlign[whichInSuffix])) + 1, 36)
suffix <-
  DNAStringSet(paste(as.character(suffix), sapply(max(nchar(suffix)) - nchar(suffix), polyn, nucleotides="-"), sep = ""))
consensusMatrix(suffix, baseOnly = TRUE)
unanimousChars(suffix)


###################################################
### chunk number 28: ExtendedUnknown
###################################################
extendedUnknown <-
  DNAString(paste(unanimousChars(prefix), as.character(starterSeq), unanimousChars(suffix), sep = ""))
extendedUnknown


###################################################
### chunk number 29: ExtendedUnknownAlign
###################################################
unknownAlign <-
  pairwiseAlignment(unknownSeqs, extendedUnknown, substitutionMatrix = submat, gapExtension = -Inf, type = "overlap")
table(score(unknownAlign))


###################################################
### chunk number 30: SumOfUnknowns
###################################################
sapply(topReads, sapply,
       function(x) {
           whichNoNs <- (alphabetFrequency(DNAStringSet(names(x)))[,"N"] == 0)
           x <- x[whichNoNs]
           pdict <- PDict(DNAStringSet(names(x)))
           whichMapped <- (countPDict(pdict, extendedUnknown) + countPDict(pdict, reverseComplement(extendedUnknown))) > 0
           sum(x[whichMapped])
       })


###################################################
### chunk number 31: UnknownMapping
###################################################
params <- new("BSParams", X = Mmusculus, FUN = countPattern, simplify = TRUE)
unknownCountPattern <- bsapply(params, pattern = extendedUnknown)
unknownCountPattern 


###################################################
### chunk number 32: UnknownLocation
###################################################
mm9Chr2 <- Mmusculus[["chr2"]]
mm9Ch2View <- matchPattern(extendedUnknown, mm9Chr2)
mm9Ch2View


###################################################
### chunk number 33: PhageReads
###################################################
sp <- SolexaPath(file.path("extdata", "ELAND", "080828_HWI-EAS88_0003"))
phageReads <- readAligned(analysisPath(sp), "s_5_1_export.txt", "SolexaExport")


###################################################
### chunk number 34: UniquePhageReads
###################################################
phageReadTable <- tables(sread(phageReads), n = Inf)[["top"]]


###################################################
### chunk number 35: CleanPhageReads
###################################################
whichNotClean <- grep("N", names(phageReadTable))
head(phageReadTable[whichNotClean])
cleanReadTable <- phageReadTable[- whichNotClean]
head(cleanReadTable)


###################################################
### chunk number 36: PhageGenome
###################################################
data(phiX174Phage)
names(phiX174Phage)
nebPhage <- phiX174Phage[[which(names(phiX174Phage) == "NEB03")]]
nebPhage <-
  DNAString(paste(as.character(nebPhage), as.character(substr(nebPhage, 1, 34)),
                  sep = ""))
nebPhage


###################################################
### chunk number 37: MapPhageData
###################################################
posPDict <- PDict(DNAStringSet(names(cleanReadTable)), max.mismatch = 2)
negPDict <-
  PDict(reverseComplement(DNAStringSet(names(cleanReadTable))), max.mismatch = 2)
whichAlign <- rep(FALSE, length(phageReadTable))
whichAlign[- whichNotClean] <-
  (countPDict(posPDict, nebPhage, max.mismatch = 2) +
   countPDict(negPDict, nebPhage, max.mismatch = 2) > 0)


###################################################
### chunk number 38: SummarizePhageMapping
###################################################
table(whichAlign)
round(sapply(split(phageReadTable, whichAlign), sum) / sum(phageReadTable), 2)


###################################################
### chunk number 39: HooverDeconstructed
###################################################
print(
histogram(~ log10(phageReadTable[phageReadTable > 1]) | whichAlign[phageReadTable > 1],
          xlab = "log10(Read Counts)", main = "Read Counts by IS(Aligned to Phage)")
)


###################################################
### chunk number 40: sessionInfo
###################################################
sessionInfo()