##############################################################################
### Exercise 1
### ==========

### Before you can load a Bioconductor package, you need to install it with:
###   source("http://bioconductor.org/biocLite.R")
###   biocLite("BSgenome")
### The biocLite() function will take care of installing any other BioC or CRAN
### package required by the package that you are requesting.

library(BSgenome.Mmusculus.UCSC.mm9)

Mmusculus

Mmusculus$chr1

load("path/to/topReads.rda")

### 'topReads' is a list with 2 components named "experiment1" and "experiment2"
class(topReads)
names(topReads)

### Access a given element with $:
topReads$experiment2  # this will print a lot of output (not very useful)

### 'topReads$experiment2' is itself a list with 8 elements:
class(topReads$experiment2)
names(topReads$experiment2)

### 'topReads$experiment2$lane1' is an integer vector:
class(topReads$experiment2$lane1)

### Print its first 6 elements:
topReads$experiment2$lane1[1:6]  # the names of the elements are the Solexa
                                 # reads, the integer values are the number of
                                 # times each read is represented in the set
                                 # of reads for this lane

### Extract the names if you want to display the reads only:
names(topReads$experiment2$lane1)[1:6]


##############################################################################
### Exercise 2
### ==========

### Convert the 1st read into a DNAString object:
r1 <- DNAString(names(topReads$experiment2$lane1)[1])

length(r1)  # nb of letters in 'r1'

alphabetFrequency(r1)  # the names are the letters of the DNA alphabet

reverseComplement(r1)

subseq(r1, start=5, end=15)
subseq(r1, end=15)  # extract the first 15 letters
subseq(r1, start=-5)  # extract the last 5 letters


##############################################################################
### Exercise 3
### ==========

dict0 <- DNAStringSet(names(topReads$experiment2$lane1))

length(dict0)

width(dict0)

dict0[-2]

rev(dict0)

dict0[[1]]

DNAStringSet(dict0, end=-3)

DNAStringSet(dict0, start=-10)

alphabetFrequency(dict0)  # returns a matrix with 1000 rows

reverseComplement(dict0)

### Use 'collapse=TRUE' to collapse all the rows
alphabetFrequency(dict0, collapse=TRUE)

alphabetFrequency(reverseComplement(dict0), collapse=TRUE)

### Use [ , ] to subset the matrix returned by alphabetFrequency()
dict0 <- dict0[alphabetFrequency(dict0)[ ,"N"] == 0]


##############################################################################
### Exercise 4
### ==========

v3 <- Views(dict0[[1]], start=c(2, 12, 20), end=c(5, 26, 27))

subject(v3)
start(v3)
end(v3)
gaps(v3)

alphabetFrequency(v3)

DNAStringSet(v3)


##############################################################################
### Exercise 5
### ==========

library(BSgenome.Mmusculus.UCSC.mm9)
Mmusculus

seqlengths(Mmusculus)

Mmusculus$chrM
unmasked(Mmusculus$chrM)

### To see whether the chromosomes contain IUPAC extended letters, we apply
### alphabetFrequency() to each unmasked chromosome:
af <- sapply(seqnames(Mmusculus),
             function(name) alphabetFrequency(unmasked(Mmusculus[[name]])))
af  # shows that the chromosomes contain the N letter (used by NCBI to fill
    # the assembly gaps)

### Bisulfite transformation of the plus strand:
plus_strand <- chartr("C", "T", unmasked(Mmusculus$chr1))
alphabetFrequency(plus_strand)  # no more C's

### Bisulfite transformation of the minus strand:
minus_strand <- chartr("G", "A", unmasked(Mmusculus$chr1))
alphabetFrequency(minus_strand)  # no more G's


##############################################################################
### Exercise 6
### ==========

pattern <- DNAString("ACCGGTTATC")
matchPattern(pattern, Mmusculus$chr1)

### Reverse complement 'pattern' instead of 'Mmusculus$chr1': it's more
### memory efficient and it keeps coordinates relative to the plus strand
### which is what everybody seems to do (NCBI, UCSC, etc...)
matchPattern(reverseComplement(pattern), Mmusculus$chr1)

matchPattern(pattern, Mmusculus$chr1, max.mismatch=2, with.indels=TRUE)
  # 54204 hits in the plus strand!


##############################################################################
### Exercise 7
### ==========

Mmusculus$upstream5000
m <- vmatchPattern(pattern, Mmusculus$upstream5000)

### To get the indices of the references sequences with hits:
which(countIndex(m) != 0)

### To get the hits in reference sequence 2956:
m[[2956]]


##############################################################################
### Exercise 8
### ==========

matchPattern("GAACTTTGCCACTC", Mmusculus$chr1)  # no hit

### By default, 'fixed' is TRUE so the N in the pattern can only match an N
### in the subject:
matchPattern("GAACTNTGCCACTC", Mmusculus$chr1)  # no hit (not surprisingly)

matchPattern("GAACTNTGCCACTC", Mmusculus$chr1, fixed=FALSE)  # 1 hit


##############################################################################
### Exercise 9
### ==========

maskedratio(masks(Mmusculus$chr1)["AGAPS"])  # 2.9%

### 'Mmusculus$chr1' is an immutable object so before we can turn its masks
### on or off, we need to copy it to another variable (the chromosome sequence
### is not copied during this operation):
chr1 <- Mmusculus$chr1
active(masks(chr1)) <- FALSE
active(masks(chr1))["AGAPS"] <- TRUE

chr1  # check that only the AGAPS mask in active

alphabetFrequency(chr1)  # 47 Ns (those are intra-contig Ns)
active(masks(chr1))["AMB"] <- TRUE

alphabetFrequency(chr1)  # no more Ns

alphabetFrequency(unmasked(chr1))  # most Ns belong to the assembly gaps
                                   # (except 47 of them)

### To display the contig as views:
chr1 <- Mmusculus$chr1
active(masks(chr1)) <- FALSE
active(masks(chr1))["AGAPS"] <- TRUE
as(chr1 , "XStringViews")  # 24 contigs

### Activate all masks
active(masks(chr1)) <- TRUE
chr1  # 45.9% of the sequence is masked

matchPattern("ACACACACACACACACACAC", chr1)  # only 4 hits
matchPattern("ACACACACACACACACACAC", unmasked(chr1))  # 91688 hits


##############################################################################
### Exercise 10
### ===========

pdict0 <- PDict(dict0)

m <- matchPDict(pdict0, Mmusculus$chr1)

countIndex(m)

which(countIndex(m) == max(countIndex(m)))  # read 46 has the highest number
                                            # of hits
pdict0[[46]]  # AC repeated 18 times (a low-complexity read)

Views(unmasked(Mmusculus$chr1), start=start(m[[46]]), end=end(m[[46]]))
                                            # 25737 hits

matchPattern(pdict0[[46]], Mmusculus$chr1)  # the same 25737 hits

### Hits in the minus strand:
pdict1 <- PDict(reverseComplement(dict0))
m1 <- matchPDict(pdict1, Mmusculus$chr1)
countIndex(m1)
which(countIndex(m1) == max(countIndex(m1)))  # read 433 has the highest number
                                              # of hits in the minus strand
reverseComplement(pdict1[[433]])  # GT repeated 18 times

### The previous analysis was for exact hits. To find inexact hits
### with at most 2 mismatches per read in the last 20 nucleotides, we
### need to specify a Trusted Band during preprocessing:

pdict2 <- PDict(dict0, tb.end=16)

### and to call matchPDict() with 'max.mismatch=2':
m2 <- matchPDict(pdict2, Mmusculus$chr1, max.mismatch=2)

### Of course we find the same hits or more for each read:
all(countIndex(m2) >= countIndex(m))  # TRUE

which(countIndex(m2) == max(countIndex(m2)))

pdict0[[90]]  # TG repeated 18 times

# etc...