############################################################################## ### Exercise 1 ### ========== ### Before you can load a Bioconductor package, you need to install it with: ### source("http://bioconductor.org/biocLite.R") ### biocLite("BSgenome") ### The biocLite() function will take care of installing any other BioC or CRAN ### package required by the package that you are requesting. library(BSgenome.Mmusculus.UCSC.mm9) Mmusculus Mmusculus$chr1 load("path/to/topReads.rda") ### 'topReads' is a list with 2 components named "experiment1" and "experiment2" class(topReads) names(topReads) ### Access a given element with $: topReads$experiment2 # this will print a lot of output (not very useful) ### 'topReads$experiment2' is itself a list with 8 elements: class(topReads$experiment2) names(topReads$experiment2) ### 'topReads$experiment2$lane1' is an integer vector: class(topReads$experiment2$lane1) ### Print its first 6 elements: topReads$experiment2$lane1[1:6] # the names of the elements are the Solexa # reads, the integer values are the number of # times each read is represented in the set # of reads for this lane ### Extract the names if you want to display the reads only: names(topReads$experiment2$lane1)[1:6] ############################################################################## ### Exercise 2 ### ========== ### Convert the 1st read into a DNAString object: r1 <- DNAString(names(topReads$experiment2$lane1)[1]) length(r1) # nb of letters in 'r1' alphabetFrequency(r1) # the names are the letters of the DNA alphabet reverseComplement(r1) subseq(r1, start=5, end=15) subseq(r1, end=15) # extract the first 15 letters subseq(r1, start=-5) # extract the last 5 letters ############################################################################## ### Exercise 3 ### ========== dict0 <- DNAStringSet(names(topReads$experiment2$lane1)) length(dict0) width(dict0) dict0[-2] rev(dict0) dict0[[1]] DNAStringSet(dict0, end=-3) DNAStringSet(dict0, start=-10) alphabetFrequency(dict0) # returns a matrix with 1000 rows reverseComplement(dict0) ### Use 'collapse=TRUE' to collapse all the rows alphabetFrequency(dict0, collapse=TRUE) alphabetFrequency(reverseComplement(dict0), collapse=TRUE) ### Use [ , ] to subset the matrix returned by alphabetFrequency() dict0 <- dict0[alphabetFrequency(dict0)[ ,"N"] == 0] ############################################################################## ### Exercise 4 ### ========== v3 <- Views(dict0[[1]], start=c(2, 12, 20), end=c(5, 26, 27)) subject(v3) start(v3) end(v3) gaps(v3) alphabetFrequency(v3) DNAStringSet(v3) ############################################################################## ### Exercise 5 ### ========== library(BSgenome.Mmusculus.UCSC.mm9) Mmusculus seqlengths(Mmusculus) Mmusculus$chrM unmasked(Mmusculus$chrM) ### To see whether the chromosomes contain IUPAC extended letters, we apply ### alphabetFrequency() to each unmasked chromosome: af <- sapply(seqnames(Mmusculus), function(name) alphabetFrequency(unmasked(Mmusculus[[name]]))) af # shows that the chromosomes contain the N letter (used by NCBI to fill # the assembly gaps) ### Bisulfite transformation of the plus strand: plus_strand <- chartr("C", "T", unmasked(Mmusculus$chr1)) alphabetFrequency(plus_strand) # no more C's ### Bisulfite transformation of the minus strand: minus_strand <- chartr("G", "A", unmasked(Mmusculus$chr1)) alphabetFrequency(minus_strand) # no more G's ############################################################################## ### Exercise 6 ### ========== pattern <- DNAString("ACCGGTTATC") matchPattern(pattern, Mmusculus$chr1) ### Reverse complement 'pattern' instead of 'Mmusculus$chr1': it's more ### memory efficient and it keeps coordinates relative to the plus strand ### which is what everybody seems to do (NCBI, UCSC, etc...) matchPattern(reverseComplement(pattern), Mmusculus$chr1) matchPattern(pattern, Mmusculus$chr1, max.mismatch=2, with.indels=TRUE) # 54204 hits in the plus strand! ############################################################################## ### Exercise 7 ### ========== Mmusculus$upstream5000 m <- vmatchPattern(pattern, Mmusculus$upstream5000) ### To get the indices of the references sequences with hits: which(countIndex(m) != 0) ### To get the hits in reference sequence 2956: m[[2956]] ############################################################################## ### Exercise 8 ### ========== matchPattern("GAACTTTGCCACTC", Mmusculus$chr1) # no hit ### By default, 'fixed' is TRUE so the N in the pattern can only match an N ### in the subject: matchPattern("GAACTNTGCCACTC", Mmusculus$chr1) # no hit (not surprisingly) matchPattern("GAACTNTGCCACTC", Mmusculus$chr1, fixed=FALSE) # 1 hit ############################################################################## ### Exercise 9 ### ========== maskedratio(masks(Mmusculus$chr1)["AGAPS"]) # 2.9% ### 'Mmusculus$chr1' is an immutable object so before we can turn its masks ### on or off, we need to copy it to another variable (the chromosome sequence ### is not copied during this operation): chr1 <- Mmusculus$chr1 active(masks(chr1)) <- FALSE active(masks(chr1))["AGAPS"] <- TRUE chr1 # check that only the AGAPS mask in active alphabetFrequency(chr1) # 47 Ns (those are intra-contig Ns) active(masks(chr1))["AMB"] <- TRUE alphabetFrequency(chr1) # no more Ns alphabetFrequency(unmasked(chr1)) # most Ns belong to the assembly gaps # (except 47 of them) ### To display the contig as views: chr1 <- Mmusculus$chr1 active(masks(chr1)) <- FALSE active(masks(chr1))["AGAPS"] <- TRUE as(chr1 , "XStringViews") # 24 contigs ### Activate all masks active(masks(chr1)) <- TRUE chr1 # 45.9% of the sequence is masked matchPattern("ACACACACACACACACACAC", chr1) # only 4 hits matchPattern("ACACACACACACACACACAC", unmasked(chr1)) # 91688 hits ############################################################################## ### Exercise 10 ### =========== pdict0 <- PDict(dict0) m <- matchPDict(pdict0, Mmusculus$chr1) countIndex(m) which(countIndex(m) == max(countIndex(m))) # read 46 has the highest number # of hits pdict0[[46]] # AC repeated 18 times (a low-complexity read) Views(unmasked(Mmusculus$chr1), start=start(m[[46]]), end=end(m[[46]])) # 25737 hits matchPattern(pdict0[[46]], Mmusculus$chr1) # the same 25737 hits ### Hits in the minus strand: pdict1 <- PDict(reverseComplement(dict0)) m1 <- matchPDict(pdict1, Mmusculus$chr1) countIndex(m1) which(countIndex(m1) == max(countIndex(m1))) # read 433 has the highest number # of hits in the minus strand reverseComplement(pdict1[[433]]) # GT repeated 18 times ### The previous analysis was for exact hits. To find inexact hits ### with at most 2 mismatches per read in the last 20 nucleotides, we ### need to specify a Trusted Band during preprocessing: pdict2 <- PDict(dict0, tb.end=16) ### and to call matchPDict() with 'max.mismatch=2': m2 <- matchPDict(pdict2, Mmusculus$chr1, max.mismatch=2) ### Of course we find the same hits or more for each read: all(countIndex(m2) >= countIndex(m)) # TRUE which(countIndex(m2) == max(countIndex(m2))) pdict0[[90]] # TG repeated 18 times # etc...