%\VignetteIndexEntry{05 Range-based Containers -- Exercises}
%\VignetteEngine{knitr::knitr}

\documentclass{article}

<<style, eval=TRUE, echo=FALSE, results='asis'>>=
options(max.print=1000)
stopifnot(BiocInstaller::biocVersion() == "2.13")
BiocStyle::latex()
library(knitr)
opts_chunk$set(cache=TRUE, tidy=FALSE)

plotRanges <- function(x, xlim = x, main = deparse(substitute(x)),
                       col = "black", sep = 0.5, ...)
{
  height <- 1
  if (is(xlim, "Ranges"))
    xlim <- c(min(start(xlim)), max(end(xlim)))
  bins <- disjointBins(IRanges(start(x), end(x) + 1))
  plot.new()
  par(mai=c(0.5, 0.2, 1.2, 0.2))
  plot.window(xlim, c(0, max(bins)*(height + sep)))
  ybottom <- bins * (sep + height) - height
  rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col, ...)
  title(main, cex.main=2.8, font.main=1)
  axis(1)
}
@ 


\title{Practical: Range-based containers}
\author{Herv\'e Pag\`es (\url{hpages@fhcrc.org})}
\date{3 February 2014}

\newcommand{\Hsap}{\emph{H.~sapiens}}
\newcommand{\Dmel}{\emph{D.~melanogaster}}

\usepackage{theorem}
\newtheorem{Ext}{Exercise}
\newenvironment{Exercise}{
  \renewcommand{\labelenumi}{\alph{enumi}.}\begin{Ext}%
}{\end{Ext}}
\newenvironment{Solution}{%
  \noindent\textbf{Solution:}\renewcommand{\labelenumi}{\alph{enumi}.}%
}{\bigskip}

\setlength{\abovecaptionskip}{6pt}
\setlength{\belowcaptionskip}{6pt}

\begin{document}

\maketitle
\tableofcontents


\section{An introduction to genomic ranges}

A \emph{genomic range} is a region on a genome that is made of contiguous
nucleotides (i.e. no gaps). It is defined by a chromosome name (a.k.a.
sequence name), a start and an end coordinate, and possibly a strand.
Genomic ranges are used to locate various things on a reference genome such
as genomic features (genes, transcripts, exons, repeat regions, etc...),
aligned reads, or regions of interest (e.g. peaks detected in a ChIP-seq
analysis).

In \Bioconductor{}, the \Rclass{GRanges} class, defined and documented in
the \Rpackage{GenomicRanges} package, is used to represent a vector of
genomic ranges.

\subsection{Mandatory components of a \Rclass{GRanges} object}

A \Rclass{GRanges} object contains the following mandatory components:
\begin{itemize}
  \item A vector of sequence names, accessed with the \Rcode{seqnames()}
        accessor.
  \item A vector of start coordinates, accessed with the \Rcode{start()}
        accessor.
  \item A vector of end coordinates, accessed with the \Rcode{end()}
        accessor.
  \item A vector of strand values (\Rcode{+}, \Rcode{-}, or \Rcode{*}),
        accessed with the \Rcode{strand()} accessor.
\end{itemize}
The vectors returned by the \Rcode{seqnames()}, \Rcode{start()},
\Rcode{end()}, and \Rcode{strand()} accessors have the same length,
which is the number of genomic ranges in the object. By definition, this
is also considered to be the length of the object and it can be obtained
with \Rcode{length()} (like with an ordinary vector).

Here is an example of a \Rclass{GRanges} object of length 7 (i.e. with 7
genomic ranges in it):
<<echo=FALSE>>=
suppressPackageStartupMessages(library(GenomicRanges))
gr1 <- GRanges(Rle(c("chr1", "chr19", "chrX", "chrM", "chr19"),
                   c(2, 1, 2, 1, 1)),
               IRanges(c(10001, 5508, 25644, 685, 142501, 33077, 8001),
                       width=c(400, 285, 1620, 333, 1, 2000, 421)),
               strand=c("-", "*", "+", "+", "-", "*", "+"))
@
<<>>=
gr1
@

Here is another more realistic \Rclass{GRanges} object representing a
tiling of the full Yeast genome (with tiles of size 50000 nucleotides):
<<echo=FALSE>>=
suppressPackageStartupMessages(library(BSgenome.Scerevisiae.UCSC.sacCer2))
tiles <- tileGenome(seqinfo(Scerevisiae), tilewidth=50000,
                    cut.last.tile.in.chrom=TRUE)
@
<<>>=
tiles
@

An important convention to keep in mind about the start and end coordinates
of a genomic range is that they are both \emph{1-based} positions relative
to the 5' end of the plus strand of the reference sequence, \emph{even when
the range is on the minus strand}. Also the nucleotides located at the start
and end positions are both considered to be part of the range (the former
being the left-most nucleotide of the range and the latter its
right-most nucleotide).

\subsection{Metadata columns of a \Rclass{GRanges} object}

In addition to the mandatory components described above, one or more
\emph{metadata columns} can be stored in a \Rclass{GRanges} object in the
form of a \Rclass{DataFrame} that can be accessed with the \Rcode{mcol()}
accessor. The \Rclass{DataFrame} has one row per genomic range and its
columns are considered to be the metadata columns of the \Rclass{GRanges}
object.

Here is an example of a \Rclass{GRanges} object with 2 metadata columns:
<<echo=FALSE>>=
gr2 <- gr1
mcols(gr2) <- DataFrame(desc=c("exon",
                               "repeat region",
                               "gene",
                               "exon",
                               "exon",
                               "snp",
                               "repeat region"),
                        GC_content=c(0.55, 0.495, 0.562, 0.53, 0.552, NA, 0.51))
@
<<>>=
gr2
@

Another example of a \Rclass{GRanges} object with metadata columns is shown
below. This object represents all the known protein coding regions (CDS) on
the Human genome (positions are reported with respect to reference assembly
hg19):
<<echo=FALSE>>=
suppressPackageStartupMessages(library(TxDb.Hsapiens.UCSC.hg19.knownGene))
hg19_cds <- cds(TxDb.Hsapiens.UCSC.hg19.knownGene,
                columns="gene_id")
@
<<>>=
hg19_cds
@


\section{Working with \Rclass{GRanges} objects}

\subsection{Basic manipulation}

\begin{Exercise}
  This exercise illustrates basic manipulation of \Rclass{GRanges} objects.
  \begin{enumerate}
    \item Load the package where the \Rclass{GRanges} class is defined
          and documented.

    \item Open the man page for the \Rclass{GRanges} class and run the
          examples in it.

    \item Display the \Rcode{gr} object.

    \item Use the \Rcode{seqnames()} accessor to get the vector of sequence
          names. What is the class of this vector? Use \Rcode{table()} on
          it to count the number of genomic ranges on each reference
          sequence.

    \item In addition to the start and end coordinates that can be obtained
          with the \Rcode{start()} and \Rcode{end()} accessors, the
          \emph{width} of the ranges can be obtained with the \Rcode{width()}
          accessor. By definition, the \emph{width} of a genomic range is
          the number of nucleotides in it. For any range, we have
          $width = end - start + 1$.

          Obtain the width of the ranges in \Rcode{gr}.

          How do you check that $width$ is indeed the same as
          $end - start + 1$ programmatically?
  \end{enumerate}
\end{Exercise}

\begin{Solution}
  \begin{enumerate}
    \item The \Rclass{GRanges} class is defined and documented in the
          \Rpackage{GenomicRanges} package.
<<echo=FALSE>>=
while (search()[2L] != "package:parallel")
    detach(search()[2L], character.only=TRUE)
detach("package:parallel")
@
<<results='asis'>>=
library(GenomicRanges)
@

    \item Use the \Rcode{class?<classname>} or
          \Rcode{?\`{}<classname>-class\`{}} syntax to open the man page
          for a given class:
<<eval=FALSE>>=
class?GRanges
@
          Since there is also a constructor function for \Rclass{GRanges}
          objects (the \Rcode{GRanges()} function) documented in the same
          man page, the man page can also be accessed with \Rcode{?GRanges}.

          To run the examples in a man page, you can copy/paste them into
          your current session or call the \Rcode{example()} function
          on the same alias you used to open the man page (i.e. on the part
          following the \Rcode{?}):
<<results='hide'>>=
example(GRanges)
@

    \item The code in the examples calls the \Rcode{GRanges()} constructor
          function to create the \Rcode{gr} object:
<<>>=
gr
@

    \item The vector of sequence names can be obtained with the
          \Rcode{seqnames()} accessor:
<<>>=
seqnames(gr)
@
          This is an \Rclass{Rle} (Run Length Encoding) object:
<<>>=
class(seqnames(gr))
@
          Using an \Rclass{Rle} representation of a vector or factor is a
          simple and efficient way to reduce its size in memory (a.k.a.
          its \emph{memory footprint})  when it has long \emph{runs} of
          with the same value.

    \item The \emph{width} of the ranges in \Rcode{gr}:
<<>>=
width(gr)
@
          We can check that $width = end - start + 1$ with:
<<>>=
all(width(gr) == end(gr) - start(gr) + 1)
@
  \end{enumerate}
\end{Solution}

\begin{Exercise}
  This exercise illustrates more basic manipulation of \Rclass{GRanges}
  objects.
  \begin{enumerate}
    \item Like ordinary vectors in \R{}, \Rclass{GRanges} objects can have
          names on them (one per vector element in the object). The mechanism
          for getting or setting the names is with the \Rcode{names} getter
          and setter.

          Obtain the names of \Rcode{gr}.

          Replace the current names with upper case names (same names but
          upper case).

    \item Like ordinary vectors in \R{}, \Rclass{GRanges} objects can be
          subsetted with the \Rcode{[} operator.

          Subset \Rcode{gr} to keep only the ranges located on the \Rcode{+}
          strand.

    \item Obtain the metadata columns in \Rcode{gr}.

          Obtain the \Rcode{GC} column.

          Subset \Rcode{gr} to keep only the ranges for which the \Rcode{GC}
          value is >= 0.3 and <= 0.6.

    \item Finally, add a new metadata column to \Rcode{gr}.
  \end{enumerate}
\end{Exercise}

\begin{Solution}
  \begin{enumerate}
    \item We get the names with:
<<>>=
names(gr)
@

    \item We use the \Rcode{toupper()} function to translate the names from
          lower to upper case, and then we set the new names back on
          \Rcode{gr} with the \Rcode{names()} \emph{setter} (i.e. we call
          \Rcode{names()} on the \emph{left side} of the assignment):
<<>>=
names(gr) <- toupper(names(gr))
@

    \item To obtain all the metadata columns:
<<>>=
mcols(gr)
@
          To obtain the \Rcode{GC} column:
<<>>=
mcols(gr)$GC
@
          To keep only the ranges for which GC is >= 0.3 and <= 0.6:
<<>>=
gr[mcols(gr)$GC >= 0.3 & mcols(gr)$GC <= 0.6]
@
          Note that this subsetting can be performed more conveniently with
          \Rcode{subset()}:
<<>>=
subset(gr, GC >= 0.3 & GC <= 0.6)
@

    \item To set a metadata column on \Rcode{gr}, we use the same form as for
          getting a metadata column but this time on the \emph{left side} of
          the assignment:
<<>>=
mcols(gr)$id <- sprintf("ID%03d", seq_along(gr))
gr
@
  \end{enumerate}
\end{Solution}

\subsection{Common range transformations}

The ranges in a \Rclass{GRanges} object can be transformed in different ways.
Some of the most common range transformations are shown on
figure~\ref{fig:range-transformations} below.

<<ranges-gr0-plot,results='hide',echo=FALSE,fig.keep='none'>>=
gr0 <- IRanges(start=c(7, 9, 12, 14, 22:24),
               end=c(15, 11, 12, 18, 26, 27, 28))

png("ranges-gr0-plot.png", width=800, height=170)
plotRanges(gr0, xlim=c(5, 35), main="gr0", col="blue")
dev.off()
@
<<ranges-shift-gr0-plot,results='hide',echo=FALSE,fig.keep='none'>>=
png("ranges-shift-gr0-plot.png", width=800, height=170)
plotRanges(shift(gr0, 5), xlim=c(5, 35), main="shift(gr0, 5)", col="blue")
dev.off()
@
<<ranges-disjoin-gr0-plot,results='hide',echo=FALSE,fig.keep='none'>>=
png("ranges-disjoin-gr0-plot.png", width=800, height=170)
plotRanges(disjoin(gr0), xlim=c(5, 35), main="disjoin(gr0)", col="blue")
dev.off()
@
<<ranges-reduce-gr0-plot,results='hide',echo=FALSE,fig.keep='none'>>=
png("ranges-reduce-gr0-plot.png", width=800, height=170)
plotRanges(reduce(gr0), xlim=c(5, 35), main="reduce(gr0)", col="blue")
dev.off()
@
\begin{figure}
  \centering
  \includegraphics[width=0.8\textwidth,height=!]{ranges-gr0-plot}\\
  \includegraphics[width=0.8\textwidth,height=!]{ranges-shift-gr0-plot}\\
  \includegraphics[width=0.8\textwidth,height=!]{ranges-disjoin-gr0-plot}\\
  \includegraphics[width=0.8\textwidth,height=!]{ranges-reduce-gr0-plot}
  \caption{Range transformations \Rcode{shift()}, \Rcode{disjoin()}, and
           \Rcode{reduce()}. For clarity, \Rcode{gr0} is a \Rclass{GRanges}
           object with all its ranges on the same reference sequence and
           strand.}
  \label{fig:range-transformations}
\end{figure}

\begin{Exercise}
  In this exercise we perform some range transformations on a \Rclass{GRanges}
  object.
  \begin{enumerate}
    \item Use \Rcode{shift} to shift the ranges in \Rcode{gr} by 500 positions
          to the right. Store the result in \Rcode{gr2}.

    \item Find the man page for the \Rcode{flank} method that operates on
          \Rclass{GRanges} objects. Obtain the upstream flanking regions of
          width 100 for \Rcode{gr2}, that is, the regions corresponding to
          the 100 nucleotides located upstream of each range in \Rcode{gr2}
          when moving in the 5' to 3' direction.

    \item ``Display'' the \Rcode{shift()} function. What kind of function
          is that? Which arguments are involved in the dispatch mechanism?

    \item Use \Rcode{showMethods} to list all the \Rcode{shift} methods.
          Which method is used when we call \Rcode{shift()} on a
          \Rclass{GRanges} object? Try to open the man page for \emph{this}
          method using the \Rcode{?<something-here>} syntax.
  \end{enumerate}
\end{Exercise}

\begin{Solution}
  \begin{enumerate}
    \item
<<>>=
gr2 <- shift(gr, 500)
@

    \item
<<>>=
flank(gr2, width=100)
@

    \item To ``display'' the \Rcode{shift()} function, we just type the
          name of the function followed by <Enter>, like with any other
          object in \R{}. For an ordinary function, this displays the
          \emph{body} of the function (i.e. its implementation).
          However, for \Rcode{shift()}, we get something different:
<<>>=
shift
@
          This is because \Rcode{shift()} is not an ordinary function but
          an \emph{S4 generic function}. A generic function doesn't do the
          work: instead it calls the appropriate method for doing it.
          The selection of the appropriate method is done internally by
          the \emph{dispatch mechanism}. The dispatch mechanism looks at
          the types of the arguments that are passed to the generic function
          and selects the most specific method compatible with these
          arguments. In the case of \Rcode{shift()}, only 1 argument is
          considered for dispatch (the first argument \Rcode{x}).

    \item To list all the \Rcode{shift} methods:
<<>>=
showMethods(shift)
@
          To see the method that is used when \Rcode{x} is a \Rclass{GRanges}
          object:
<<>>=
selectMethod(shift, "GRanges")
@
          Pay attention to the last 2 lines. One complication here is that
          the method that is used when \Rcode{x} is a \Rclass{GRanges}
          object is not defined specifically for the \Rclass{GRanges} class
          but for the \Rclass{GenomicRanges} class which is a parent class
          of the \Rclass{GRanges} class. We can check that \Rcode{gr} is
          indeed a \Rclass{GenomicRanges} object, even though it's not a
          \Rclass{GenomicRanges} \emph{instance}:
<<>>=
is(gr, "GenomicRanges")
class(gr)
@
          To open the man page for \emph{this} method:
<<eval=FALSE>>=
?`shift,GenomicRanges-method`
@
  \end{enumerate}
\end{Solution}


\section{Working with \Rclass{GRangesList} objects}

So far we've seen \Rclass{GRanges} objects and how to work on them.
One limitation of the \Rclass{GRanges} container is that it can
only be used for storing genomic regions that are made of contiguous
nucleotides (i.e. no gaps). However, some types of genomic regions (or
genomic features) cannot always be represented with a single genomic range.
For example, some aligners used on RNA-seq data can produce aligned reads
(called \emph{junction reads} or \emph{gapped reads}) that need to be
represented with 2 or more genomic ranges. In that case a \Rclass{GRangesList}
object needs to be used to store the ranges.

More formally speaking, a \Rclass{GRangesList} object also contains
genomic ranges but now they are divided into groups. Each group consists
of one or more genomic ranges that represent a complex region (e.g.
all the exons of a given gene, a junction read, etc...)

To keep things easy and intuitive, the \Rclass{GRangesList}
container has been designed so that it can be manipulated like an ordinary
list for most usual list operations. For example, each individual group
can be extracted with \Rcode{[[}, and \Rcode{length()} returns the total
number of groups in the object.

\begin{Exercise}
  In this exercise, we use a \Rclass{GRangesList} object to store genomic
  features (e.g. exons) grouped by parent feature (e.g. transcript or gene).
  The input data we use for this is a \emph{gene model}.
  \begin{enumerate}
    \item Load the \Rpackage{TxDb.Hsapiens.UCSC.hg19.knownGene} package.
          This package contains a \emph{gene model} for Human i.e. the
          genomic ranges of all the known Human transcripts, exons, and
          CDS. The gene model can be accessed via the
          \Rcode{TxDb.Hsapiens.UCSC.hg19.knownGene} object which is
          basically a pointer to an SQLite database containing the
          gene model. Display this object.

    \item Use the \Rcode{exonsBy()} function on the
          \Rcode{TxDb.Hsapiens.UCSC.hg19.knownGene} object to extract
          the exons grouped by gene (check the \Rcode{exonsBy} man page
          for how to do that). What kind of object is returned?

    \item How many genes are represented in this object?

    \item Note that this \Rclass{GRangesList} object has names and that
          the names are Entrez Gene ids. Use \Rcode{\%in\%} to check that
          Entrez Gene 1008 is present. Extract the exons of Entrez Gene
          1008 in a \Rclass{GRanges} object.

          How many exons are in Entrez Gene 1008? On which chromosome
          and strand are they?

    \item The \Rcode{elementLengths()} function is a very fast way
          to compute the length of each list element of a list-like object
          \Rcode{x}. It's equivalent to (but much faster than) doing
          \Rcode{sapply(x, length)}. Use it on the \Rclass{GRangesList}
          object to compute the number of exons in each gene.

          Which gene has the most exons? How many exons does it have?
  \end{enumerate}
\end{Exercise}

\begin{Solution}
  \begin{enumerate}
    \item
<<>>=
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
TxDb.Hsapiens.UCSC.hg19.knownGene
@

    \item
<<>>=
ex_by_gn <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by="gene")
class(ex_by_gn)
@

    \item Number of genes:
<<>>=
length(ex_by_gn)
@

    \item Is Entrez Gene 1008 present?
<<>>=
"1008" %in% names(ex_by_gn)
@
          To extract the exons in a \Rclass{GRanges} object, we need to use
          \Rcode{[[}, not \Rcode{[}:
<<>>=
ex_by_gn[["1008"]]
@
          Also note that it's important to subset with the \Rcode{"1008"}
          \emph{string} here, not the \Rcode{1008} number. Why?
          Be aware that this is a common pitfall!
<<>>=
seqnames(ex_by_gn[["1008"]])
strand(ex_by_gn[["1008"]])
@
          All the exons in this gene are on chromosome 12 and minus strand.

    \item Number of exons in each gene:
<<>>=
nex_per_gn <- elementLengths(ex_by_gn)
@
          Max number of exons per gene:
<<>>=
max(nex_per_gn)
@
          Gene with the most exons:
<<>>=
which(nex_per_gn == max(nex_per_gn))
@
          Entrez Gene "1302".

          Alternatively, the "which" and "how many" questions can both be
          answered with \Rcode{which.max()}:
<<>>=
which.max(nex_per_gn)
@
  \end{enumerate}
\end{Solution}


\section{Working with aligned reads}

\subsection{Mapping the reads to the reference genome}

A high-throughput sequencing experiment (a.k.a. HTS experiment, or new
generation sequencing experiment, or NGS experiment) generates a huge
quantity of short reads (typically millions). Depending on the technology
used by the sequencing machine, the length of the reads can vary between 40
and 300 nucleotides. One of the first steps of almost any HTS data analysis
is the \emph{mapping} of the reads to the reference genome (or sometimes to
the transcriptome).
This \emph{mapping} process consists in finding the genomic location of the
read based on its similarity with the nucleotide content of the reference
genome at this location. This step is very computational intensive and is
usually performed outside \Bioconductor{} with a 3rd party aligner like
Bowtie, BWA, GSNAP, TopHat, etc...

Reads coming from an RNA-seq experiment (i.e. RNA was sequenced, not DNA)
present an additional challenge: some of them, called \emph{junction reads},
can span 2 or more exons separated by introns. In that case the aligner
needs to be able to break the read into 2 or more pieces and map each
individual piece to the reference genome. A few aligners like TopHat or
GSNAP support \emph{junction reads}. Others like Bowtie or BWA don't.

The result of the alignment process is typically stored in BAM files.
The BAM format (and its plain text version SAM) is a generic format for
storing large nucleotide sequence alignments. It is specified and developped
by the SAMtools project (\url{http://samtools.sourceforge.net/}).
The SAMtools project also provides various utilities for manipulating
alignments in the SAM or BAM format.

\subsection{Working with BAM files in \Bioconductor{}}

After the reads have been aligned and stored in BAM files (typically one
file per sample or sequencing run), many packages are available in
\Bioconductor{} for performing different kinds of downstream analyses.
See the HighThroughputSequencing view under the Software and AssayTechnologies
views on the \Bioconductor{} website for a list of these packages
(\url{http://bioconductor.org/packages/release/BiocViews.html#\_\_\_HighThroughputSequencing}).

One of these packages, the \Rpackage{Rsamtools} package, provides an
interface to the SAMtools utilities for manipulating BAM files, including
for loading aligned reads in \R{}. Other packages like the
\Rpackage{ShortRead} package, provide some high-level tools for
manipulating aligned reads. One container, the \Rclass{GAlignments}
container, has been specifically designed for storing aligned reads,
including junction reads (a.k.a. gapped alignents). It is defined and
documented in the \Rpackage{GenomicRanges} package.

The \Rpackage{parathyroidSE} package is a data package that contains aligned
reads stored in BAM files. The paths to these BAM files can be obtained with:
<<>>=
library(parathyroidSE)
bamdir <- system.file("extdata", package="parathyroidSE")
bampaths <- list.files(bamdir, pattern="bam$", full.names=TRUE)
bampaths
@

The aligned reads can be loaded from a BAM file with the
\Rcode{readGAlignmentsFromBam()} function from the \Rpackage{Rsamtools}
package:
<<>>=
library(Rsamtools)
reads <- readGAlignmentsFromBam(bampaths[1])
reads
@
The returned value is a \Rclass{GAlignments} object. It is a \emph{vector-like}
container that has one element per aligned read. An important difference
with \Rclass{GRanges} objects is that an aligned read can be associated with
more than one genomic range. We'll see later how to extract those ranges but
before we do that, we first need to talk about the \Rcode{cigar} component
of the object.

The components displayed above can be accessed with accessor functions
that are named like the component itself (e.g. \Rcode{qwidth()} for the
\Rcode{qwidth} component, \Rcode{ngap()} for the \Rcode{ngap} component,
etc...). One component of particular interest is the \Rcode{cigar}
component. It contains short strings called CIGAR strings that describe
exactly the geometry of the alignments i.e. how each read aligns to the
reference sequence. CIGAR strings are made of numbers and letters.
Each letters represents a CIGAR operation and is preceeded by a number
that specifies the length of the operation. The most important operations
are \Rcode{M}, \Rcode{I}, \Rcode{D}, and \Rcode{N}. The format of the
CIGAR strings is explained in the SAM Spec document hosted on the SAMtools
website.

\begin{Exercise}
  In this exercise we learn how to load aligned reads from a BAM file
  into a \Rclass{GAlignments} object and do some basic manipulation on it.
  \begin{enumerate}
    \item Load the \Rpackage{parathyroidSE} package and use the
          \Rcode{readGAlignmentsFromBam()} function from the
          \Rpackage{Rsamtools} package to read the third BAM
          file (SRR479054.bam). What kind of object is returned?
          Display it.

    \item A gap in the alignment is called a ``skipped region from the
          reference'' in SAMtools jargon. Refer to the SAM Spec to find
          out how gaps are represented in the CIGAR strings.

    \item Do some of the alignments have gaps? What does this tell
          us about the aligner that was used to align the reads?

    \item The \Rcode{ngap} component summarizes the number of gaps
          per alignment. What's the maximum number of gaps per alignment?
          Use \Rcode{table()} to summarize the number of gaps per
          alignment.
  \end{enumerate}
\end{Exercise}

\begin{Solution}
  \begin{enumerate}
    \item
<<>>=
library(parathyroidSE)
bamdir <- system.file("extdata", package="parathyroidSE")
bampaths <- list.files(bamdir, pattern="bam$", full.names=TRUE)
reads <- readGAlignmentsFromBam(bampaths[3])
class(reads)
reads
@

    \item A gap in the alignment is represented with the \Rcode{N} operation.

    \item
<<>>=
length(grep("N", cigar(reads)))
@
          This tells us that the aligner that was used supports \emph{junction
          reads}.

    \item Maximum number of gaps per alignment:
<<>>=
max(ngap(reads))
table(ngap(reads))
@
  \end{enumerate}
\end{Solution}

\subsection{Extracting the genomic ranges from a \Rclass{GAlignments} object}

There are 2 ways to extract the genomic ranges from a \Rclass{GAlignments}
object: one that preserves the gaps, and one that doesn't (i.e. the gaps are
ignored). This extraction is actually achieved by \emph{coercing} the
\Rclass{GAlignments} object into a \Rclass{GRangesList} or \Rclass{GRanges}
object.

\begin{Exercise}
  In this exercise we learn how to extract the genomic ranges from a
  \Rclass{GAlignments} object.
  \begin{enumerate}
    \item Coerce this \Rclass{GAlignments} object into a
          \Rclass{GRangesList} object. Note that this coercion produces
          an object that is {\it parallel} to the \Rclass{GAlignments}
          object, that is, the i-th element in the \Rclass{GRangesList}
          object corresponds to the i-th element (i.e. alignment)
          in the \Rclass{GAlignments} object.

    \item Use \Rcode{elementLengths()} and \Rcode{table()} to summarize
          the number of ranges per list element in the \Rclass{GRangesList}
          object. How does this relate to the summary of number of gaps per
          alignment obtained earlier?

    \item If we wanted to ignore the gaps in the reads, we would coerce
          the \Rclass{GAlignments} object into a \Rclass{GRanges} object.
          Do that.
  \end{enumerate}
\end{Exercise}

\begin{Solution}
  \begin{enumerate}
    \item
<<>>=
read_ranges <- as(reads, "GRangesList")
@

    \item
<<>>=
table(elementLengths(read_ranges))
@
          Number of ranges per alignment equals number of gaps plus one.

    \item
<<>>=
as(reads, "GRanges")
@
  \end{enumerate}
\end{Solution}

\subsection{Computing the \emph{coverage} of the reads}

Finally, we can use the \Rclass{GRangesList} object obtained in the previous
exercise to compute the \emph{coverage} of the reads, that is, the number
of reads that cover each genomic position in the reference genome.

\begin{Exercise}
  Computing the read coverage.
  \begin{enumerate}
    \item Use the \Rcode{coverage()} function on \Rcode{read\_ranges}
          to compute the \emph{coverage} of the reads.

          What is the class of the object returned by \Rcode{coverage()}?
          Display the object.

    \item Extract the coverage of chromosome 12 as an integer-Rle.
          What is the maximum coverage on chromosome 12?

    \item Use \Rcode{max()} on the full coverage object (\Rclass{RleList})
          to get the maximum coverage on each chromosome.
  \end{enumerate} 
\end{Exercise}

\begin{Solution}
  \begin{enumerate}
    \item
<<>>=
cvg <- coverage(read_ranges)
class(cvg)
cvg
@

    \item Coverage of chromosome 12:
<<>>=
cvg[["12"]]
max(cvg[["12"]])
@

    \item Maximum coverage on each chromosome:
<<>>=
max(cvg)
@
  \end{enumerate}
\end{Solution}

%\section{Finding overlaps between aligned reads and genes}
%
%\begin{Exercise}
%  In this exercise, we will perform a naive counting of the number of
%  aligned reads that overlap with each gene of a gene model.
%  \begin{enumerate}
%    \item \emph{COMING SOON...}
%  \end{enumerate}
%\end{Exercise}
%
%\begin{Solution}
%  \emph{COMING SOON...}
%\end{Solution}

\end{document}