% THIS IS ONLY A placeholder for the actual vignette, which takes too long
% time to build. Therefore, in this placeholder vignette
% eval=FALSE is set for all code chunks

%\VignetteIndexEntry{Supplement to A tutorial on how to analyze ChIP-chip readouts using Bioconductor}
%\VignetteDepends{Ringo, biomaRt, topGO, xtable}
%\VignetteKeywords{microarray ChIP-chip NimbleGen nimblegen}
%\VignettePackage{ccTutorial} % name of package

%%%% HEAD SECTION: START EDITING BELOW %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% To compile the .Rnw file into a .tex file and figures:
%% library("weaver");Sweave("ccTutorialSupplement.Rnw", driver=weaver())
%% then run "make supp" for producing the PDF

\documentclass[11pt, a4paper, fleqn]{article}
\usepackage{geometry}\usepackage{color}
\definecolor{darkblue}{rgb}{0.0,0.0,0.75}
\usepackage[%
baseurl={http://www.bioconductor.org},%
pdftitle={Supplement: Analyzing ChIP-chip data using Bioconductor},%
pdfauthor={Joern Toedling},%
pdfsubject={Vignette},%
pdfkeywords={Bioconductor},%
pagebackref,bookmarks,%colorlinks,linkcolor=darkblue,citecolor=darkblue,%
filecolor=darkblue,urlcolor=darkblue,pagecolor=darkblue,%
raiselinks,plainpages,pdftex]{hyperref}

\usepackage{amsmath,a4,t1enc, graphicx}

%\usepackage{natbib}
%\bibpunct{(}{)}{;}{a}{,}{,}

\usepackage{verbatim}
\usepackage{subfigure}

\usepackage[compress]{natbib}
\bibpunct{[}{]}{,}{n}{}{,} 

\parindent0mm
\parskip2ex plus0.5ex minus0.3ex

\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\textit{#1}}}
\newcommand{\Rfunarg}[1]{{\texttt{#1}}}
\newcommand{\phead}[1]{{\flushleft \sf \small \textbf{#1} \quad}}
\newcommand{\todo}{{\textbf{TO DO:} \quad}}

%% prepend a 'S' to the numbers of tables and figures, 
%%  since this is a supplement
\renewcommand{\thetable}{S\arabic{table}}
\renewcommand{\thefigure}{S\arabic{figure}}

\newcommand{\myincfig}[3]{%
  \begin{figure}[tbp]
    \begin{center}
      \includegraphics[width=#2]{#1}
      \caption{\label{#1}\textit{#3}}
    \end{center}
  \end{figure}
}

% only allow puting floats that occupy 80% or more of a page on
%  single pages (default is 0.5)
\renewcommand{\floatpagefraction}{0.8}
\renewcommand{\dblfloatpagefraction}{0.8}
\addtolength{\textfloatsep}{-3pt}

\addtolength{\textwidth}{2cm}
\addtolength{\oddsidemargin}{-1cm}
\addtolength{\evensidemargin}{-1cm}
\addtolength{\textheight}{2cm}
\addtolength{\topmargin}{-1cm}
\addtolength{\skip\footins}{1cm}


%%%%%%% START EDITING HERE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}

\SweaveOpts{eval=FALSE, include=FALSE, keep.source=TRUE, eps=FALSE} %

\title{Supplement: Analyzing ChIP-chip data using Bioconductor}
\author{Joern Toedling, Wolfgang Huber}
%\date{}
\maketitle

\tableofcontents

\section{Introduction}

This document supplements the manuscript ``Analyzing ChIP-chip data
using Bioconductor''~\citep{Toedling2008}. 
The manuscript demonstrates how to use the tools R and Bioconductor
for a ChIP-chip data analysis.

The R software can be obtained by following the installation
instructions at \url{http://www.r-project.org}.

For obtaining the Bioconductor packages that are needed
for redoing this analysis, we recommend
\Rfunction{install} function from the
BiocManager package.
Follow these steps from within R to install the required package
(the data package \Rpackage{ccTutorial} is $> 300$~MB in size).
%
<<installPackages, eval=FALSE>>=
if (!requireNamespace("BiocManager", quietly=TRUE))
    install.packages("BiocManager")
BiocManager::install(c("Ringo", "biomaRt", "topGO", "ccTutorial"))
@ 
%
Further information about the installation of Bioconductor packages
can be found at
\url{http://www.bioconductor.org/docs/install}.
%
<<prepare, echo=FALSE>>=
options(length=60, "stringsAsFactors"=FALSE)
set.seed(123)
options(SweaveHooks=list(
   along=function() par(mar=c(2.5,4.2,4,1.5), font.lab=2),
   boxplot=function() par(mar=c(5,5,1,1), font.lab=4),
   dens=function() par(mar=c(4.1, 4.1, 0.1, 0.1), font.lab=2)))
@

<<loadpackage, results=hide>>=
library("Ringo")
library("biomaRt")
library("topGO")
library("xtable")
library("ccTutorial")
library("CMARRT")
@

This document has been written in the 
\texttt{Sweave}~\cite{Leisch2002} format, which combines
explanatory text and the R source code that has been
used in this analysis. One advantage of this format is that
the analysis can easily be reproduced by the reader.
The R~package \Rpackage{ccTutorial} contains all the data and scripts
used in this manuscript.


\section{Importing the data into R}

The provided data are measurements of enrichment for
H3K4me3 in heart and brain cells.
For each microarray, the scanning output consists of two files, one holding
the Cy3 intensities (the untreated \textit{input} sample), the other
one the Cy5  intensities from the immunoprecipitated sample.
These files are tab-delimited text files
in NimbleGen's \emph{pair} format.

The microarray platform was a set of 4 arrays containing about 
390k reporters each and meant to tile selected promoter regions in the
\emph{Mus musculus} genome (assembly \emph{mm5})
with one base every 100~bp.
Thus for every sample, we have 8 files
\mbox{(4 arrays $\times$ 2 dyes)}.

<<locateData>>=
pairDir <- system.file("PairData",package="ccTutorial") 
list.files(pairDir, pattern="pair$")
@

In addition, there is one text file for each array type that holds
details on the samples, including which two \emph{pair} files belong
to which sample.
%
<<exampleFilesTxt>>=
read.delim(file.path(pairDir,"files_array1.txt"), header=TRUE)
@ 
%
The columns \texttt{FileNameCy3} and \texttt{FileNameCy5} hold which of the
raw data files belong to which sample. 
The immunoprecipitated extract was tagged with the Cy5 dye in the
experiment; so the column \texttt{Cy5} holds which antibody has been
used for the immunoprecipitation, in this case one against the histone
modification \texttt{H3K4me3}.

The file \texttt{spot\-types.text}
describes the reporter categories on the array (such
Spot Types files are also used in the Bioconductor
package \Rpackage{limma}~\cite{limma05}).

From these files, we can read in the raw reporter intensities
and obtain four objects of class \Rclass{RGList}, a class defined
in package \Rpackage{limma}. Each object contains the readouts
from all samples measured on the same array type.
%
<<readNimblegen, cache=TRUE, results=hide>>=
RGs <- lapply(sprintf("files_array%d.txt",1:4),
  readNimblegen, "spottypes.txt", path=pairDir)
@ 
%
An \Rclass{RGList} object is a list and contains the raw
intensities of the two hybridizations for the red and green channel
plus information about the reporters on the array and the analyzed
samples.

<<showRG>>=
head(RGs[[1]]$R)
head(RGs[[1]]$G)
tail(RGs[[1]]$genes)
@ 
%
Among the read-in values are those coming from 
reporters\footnote{the misleading slot name ``genes''
is due to historical reasons, dating back to the time when cDNA
microarrays were mostly used to measure gene expression. In our case,
each reporter is not associated to one gene but to one or more genomic
locations.}
matching the genome sequence as well as some from the manufacturer's
``control'' reporters on the array. 
%
<<showProbeStatus>>=
table(RGs[[1]]$genes$Status)
@ 
%
The \Rclass{RGList} is a common class for raw two-color data.
Thus, the following steps can easily be applied to other,
non-NimbleGen  microarrays, which for example can be read in into R
using \Rpackage{limma}'s function \Rfunction{read.maimages}.


\section{Quality assessment}

First, we look at the spatial distribution of the intensities on the
array.
This is useful for detecting obvious artifacts on the array, such as
scratches, bright spots, finger prints etc., which may render parts or
all of the readouts of one hybridization useless.

For demonstration, we first show three array surface plots with
artifacts. These three microarrays were generated for another
ChIP-chip study for histone modifications~\cite{Fischer2008}.
See the spatial distribution plots of these arrays in
Figure~\ref{arraysWithArtifacts.jpg}.
The coordinates in the picture correspond to coordinates on the
surface of the microarray.
The  color of the dots represents the value of the raw reporter
intensity, with brighter shades of green corresponding to higher
intensities. For well-hybridized microarrays, a homogeneous picture
can be expected.
Two of the displayed three arrays show strong artifacts.
The array in the left panel shows two distinct problems.
The bright rim on the picture suggests that all reporters near the rim
of the array high raw intensities.
The second artifact is the wave pattern on the surface.
This effect is known as a Moir\'e pattern in image processing and
emerged during the scanning process of the microarray.
The spatial distribution of Cy5 channel intensities on this array
looks homogeneous (data not shown), which provides more evidence that
the artifacts in this plot are likely due to errors in scanning the
Cy3 intensities of this array.
The array in the middle panel shows a large artifact that only affects
the right side of the array.
The array in the right panel finally shows a bright spot in the center
of the array and slightly higher intensities in the upper half of the
array than in the lower array.
However, these artifacts are mild in comparison with the other two
arrays. The array on the right side was kept for further analysis in
the study, while the two on the left side and in the middle were
replaced by newly hybridized arrays.

\myincfig{arraysWithArtifacts.jpg}{0.9\textwidth}{Spatial
  distribution of reporter intensities on microarrays from another
  ChIP-chip study~\cite{Fischer2008}.  Coordinates in the picture
  correspond to coordinates on the surface   of the microarray. The
  color of the dots represents the value of the raw reporter
  intensity, with brighter shades of green corresponding to higher
  intensities. For well-hybridized microarrays, a homogeneous picture
  can be expected.The two arrays in the left and   the middle panel
  show strong artifacts and were excluded from later analyses. The
  array in the right panel shows weak artifacts and was kept for
  later analysis.} 

For the data of Barrera~\textit{et~al.}~\cite{Barrera2008}, we
construct one picture showing the spatial distributions for all arrays
and both channels.

<<imageRG, results=hide,echo=TRUE>>=
RG1breaks <- c(0,quantile(RGs[[1]]$G, probs=seq(0,1,by=0.1)),2^16)
png("ccTutorialArrayImages.png", units="in", res=200,
     height=10.74*1.5, width=7.68*1.5)
par(mar=c(0.01,0.01,2.2,0.01))
layout(matrix(c(1,2,5,6,3,4,7,8,9,10,13,14,11,12,15,16),
       ncol=4,byrow=TRUE))
for (this.set in 1:4){
  thisRG <- RGs[[this.set]]
  for (this.channel in c("green","red")){
    my.colors <- colorRampPalette(c("black",paste(this.channel,c(4,1),
                                    sep="")))(length(RG1breaks)-1)
    for (arrayno in 1:2){
      image(thisRG, arrayno, channel=this.channel, 
            mybreaks=RG1breaks, mycols=my.colors)
      mtext(side=3, line=0.2, font=2, text=colnames(thisRG[[toupper(
            substr(this.channel,1,1))]])[arrayno])
}}}
dev.off()
@

\myincfig{ccTutorialArrayImages}{0.95\textwidth}{Spatial
  distribution of raw reporter intensities laid out by the reporter
  position on the microarray surface. Each pair of one green and one
  red image on top of each other are the Cy3 and Cy5 readouts of the
  same hybridized microarray.}

See Figure~\ref{ccTutorialArrayImages} for the images.
Minor artifacts can be seen. The arrays of the first set (48153 and 48172)
show a blotch of lower intensities in the upper right area of the array.
These artifacts affect only a small part of the
array and thus probably have a negligible effect 
on the results. The reporters in those affected areas of the array 
will yield meaningless readouts but enriched regions will be determined
based on a set of multiple reporters that are distributed over the
microarray surface and not on single reporters only.
Also a few arrays are brighter than others, which indicate higher raw
intensities for the respective arrays. 
These effects could be due to a larger amounts of DNA being hybridized.
The scaling step during  preprocessing later on is able to correct for
such shifts.

On all arrays in our set, the Cy3 channel holds the intensities from
the untreated \textit{input} sample, and the Cy5 channel holds the
ChIP result for heart and heart, respectively. 
We investigate whether this experiment setup is reflected in the
reporter intensity correlation per channel (see Figure S3).
Compare these two plots:
<<corPlotRG2G, fig=TRUE, include=FALSE, height=6, width=6, results=hide>>=
corPlot(log2(RGs[[2]]$G))
@ 
<<corPlotRG2R, fig=TRUE, include=FALSE, height=6, width=6, results=hide>>=
corPlot(log2(RGs[[2]]$R))
@ 
%
\begin{figure}[t!b]%
\centering
\subfigure[ ]{
\includegraphics[width=0.4\textwidth]{ccTutorialSupplement-corPlotRG2G}
}
\hspace{0.3cm}
\subfigure[ ]{
\includegraphics[width=0.4\textwidth]{ccTutorialSupplement-corPlotRG2R}
}
\label{corPlotsRG2}
\caption{\textit{Scatterplot and Spearman correlation of the raw
intensities from the two microarrays for
{\bf \textnormal{(a)}} the Cy3 channel, the genomic \textit{input} samples 
{\bf \textnormal{(b)}} the Cy5 channel, the H3K4me3-ChIP sample for
\emph{M. musculus} brain and heart cells.}}
\end{figure}

See Figure S\ref{corPlotsRG2} for plots comparing the two arrays.
In the scatter plots of raw reporter intensities,
the fraction of dots at the diagonal is higher for the
\textit{input} samples than for the ChIP samples.
Concordantly, the correlation between the intensities
of the \textit{input} samples is higher than between
the ChIP samples (0.877 versus 0.734).

We also show the same plots for two of the previous arrays with
artifacts (left and right panels in
Figure~\ref{arraysWithArtifacts.jpg}). 
With these arrays, the Cy3 channel also holds the \textit{input} samples,
while the Cy5 channel are the ChIP samples.

\begin{figure}[t!b]%
\centering
\subfigure[ ]{
\includegraphics[width=0.4\textwidth]{badRGcorPlotG.jpg}
}
\hspace{0.3cm}
\subfigure[ ]{
\includegraphics[width=0.4\textwidth]{badRGcorPlotR.jpg}
}
\label{corPlotsBadRG}
\caption{\textit{Scatterplot and Spearman correlation of the raw
intensities from two microarrays with artifacts (see
Figure~\ref{arraysWithArtifacts.jpg} for
{\bf \textnormal{(a)}} the Cy3 channel, the genomic \textit{input}
samples
{\bf \textnormal{(b)}} the Cy5 channel, ChIP sample for
\emph{M. musculus} brain and heart cells.}}
\end{figure}

These correlation plots are shown in Figure~\ref{corPlotsBadRG}. The
\textit{input} (Cy3) intensities do not show any correlation, while the
ChIP intensities of these samples show a much better correlation. This
is unexpected, since the ChIP samples used antibodies against
different histone modifications (H4ac and H3K4me2), while the
\textit{input} samples are both genomic DNA from the same cell type
(cell line C2C12).


\section{Mapping reporters to the genome}

A mapping of reporters to genomic coordinates is usually provided
by the array manufacturer. 
For several reasons, however, remapping the reporter
sequences to the genome may be useful.
Here, the microarray had been designed on an 
outdated assembly of the mouse genome (mm5, May 2004).
The reporter sequences were remapped to the current,
almost final assembly of the mouse genome (mm9, July 2007).
Remapping also allows you to specify custom criteria
for what degree of sequence identity you require for a match and for
uniqueness of a match.

We extracted the reporter nucleotide sequences from the downloaded
NDF (NimbleGen Design Files) files.
We re-mapped the reporter sequences to the genome, using
the alignment tool \emph{Exonerate} \cite{Slater2005}. 
We required 97\% sequence identity for a 
match\footnote{Remapping 1.5 million reporters took
about 100 processor hours on an AMD Opteron Processor 275.}.
Since the reporters on the microarray were 50mers, a sequence identity
of 97\% corresponds to one mismatch at most.
The implicit assumption is that if 49 out of 50 nucleotides are
complimentary that would be sufficient for hybridization.
We did not consider a more complex hybridisation model, in which the
position of the mismatch in the reporter sequence and its impact on
secondary structure formation are taken into account.
However, with one mismatch per 50mer, there is always a perfert match
segment of $\geq 25$~nucleotides in length.
A matching segment of 18 or more nucleotides in length was reported to
be sufficient for hybridization~\cite{Rennie2008}.

Exonerate was run matching the reporter sequences in the Fasta file
\texttt{RenMM5\-Tiling\-Probe\-Sequences.fsa} against each 
chromosome's sequence using the shell script 
\texttt{runExonerate.sh} and then condensing the per-chromosome
output files into one single file using the Perl script
\texttt{condense\-Exonerate\-Output.pl}\footnote{all the scripts mentioned
here are included in the \texttt{scripts} directory of the package}.

From this result file, we construct an object of class \Rclass{probeAnno}
to store the mapping between reporters and genome positions.
%
<<posToProbeAnno, cache=TRUE, results=hide>>=
probeAnno <- posToProbeAnno(file.path(system.file("exonerateData",
  package="ccTutorial"), "allChromExonerateOut.txt"))
allChrs <- chromosomeNames(probeAnno)
@ 
<<posToProbeAnnoExtent>>=
genome(probeAnno) <- "M. musculus (mm9)"
arrayName(probeAnno) <- "2005-06-17_Ren_MM5Tiling"
@ 
<<showProbeAnno>>=
show(probeAnno)
ls(probeAnno)
@ 
%
For each genomic position matched by a reporter, it was recorded
whether it was the sole match position of that reporter in the genome
or whether that reporter also matched other genomic
positions\footnote{More precisely, a reporter matches a 25-nucleotide
  ``segment''. That segment, however, is uniquely defined by the
  genomic position of its central nucleotide, which is why I refer to
  the ``position'' that is matched by a reporter.}.
For example, how many reporter-match positions on chromosome~9
are unique matches of those reporters (code:~0) and how many are
matched by reporters that have multiple matches in the genome
(code:~3)?
%
<<showProbeAnno2>>=
table(probeAnno["9.unique"])
@ 
%
The majority of match positions are unique reporter match
positions.
The intensities of reporters matching multiple genomic locations will
be excluded from later analysis (smoothing, identification of
ChIP-enriched regions), since the readouts of these reporters are
ambiguous.

\subsection{Average spacing between reporter mach positions}

<<reporterSpacing, figure=TRUE, include=FALSE, width=8, height=6, dens=TRUE>>=
startDiffByChr <- lapply(as.list(allChrs), function(chr){
  chrsta <- probeAnno[paste(chr,"start",sep=".")]
  chruni <- probeAnno[paste(chr,"unique",sep=".")]
  ## get start positions of unique reporter match positions
  return(diff(sort(chrsta[chruni=="0"])))})
startDiff <- unlist(startDiffByChr, use.names=FALSE)
table(cut(startDiff, breaks=c(0,50,99,100,200,1000,max(startDiff))))
@
%
The majority of unique reporter match positions have an offset of
100~bp between their start positions.


\section{Genome annotation}

Later on, found enriched regions will be related to annotated genome
features, such as gene start and end positions.
Using the Bioconductor package \Rpackage{biomaRt} \cite{Durinck2005},
we can obtain an up-to-date annotation of the mouse genome
from the Ensembl data base \cite{Birney2004}.

<<makeGffWithBiomaRt, cache=TRUE>>=
ensembl  <- useMart("ensembl", dataset="mmusculus_gene_ensembl")
gene.ids <- unique(unlist(lapply(as.list(c(1:19,"X","Y")), 
   function(this.chr) 
    getBM(attributes="ensembl_gene_id", filters="chromosome_name", 
          values=this.chr, mart=ensembl)[,1]), use.names=FALSE))
    sel.attributes=c("ensembl_gene_id", "mgi_symbol", "chromosome_name",
          "strand", "start_position","end_position", "description")
mm9genes <- getBM(attributes=sel.attributes, filters="ensembl_gene_id",
                  value=gene.ids, mart=ensembl)
@ 
%
For later use, we replace the formal element names retrieved
from the data base by simpler ones.
%
<<replaceMm9GenesNames>>=
mm9genes$name    <- mm9genes$"ensembl_gene_id"
mm9genes$gene    <- mm9genes$"ensembl_gene_id"
mm9genes$chr     <- mm9genes$"chromosome_name"
mm9genes$symbol  <- mm9genes$"mgi_symbol"
mm9genes$start   <- mm9genes$"start_position"
mm9genes$end     <- mm9genes$"end_position"
mm9genes$feature <- rep("gene",nrow(mm9genes))
@ 
%
Some genes occur in multiples in the table because an Ensembl gene can have
more than one MGI Symbol defined for it. We keep allow only one row in the
table per gene and append additional MGI symbols to the \emph{description}
element of each gene.
%
<<removeDuplicatedSymbols>>=
if (any(duplicated(mm9genes$name))){
  dupl <- unique(mm9genes$name[duplicated(mm9genes$name)])
  G <- lapply(as.list(dupl), function(this.gene){
    this.gff <- subset(mm9genes,name == this.gene)
    if (nrow(unique(this.gff[,c("name","chr","start","end",
        "description")]))>1) return(this.gff[1,,drop=FALSE])
    non.zero.gff <- subset(this.gff, nchar(symbol)>0)
    this.other.sym <- NULL
    if (nrow(non.zero.gff)> 0){
      shortest <- which.min(nchar(non.zero.gff$symbol))
      this.new.sym <- non.zero.gff$symbol[shortest]
      if (nrow(non.zero.gff)>1)
        this.other.sym <- paste("Synonyms", 
           paste(non.zero.gff$symbol[-shortest],collapse=","),sep=":")
    } else { this.new.sym <- "" }
    this.gff$symbol[1] <- this.new.sym
    if (!is.null(this.other.sym))
      this.gff$description[1] <- paste(this.gff$description[1],
                                       this.other.sym,sep=";")
    return(this.gff[1,,drop=FALSE])
  })
  mm9genes <- rbind(mm9genes[-which(mm9genes$name %in% dupl),],
                    do.call("rbind",G))
}
@ 
%
Finally, we reorder the table rows by gene chromosome and start position.
<<reorderMm9>>=
mm9genes <- mm9genes[order(mm9genes$chr, mm9genes$start),
  c("name","chr","strand","start","end","symbol","description","feature")]
rownames(mm9genes) <- NULL
@ 
% alternative:
<<loadMM9Genes, eval=FALSE, echo=FALSE>>=
data(mm9genes)
@ 
%
The resulting table holds the coordinates, Ensembl gene identifiers,
MGI symbols, and description of all the genes annotated for the
\emph{mm9} mouse assembly. Have a look at a few example lines from the
table.
%
<<setSeed1, echo=FALSE>>=
set.seed(1)
@ 
<<showmm9genes>>=
mm9genes[sample(seq(nrow(mm9genes)),4), 
  c("name", "chr", "strand", "start", "end", "symbol")]
@ 
%
We also retrieve the Gene Ontology (GO, \cite{Ashburner2000})
annotation for each gene, but discard those annotations 
that have only been
\emph{inferred from electronic annotation} (evidence code: IEA), 
are based on a \emph{non-traceable author statement} (NAS) 
or for which there is \emph{no biological data} (ND) available.
%
<<getGenesGOAnnotation, cache=TRUE>>=
ensembl  <- useMart("ensembl", dataset="mmusculus_gene_ensembl")
ontoGOs <- lapply(as.list(c("biological_process","cellular_component", 
                  "molecular_function")), function(onto){
  ontoBM <- getBM(mart=ensembl, attributes=c("ensembl_gene_id", 
                   paste("go",onto,"id", sep="_"), 
                   paste("go",onto,"linkage_type", sep="_")), 
                  filters="ensembl_gene_id", value=mm9genes$name)
  names(ontoBM) <- c("ensembl_gene_id","go","evidence_code")
  ontoBM <- subset(ontoBM,!( evidence_code %in% c("","IEA","NAS","ND")))
})
mm9GO <- do.call("rbind", ontoGOs)
@ 

<<getGOperGene, cache=TRUE>>=
mm9.gene2GO <- with(mm9GO, split(go, ensembl_gene_id))
@ 
<<loadGenesGOAnnotation, eval=FALSE, echo=FALSE>>=
data(mm9.gene2GO)
data(mm9.g2p)
@

Finally, we create a mapping of gene identifiers to reporters that
had been mapped into the gene or its upstream region.
%
<<mappingGenesToProbes, cache=TRUE, results=hide>>=
mm9.g2p <- features2Probes(gff=mm9genes, probeAnno=probeAnno)
@ 
%
<<showMm9g2p>>=
table(cut(listLen(mm9.g2p),breaks=c(-1,0,10,50,100,500,1200)))
@ 
%
This last table shows how many genes have that number of reporters
mapped into their upstream region or inside of them. The numbers
of reporters are given in open interval notation 
with, e.g., (10,50] meaning 11 to 50 reporters.

For later use, we determine which genes have a sufficient number
- arbitrarily we say 5 -
of reporters mapped to their upstream region or inside of them.
We also determine which of these genes have been annotated with at
least one GO term.
%
<<arrayGenes, cache=TRUE>>=
arrayGenes <- names(mm9.g2p)[listLen(mm9.g2p)>=5]
arrayGenesWithGO <- intersect(arrayGenes, names(mm9.gene2GO))
@ 


\section{Preprocessing}

We derive $\log_2$ fold changes Cy5/Cy3 for each reporter and scale
these by subtracting Tukey's biweight mean from each log2 ratio,
the standard scaling procedure suggested by NimbleGen.
We only perform this scaling procedure, since we are not aware of any
normalization method that is completely appropriate for ChIP-chip
with antibodies against histone modifications.
One common assumption of many normalization methods is that the
variation of almost all reporter levels does not reflect biological
variation between samples/conditions (\textit{input}, ChIP) but is
non-biological variation, \textit{e.~g.}, due to differences in sample
processing and hybridization.
This assumption probably does not hold in this case, as
the fraction of histones bearing post-translational modifications
cannot safely be assumed to be small.

Each of the four microarrays used contains a unique set of reporters.
Thus, we preprocess the arrays separately by
type and only then combine the results into
one object holding the preprocessed readouts for all reporters.
%
<<preprocess, cache=TRUE, results=hide>>=
MAs <- lapply(RGs, function(thisRG)
  preprocess(thisRG[thisRG$genes$Status=="Probe",], 
             method="nimblegen", returnMAList=TRUE))
MA <- do.call("rbind",MAs)
X  <- asExprSet(MA)
sampleNames(X) <- paste(X$Cy5, X$Tissue, sep=".")
@ 
%
The result is an object of class \Rclass{ExpressionSet},
the Bioconductor class for storing preprocessed microarray data.
Note that first creating an \Rclass{MAList} for each array type,
combining them with \Rfunction{rbind} and then converting the result
into an \Rclass{ExpressionSet} is only necessary if the reporters are
distributed over more than one microarray design (four in this
case). 
%
<<showX>>=
show(X)
@ 


\section{Preprocessed reporter intensities around the gene \emph{Crmp1} }

We visualize the preprocessed H3K4me3 ChIP-chip reporter-wise
readouts around the start of the \textsl{Crmp1} gene. 
H3K4me3 has frequently been shown to be associated to active 
transcription (e.\ g.\ , \cite{Fischer2008}) and the gene
\textsl{Crmp1} has been reported as being expressed in brain
cells~\cite{Hamajima1996}.
%
<<chipAlongChromCrmp1, fig=TRUE, include=FALSE, width=8, height=4, along=TRUE, results=hide>>=
plot(X, probeAnno, chrom="5", xlim=c(37.63e6,37.64e6), ylim=c(-3,5),
     gff=mm9genes, paletteName="Set2")
@
%
\myincfig{ccTutorialSupplement-chipAlongChromCrmp1}{0.9\textwidth}{%
Normalized reporter intensities for H3K4me3 ChIP around the TSS of the
gene~\textsl{Crmp1} in \emph{M. musculus} brain and heart cells. The
ticks on the genomic coordinate axis on below indicate genomic
positions matched by reporters on the microarray. 
The blue box above the genomic coordinate axis marks
the position of the gene~\textsl{Crmp1} with its position above the axis
indicating that the gene is located on the Watson strand.}

See the result in Figure~\ref{ccTutorialSupplement-chipAlongChromCrmp1}.
In brain cells, the intensities for enrichment of H3K4me3 around the
gene's start position tend to be positive, 
while the signal for heart cells is around or below zero.


\section{Smoothing of reporter intensities}

To ameliorate specific reporter effects as well as the stochastic
noise, we perform a smoothing over individual reporter intensities
before looking for ChIP-enriched regions.
A window of 900~bp width is slided along the chromosome, and the
reporter level at genomic position $x_0$ is replaced by the median
over the intensities of those reporters mapped inside the window
centered at $x_0$.

Factors taken into account in the choice of the sliding-window width
were the size distribution of DNA fragments after sonication
(commonly around 1~kbp) and the spacing between reporter matches
on the genome (mostly~100~bp).
We chose a window-width of 900~bp, which was slightly less than the
average fragment size and meant that most windows included $>=5$
reporters.
With this window width, we could be sure that the signal is not
smoothed over many fragments and was calculated as the median
over $\geq 5$ reporters.
At any position $x_0$ at which the window comprised less than three
reporter-matched positions, the smoothed level was flagged as missing,
as the data was insufficient to provide information about ChIP
enrichment at such a position.
%
<<smoothing, cache=TRUE, results=hide>>=
smoothX <- computeRunningMedians(X, probeAnno=probeAnno, 
   modColumn="Tissue", allChr=allChrs, winHalfSize=450, min.probes=5)
sampleNames(smoothX) <- paste(sampleNames(X),"smoothed",sep=".")
combX <- cbind2(X, smoothX)
@
%
Compare the smoothed reporter intensities with the non-smoothed ones around the
start of the gene \emph{Crmp1}.
%
<<smoothAlongChromCrmp1, fig=TRUE, include=FALSE, width=8, height=4, along=TRUE, results=hide>>=
plot(combX, probeAnno, chrom="5", xlim=c(37.63e6,37.64e6),
     gff=mm9genes, ylim=c(-3,5),
     colPal=c(brewer.pal(8,"Set2")[1:2],brewer.pal(8,"Dark2")[1:2]))
@ 
%
\myincfig{ccTutorialSupplement-smoothAlongChromCrmp1}{0.9\textwidth}{Normalized and smoothed reporter intensities for H3K4me3 ChIP around the TSS of the gene \emph{Crmp1} in \emph{M. musculus} brain and heart cells.}

See Figure \ref{ccTutorialSupplement-smoothAlongChromCrmp1} for a
comparison of the original and smoothed reporter levels around the
gene \emph{Crmp1}.


\section{Alternative methods for finding ChIP-enriched regions}

We have presented the algorithm of package \Rpackage{Ringo} for
finding Chip-enriched regions in ChIP-Chip against H3K4me3.
There are important differences between ChIP-chip against histone
modifications and ChIP-chip against transcription factors.
With transcription factors, it is safe to assume that the majority of
genomic regions will not show a real binding site for that
transcription factor.
Hence, most reporters on the microarray will not indicate true
enrichment, at least not when the tiling microarrays represent
the whole genome or an unbiased subset of the genome.
This situation is beneficial for the data preprocessing and for
identifying ChIP-enriched regions, since most of the data
can safely be assumed to show non-enrichment.
With histone modifications, on the other hand, the degree and extent
to which the genome shows a certain histone modification can only be
guessed at the present time.
This situation make estimation of the background distribution of
reporter levels under non-enrichment difficult.

Moreover, transcription factor binding sites are highly localized
point effects, meaning that the transcription factor binds at one
specific position directly or indirectly to the DNA and the signal
will show a peak shape around this position.
The highest point of the signal peak will be as close to the actual
binding site as the reporter-tiling on the microarray allows
(see~\cite{BourgonPhD} for an extended discussion and for a derived
model of TF ChIP-chip data).
With histone modifications, the enzyme which modifies the
histone tail is unlikely to act on only one single histone protein,
but will modify a number of nearby histones.
A single-nucleosome resolution study of histone modifications in
\emph{Saccharomyces cerevisiae} has shown that modifications occur in
the form of broad modified domains and that adjacent nucleosomes
mostly share the same modifications~\cite{Liu2005}.

Many other suggested algorithms for finding ChIP-enriched regions
are based on the assumption that the fraction of reporters that show
enrichment is very small and are therefore not applicable to ChIP-chip
against histone modifications.
However, the algorithm in \Rpackage{Ringo} is by no means the only
suitable algorithm for this task.
Users can choose to apply other algorithms that are contained in other
R/Bioconductor packages.
In the following, we demonstrate an example application of one other
algorithm to the data.

\phead{CMARRT}
The package \Rpackage{CMARRT}~\cite{Kuan2008} can be obtained from
\url{http://www.stat.wisc.edu/~kuanp/CMARRT}~.
Based on the example source code in the package vignette,
we use \Rpackage{CMARRT} to identify which regions are enriched by
ChIP against H3K4me3 in brain cells.
%
<<getChersXD, echo=FALSE>>=
data("chersX")
chersXD <- as.data.frame(chersX)
@
<<tryCmarrt, cache=TRUE>>=
cmarrtDat <- do.call("rbind", lapply(as.list(allChrs), function(chr){
  areUni <- probeAnno[paste(chr,"unique",sep=".")]==0
  chrIdx <- match(probeAnno[paste(chr,"index",sep=".")][areUni],
                  featureNames(X))
  chrDat <- data.frame("chr"=rep(chr, sum(areUni)),
     "start"=probeAnno[paste(chr,"start",sep=".")][areUni],
     "stop"=probeAnno[paste(chr,"end",sep=".")][areUni],
     "logR"=exprs(X)[chrIdx,1],
     stringsAsFactors=FALSE)
}))

cmarrtRes <- cmarrt.ma(cmarrtDat, M=0.5, frag.length=900,
                       window.opt = "fixed.gen.dist")

cmarrtReg <- cmarrt.peak(cmarrtRes, alpha=0.05, method="BY", minrun=4) 
cmarrtRegDf <- lapply(cmarrtReg, as.data.frame)$cmarrt.bound
names(cmarrtRegDf)[1:3] <- c("chr","start","end")
@ 
%
These are a few of the ChIP-enriched regions for H3K4me3 in brain cells,
as identified by \Rpackage{CMARRT}:
%
<<showCmarrt>>=
head(cmarrtRegDf)
@ 
%
We assess the overlap between ChIP-enriched regions identified by the
two methods, exemplarily for chromosome 9.
%
<<getRingoAndCmarrt>>=
ringoChersChr9 <- subset(chersXD, chr=="9" & cellType=="brain")
cmarrtChersChr9 <- subset(cmarrtRegDf, chr=="9")
dim(ringoChersChr9)
dim(cmarrtChersChr9)
@
%
The overlap between two ChIP-enriched regions
$R_{i, \mbox{\scriptsize \Rpackage{Ringo}}}$ and
$R_{j, \mbox{\scriptsize \Rpackage{CMARRT}}}$, which were identified
by \Rpackage{Ringo} and \Rpackage{CMARRT}, respectively,
is computed as
\begin{equation}
\textup{Ov}\left(R_{i, \mbox{\scriptsize \Rpackage{Ringo}}},
 R_{j, \mbox{\scriptsize \Rpackage{CMARRT}}} \right) =  \frac{%
\textnormal{length}\left( R_{i, \mbox{\scriptsize \Rpackage{Ringo}}}
\cap R_{j, \mbox{\scriptsize \Rpackage{CMARRT}}} \right)}{%
\min \left( \textnormal{length}(R_{i, h_1}),
\textnormal{length}(R_{j, h_2}) \right)}
\end{equation}
where ``$\cap$'' denotes region intersection and
$\textnormal{length}(R_i)$ is the length of region $R_i$ in
nucleotides.

<<overlapRingoVsCmarrt, cache=TRUE>>=
chersChr9Overlap <- as.matrix(
   regionOverlap(ringoChersChr9, cmarrtChersChr9))
minRegChr9Len <- outer(with(ringoChersChr9, end-start+1), 
                       with(cmarrtChersChr9, end-start+1), pmin)
fracChr9Overlap <- chersChr9Overlap /minRegChr9Len
@ 
<<showOverlapRingoVsCmarrt>>=
summary(apply(fracChr9Overlap, 1, max))
@ 
One average, a ChIP-enriched region identified by \Rpackage{Ringo} is
overlapped to $\approx 91\%$ by a ChIP-enriched region identified by
\Rpackage{CMARRT}. The identified regions are highly consistent
between the two methods.


\section{Comparing ChIP-enrichment between the tissues}

<<loadCherFinding, echo=FALSE, results=hide>>=
data(chersX)
chersXD <- as.data.frame(chersX)
@

First, we have taken a gene-centric position and consider which
genes are associated to each tissue specifically.

<<getGenesEnrichedPerTissue>>=
brainGenes <- getFeats(chersX[sapply(chersX, cellType)=="brain"])
heartGenes <- getFeats(chersX[sapply(chersX, cellType)=="heart"])
brainOnlyGenes <- setdiff(brainGenes, heartGenes)
heartOnlyGenes <- setdiff(heartGenes, brainGenes)
@ 

<<useTopGO, echo=FALSE, results=hide>>=
brainRes <- sigGOTable(brainOnlyGenes, gene2GO=mm9.gene2GO,
                       universeGenes=arrayGenesWithGO)
heartRes <- sigGOTable(heartOnlyGenes,  gene2GO=mm9.gene2GO,
                       universeGenes=arrayGenesWithGO)
@ 


\subsection{Enriched-region-wise comparison}

We compute the base-pair overlap between enriched regions found in brain cells
with those found in heart cells. 
We define that region $R_i$ is defined to \emph{overlap} with region
$R_j$ if
\begin{equation}
\textnormal{length}(R_i \cap R_j)~~\geq~~0.7 \cdot
\min(\textnormal{length}(R_i),\textnormal{length}(R_j))
\end{equation}
where $\textnormal{length}(R_i)$ denotes the length of region $R_i$ in
nucleotides.
We define an enriched region as tissue-specific if it does not overlap
with any region from another tissue according to the definition above.

<<computeRegionsOverlap, cache=TRUE, results=hide>>=
brainRegions <- subset(chersXD, cellType=="brain")
heartRegions <- subset(chersXD, cellType=="heart")
chersOBL <- as.matrix(regionOverlap(brainRegions, heartRegions))
minRegLen <- outer(with(brainRegions, end-start+1),
                   with(heartRegions, end-start+1), pmin)
fracOverlap <- chersOBL/minRegLen
@ 
%
<<compTissueSpecificRegions>>=
brainSpecReg <-  brainRegions[rowMax(fracOverlap)<0.7,]
heartSpecReg <-  heartRegions[rowMax(t(fracOverlap))<0.7,]
mean(is.element(unlist(strsplit(brainSpecReg$features, 
   split="[[:space:]]"), use.names=FALSE), brainOnlyGenes))
selGenes <- intersect(unlist(strsplit(brainSpecReg$features, 
   split="[[:space:]]"), use.names=FALSE), heartGenes)
@
%
Note that only
\Sexpr{round(mean(is.element(unlist(strsplit(brainSpecReg[["features"]], split="[[:space:]]"), use.names=FALSE), brainOnlyGenes)),digits=3)*100}\%
of the genes related to non-overlapping ChIP-enriched regions show such 
regions in brain cells only. 
The other genes show such regions in both tissues but their 
positions differ between the tissues.

We can assess whether these \Sexpr{length(selGenes)} 
genes show a typical positioning of H3K4me3
to each other, such as 'in heart cells they display H3K4me3 enriched
regions upstream of the genes, while in brain cells the show H3K4me3
between gene start and stop coordinates'. We use the genes-to-reporters
mapping that we have created earlier for this investigation.
%
<<targetPos>>=
targetPos <- seq(-5000, 10000, by=250)
@ 
%
For each gene, we assess the genomic region from 5kb upstream to
10 kb downstream of the gene start, obtain the fold-change values
for each distance bin and for all selected genes, summarize the
fold-changes by specified quantiles and obtain densities of
fold-change over distance to gene start.
We normalize these densities of the observed fold changes
by the densities of mapped reporters.
%
<<newQuantsOverPostions, cache=TRUE>>=
selQop <- quantilesOverPositions(smoothX,
   selGenes=selGenes, quantiles=c(0.5, 0.9),
   g2p=mm9.g2p, positions=targetPos)
@

<<sepRegGenesSmoothedQuantiles, fig=TRUE, include=FALSE, width=8, height=5, dens=TRUE, results=hide, keep.source=TRUE>>=
plot(selQop, c("green","orange"))
@
%
\myincfig{ccTutorialSupplement-sepRegGenesSmoothedQuantiles}{0.85\textwidth}{Densities of selected quantiles of the smoothed fold-changes for H3K4me3 ChIP in \emph{M. musculus} brain and heart cells for genes that show H3K4me3 enriched regions in both tissues but in separate positions.}
%
See Figure~\ref{ccTutorialSupplement-sepRegGenesSmoothedQuantiles}
for the densities.
There are no clear tissue-wise trends where these enriched regions are in
relation to the gene start coordinate. In both tissues, the smoothed
intensities on average are highest within 1kb after the gene start
coordinate, while in brain cells the density shows a second, smaller cher
within 1kb upstream of the gene start.

We also investigate genes that have separate enriched regions in both tissues
for over-represented GO annotations.

<<topGOsepRegGenes, results=hide>>=
sepRegRes <- sigGOTable(selGenes=selGenes, gene2GO=mm9.gene2GO,
                        universeGenes=arrayGenesWithGO)
print(sepRegRes)
@ 
<<printSepRegRes, echo=FALSE, results=tex>>=
# this chunk only provides a prettier output of the table sepRegRes
#  in latex format
print(xtable(sepRegRes, label="tab-sepRegResGO", 
   caption="\\sl GO terms that are significantly over-represented among genes that show different H3K4me3 regions in heart and brain cells"),
   type="latex", table.placement="htb", size="scriptsize",
   include.rownames=FALSE)
@
%
See the results in Table \ref{tab-sepRegResGO}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{ChIP results and expression microarray data}

Barrera~\textit{et\,al.}~\cite{Barrera2008} also provide expression
microarray data for five their analyzed \emph{M. musculus} tissues.

\subsection{Preprocess the microarray expression data}

The data were obtained from the supplementary web page
to the publication~\cite{Barrera2008},
imported into R and preprocessed as follows.
%
<<processBarerraExpressionData, cache=TRUE, keep.source=TRUE, results=hide>>=
library("affy")
library("mouse4302cdf")
AB <- ReadAffy(celfile.path=system.file("expression", 
                                        package="ccTutorial"))
barreraExpressionX <- mas5(AB)
barreraExpressionX$Tissue <- sapply(
   strsplit(sampleNames(barreraExpressionX),split="\\."),"[",3)
@

\subsection{Map Ensembl identifier to Affymetrix probe sets}

The expression data were generated using the \texttt{Mouse\_430\_2}
oligonucleotide microarray platform from Affymetrix.
Using biomaRt, we create a mapping of Ensembl gene identifiers
to the probe set identifiers on that microarray design.
% new one for author-provided expression data:
<<mapEnsToAffy, keep.source=TRUE, cache=TRUE, results=hide>>=
ensembl <- useMart("ensembl", dataset="mmusculus_gene_ensembl")
bmRes <- getBM(attributes=c("ensembl_gene_id","affy_mouse430_2"), 
               filters="ensembl_gene_id", value=arrayGenes, 
               mart=ensembl)
bmRes <- subset(bmRes, nchar(affy_mouse430_2)>0)
arrayGenesToProbeSets <- split(bmRes[["affy_mouse430_2"]], 
                               bmRes[["ensembl_gene_id"]])
@
How many probe sets are mapped to each gene?
%
<<lookArrayGenesToProbeSets>>=
table(listLen(arrayGenesToProbeSets))
@

\small

\section*{Software versions}

This supplement was generated using the following package versions:
%
<<sessionInfo, echo=FALSE, results=tex>>=
toLatex(sessionInfo())
@

\small
\section*{Acknowledgments}
%
This work was supported by the
European Union (FP6 HeartRepair, LSHM-CT-2005-018630).


%%% BIBLIOGRAPHY STARTS HERE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bibliographystyle{plos}
\bibliography{ccTutorial.bib}

\end{document}