\documentclass{article} \usepackage{url} \usepackage{hyperref} \usepackage{breakurl} \usepackage{amsmath} \usepackage{amssymb} %\VignetteIndexEntry{DMRcate for bisulfite sequencing assays (WGBS and RRBS)} %\VignetteEngine{knitr::knitr} \begin{document} \title{\texttt{DMRcate} for bisulfite sequencing} \author{Peters TJ} \maketitle \renewcommand{\abstractname}{Summary} \begin{abstract} Worked example to find DMRs from whole genome bisulfite sequencing data. \end{abstract} <>= if (!require("BiocManager")) install.packages("BiocManager") BiocManager::install("DMRcate") @ Load \texttt{DMRcate} into the workspace: <>= library(DMRcate) @ Bisulfite sequencing assays are fundamentally different to arrays, because methylation is represented as a pair of methylated and unmethylated reads per sample, instead of a single beta value. Although we could simply take the logit-transformed fraction of methylated reads per CpG, this removes the effect of varying read depth across the genome. For example, a sampling depth of 30 methylated reads and 10 unmethylated reads is a much more precise estimate of the methylation level of a given CpG site than 3 methylated and 1 unmethylated. Hence, we take advantage of the fact that the overall effect can be expressed as an interaction between the coefficient of interest and a two-level factor representing methylated and unmethylated reads \cite{Chenb}. The example shown here will be performed on a BSseq object containing bisulfite sequencing of regulatory T cells from various tissues as part of the \texttt{tissueTreg} package\cite{Delacher}, imported using ExperimentHub. First, we will import the data: <>= library(ExperimentHub) eh <- ExperimentHub() bis_1072 <- eh[["EH1072"]] bis_1072 colnames(bis_1072) @ The data contains 15 samples: 3 (unmatched) replicates of mouse Tregs from fat, liver, skin and lymph node, plus a group of 3 CD4+ conventional lymph node T cells (Tcon). We will annotate the BSseq object to reflect this phenotypic information: <>= bsseq::pData(bis_1072) <- data.frame(replicate=gsub(".*-", "", colnames(bis_1072)), tissue=substr(colnames(bis_1072), 1, nchar(colnames(bis_1072))-3), row.names=colnames(bis_1072)) colData(bis_1072)$tissue <- gsub("-", "_", colData(bis_1072)$tissue) as.data.frame(colData(bis_1072)) @ For standardisation purposes (and for \texttt{DMR.plot} to recognise the genome) we will change the chromosome naming convention to UCSC: <>= bis_1072 <- renameSeqlevels(bis_1072, mapSeqlevels(seqlevels(bis_1072), "UCSC")) @ For demonstration purposes, we will retain CpGs on chromosome 19 only: <>= bis_1072 <- bis_1072[seqnames(bis_1072)=="chr19",] bis_1072 @ Now we can prepare the model to be fit for \texttt{sequencing.annotate()}. The arguments are equivalent to \texttt{cpg.annotate()} but for a couple of exceptions: \begin{itemize} \item There is an extra argument \texttt{all.cov} giving an option whether to retain only CpGs where \textit{all} samples have non-zero coverage, or whether to retain CpGs with only partial sample representation. \item The design matrix should be constructed to reflect the 2-factor structure of methylated and unmethylated reads. Fortunately, \texttt{edgeR::modelMatrixMeth()} can take a regular design matrix and transform is into the appropriate structure ready for model fitting. \end{itemize} <>= tissue <- factor(pData(bis_1072)$tissue) tissue <- relevel(tissue, "Liver_Treg") #Regular matrix design design <- model.matrix(~tissue) colnames(design) <- gsub("tissue", "", colnames(design)) colnames(design)[1] <- "Intercept" rownames(design) <- colnames(bis_1072) design #Methylation matrix design methdesign <- edgeR::modelMatrixMeth(design) methdesign @ Just like for \texttt{cpg.annotate()}, we can specify a contrast matrix to find our comparisons of interest. <>= cont.mat <- limma::makeContrasts(treg_vs_tcon=Lymph_N_Treg-Lymph_N_Tcon, fat_vs_ln=Fat_Treg-Lymph_N_Treg, skin_vs_ln=Skin_Treg-Lymph_N_Treg, fat_vs_skin=Fat_Treg-Skin_Treg, levels=methdesign) cont.mat @ Say we want to find DMRs between the regulatory and conventional T cells from the lymph node. First we would fit the model, where \texttt{sequencing.annotate()} transforms counts into log2CPMs (via \texttt{limma::voom()}) and uses \texttt{limma} under the hood to generate per-CpG \textit{t}-statistics, indexing the FDR at 0.05: <>= seq_annot <- sequencing.annotate(bis_1072, methdesign, all.cov = TRUE, contrasts = TRUE, cont.matrix = cont.mat, coef = "treg_vs_tcon", fdr=0.05) seq_annot @ And then, just like before, we can call DMRs with \texttt{dmrcate()}: <>= dmrcate.res <- dmrcate(seq_annot, C=2, min.cpgs = 5) dmrcate.res treg_vs_tcon.ranges <- extractRanges(dmrcate.res, genome="mm10") treg_vs_tcon.ranges @ Looks like the top DMR is associated with the \textit{Jak2} locus and hypomethylated in the Treg cells (since \texttt{meandiff < 0}). We can plot it like so: <>= cols <- as.character(plyr::mapvalues(tissue, unique(tissue), c("darkorange", "maroon", "blue", "black", "magenta"))) names(cols) <- tissue DMR.plot(treg_vs_tcon.ranges, dmr = 1, CpGs=bis_1072[,tissue %in% c("Lymph_N_Tcon", "Lymph_N_Treg")], phen.col = cols[tissue %in% c("Lymph_N_Tcon", "Lymph_N_Treg")], genome="mm10") @ Now, let's find DMRs between fat and skin Tregs. <>= seq_annot <- sequencing.annotate(bis_1072, methdesign, all.cov = TRUE, contrasts = TRUE, cont.matrix = cont.mat, coef = "fat_vs_skin", fdr=0.05) @ Because this comparison is a bit more subtle, there are very few significantly differential CpGs at this threshold. So we can use \texttt{changeFDR()} to relax the FDR to 0.25, taking into account that there is an increased risk of false positives. <>== seq_annot <- changeFDR(seq_annot, 0.25) @ <>= dmrcate.res <- dmrcate(seq_annot, C=2, min.cpgs = 5) fat_vs_skin.ranges <- extractRanges(dmrcate.res, genome="mm10") @ Now let's plot the top DMR with not only fat and skin, but with all samples: <>= cols DMR.plot(fat_vs_skin.ranges, dmr = 1, CpGs=bis_1072, phen.col = cols, genome="mm10") @ Here we can see the methylation of skin cells over this region near the \textit{Gcnt1} promoter is hypomethylated not only relative to fat, but to the other tissues as well. <>= sessionInfo() @ \begin{thebibliography}{99} \bibitem{Chenb} Chen Y, Pal B, Visvader JE, Smyth GK. Differential methylation analysis of reduced representation bisulfite sequencing experiments using edgeR. \emph{F1000Research}. 2017 \textbf{6}, 2055. \bibitem{Delacher} Delacher M, Imbusch CD, Weichenhan D, Breiling A, Hotz-Wagenblatt A, Trager U, ... Feuerer M. (2017). Genome-wide DNA-methylation landscape defines specialization of regulatory T cells in tissues. \emph{Nature Immunology}. 2017 \textbf{18}(10), 1160-1172. \end{thebibliography} \end{document}