%\VignetteIndexEntry{Analysis of single cell mTEC data.} %\VignettePackage{Single.mTec.Transcriptomes} %\VignetteEngine{knitr::knitr} % To compile this document % library('knitr'); rm(list=ls()); knit('DESeq2.Rnw') \documentclass[12pt]{article} \usepackage[numbers,sort&compress]{natbib} \usepackage{amssymb,amsfonts,amsmath} \usepackage{amsmath} \newcommand{\lowtilde}{\raise.17ex\hbox{$\scriptstyle\mathtt{\sim}$}} \renewcommand{\figurename}{Plot } \renewcommand{\thefigure}{\Roman{figure}} <>= library("knitr") knit_hooks$set(fig.bg = function(before, options, envir) { if (before) par(bg = options$fig.bg) }) opts_chunk$set(tidy=FALSE,dev="png",fig.show="hide", fig.width=4,fig.height=4.5, message=FALSE, fig.bg='white') @ <>= BiocStyle::latex() @ \newcommand{\papertitle}{\emph{Single-cell transcriptome analysis reveals coordinated ectopic-gene expression patterns in medullary thymic epithelial cells}} \newcommand{\listofauthors}{Philip Brennecke, Alejandro Reyes, Sheena Pinto, Kristin Rattay,\\Michelle Nguyen, Rita K\"uchler, Wolfgang Huber, Bruno Kyewski and Lars M. Steinmetz} \renewcommand{\citenumfont}[1]{Supp#1} \renewcommand{\bibnumfmt}[1]{[Supp#1]} \author{\listofauthors} \title{Supplementary Code I: \papertitle} \begin{document} \date{} \maketitle \begin{abstract} This document contains all the code used to analyse the single-cell RNA-seq and the bulk ATAC-seq data from the manuscript titled: \papertitle. The purpose of this document is to provide full transparency of the results presented in the manuscript, as well as to provide a documented and reproducible workflow of the code that was used to generate each number and figure from the manuscript. The detailed legends for each figure can be found in the main text and the Supplementary Figures file. \end{abstract} The data package accompanying this vignette, \Rpackage{Single.mTEC.Transcriptomes}, provides data objects from several sources, including data from~\cite{sansom2014}, among others. Each object is documented in the manual pages of this experiment data package. <>= library("DESeq2") library(GenomicRanges) library(GenomicFeatures) library(genefilter) library(statmod) library(gdata) library(RColorBrewer) library(genefilter) library(ggplot2) library(gplots) library(cluster) library(clue) library(grid) library(gridExtra) library(Single.mTEC.Transcriptomes) @ \section{Mapping statistics} For each single-cell transcriptome, we mapped the sequenced fragments to the Mouse reference genome GRCm38 downloaded from ENSEMBL release 75. For each sample of each sequencing batch, we classified the reads as either mapping uniquely to the reference genome, mapping multiple times to the reference genome, reads which we could not assign to any position of the reference genome or others (e.g reads that only one read pair could be mapped but the read other could not be mapped). The resulting figure can be found in Plot~\ref{fig:mappingstats} of this document. <>= data("percentsGG") ggplot( percentsGG, aes(index, percent, fill=type) ) + geom_bar(stat="identity") + facet_grid( batch ~ . ) + theme(axis.title = element_text(size = 18), axis.text = element_text(size=12), legend.text = element_text(size = 12), axis.text.x=element_text(angle=90), legend.position="top") @ \begin{figure} \centering \includegraphics[width=.8\textwidth]{figure/Figure_plot1_MappingStats-1} \caption{ \textbf{Mapping statistics stratified by the alignment type.}} \label{fig:mappingstats} \end{figure} \section{Identifying variable genes} We summarize single-cell transcriptomes in the form of count matrices in which each row represents a gene and each column represents one cell. The matrix is filled with the number of sequenced fragments whose genomic alignment overlaps with the genomic coordinates of the genes. For this, we only use cells for which more than 40\% of the sequenced fragments could be uniquely assigned to the Mouse reference genome. We also discarded 28 cells from the batch 4 (since they were from another cell type that was sequenced together with some of the mTECs). With the final set of high-quality sequenced transcriptomes, we generated a \emph{DESeq2DataSet} object that also contains the annotation for each cell. We define also the number of cores to use for the calculations. <>= data("mTECdxd") numCores=1 @ In order to identify genes whose biological variation is above the expected technical noise, we used the method described by \cite{brennecke2013}. We modified the code from the supplementary material from \cite{brennecke2013}, and implemented the function \Rfunction{testForVar}. This function identifies the genes whose coefficient of variation varies for more than 50\% at a FDR of 10\% and plots the results (e.g. the one presented in the Plot~\ref{fig:deGenesNone}). <>= testForVar <- function( countTable, FDR=0.1, minVar=.5, main=""){ normalizedCountTable <- t( t(countTable)/estimateSizeFactorsForMatrix(countTable) ) splitCounts <- split(as.data.frame(countTable), grepl("ERCC", rownames(countTable))) mouseCounts <- splitCounts[["FALSE"]] spikeCounts <- splitCounts[["TRUE"]] mouseNF <- estimateSizeFactorsForMatrix( mouseCounts ) spikeNF <- estimateSizeFactorsForMatrix( spikeCounts ) mouseCounts <- t( t( mouseCounts ) / mouseNF ) spikeCounts <- t( t( spikeCounts ) / spikeNF ) meansSpike <- rowMeans( spikeCounts ) varsSpike <- rowVars( spikeCounts ) cv2Spike <- varsSpike / meansSpike^2 minMeanForFit <- unname( quantile( meansSpike[ which( cv2Spike > .3 ) ], .9 ) ) useForFit <- meansSpike >= minMeanForFit fit <- glmgam.fit( cbind( a0 = 1, a1tilde = 1/meansSpike[useForFit] ), cv2Spike[useForFit] ) xi <- mean( 1 / spikeNF ) a0 <- unname( fit$coefficients["a0"] ) a1 <- unname( fit$coefficients["a1tilde"] - xi ) meansMouse <- rowMeans( mouseCounts ) varsMouse <- rowVars( mouseCounts ) cv2Mouse <- varsMouse / meansMouse^2 psia1theta <- mean( 1 / mouseNF ) + a1 * mean( spikeNF / mouseNF ) minBiolDisp <- minVar^2 m <- ncol( mouseCounts ) cv2th <- a0 + minBiolDisp + a0 * minBiolDisp testDenom <- ( meansMouse * psia1theta + meansMouse^2 * cv2th ) / ( 1 + cv2th/m ) p <- 1 - pchisq( varsMouse * (m-1) / testDenom, m-1 ) padj <- p.adjust(p, method="BH") sig <- padj < FDR deGenes <- names( which( sig ) ) suppressWarnings( plot( NULL, xaxt="n", yaxt="n", bty = 'l', log="xy", xlim = c( 1e-2, 3e5 ), ylim = c( .005, ncol(mouseCounts) ), xlab = "Average normalized read count", ylab = "Squared coefficient of variation (SCV)", cex.lab=1.3, main=main)) axis( 1, 10^(-1:5), c( "0.1", "1", "10", "100", "1000", expression(10^4), expression(10^5) ), cex.axis=1.25) axis( 2, 10^(-2:2), c( "0.01", "0.1", "1", "10", "100"), las=2, cex.axis=1.25) # abline( h=10^(-2:1), v=10^(-1:5), col="#D0D0D0" ) points( meansMouse, cv2Mouse, pch=20, cex=.2, col = ifelse( padj < .1, "#b2182b", "#878787" ) ) points( meansSpike, cv2Spike, pch=20, cex=.8, col="black" ) xg <- 10^seq( -2, 6, length.out=1000 ) lines( xg, (xi+a1)/xg + a0, col="black", lwd=3)#, lty="dashed") lines( xg, psia1theta/xg + a0 + minBiolDisp, col="#67001f", lwd=3 ) return( deGenes ) } @ We run the function \Rfunction{testForvar} with the data from those cells that were not selected for the expression of a TRA-encoding gene (Plot~\ref{fig:deGenesNone}). <>= deGenesNone = testForVar( countTable=counts(dxd)[,colData(dxd)$SurfaceMarker == "None"] ) #save(deGenesNone, file="../data/deGenesNone.RData") length(deGenesNone) nrow(dxd) ncol(dxd) table( grepl("ERCC", rownames(dxd)) ) @ The analysis reveals more than 9,000 genes with a coefficient of variation larger than 50\% at a False Discovery Rate (FDR) of 10\%. This set of genes is used for further analysis. <>= cat( sprintf("The number of genes with coefficient of variation larger than 50 percent at FDR of 0.1 is: %s\n", length(deGenesNone)) ) @ \begin{figure} \centering \includegraphics[width=.5\textwidth]{figure/Figure_1C_variableNoMarker-1} \caption{ \textbf{Highly variable genes of the unselected cells (n=203).} Corresponds to Figure 1C} \label{fig:deGenesNone} \end{figure} %We used the same function to test for <>= deGenes = testForVar( countTable=counts(dxd) ) @ %We run the function also for each group of cells that were selected for being %Tspan8 positive or Ceacam1 positive. <>= par(mfrow=c(1, 2)) deGenesCeacam1 <- testForVar( counts(dxd)[,colData(dxd)$SurfaceMarker == "Ceacam1"], main=expression(Ceacam^+ cells~FACS) ) deGenesTspan8 <- testForVar( counts(dxd)[,colData(dxd)$SurfaceMarker == "Tspan8"], main=expression(Tspan^+ cells~FACS)) @ %\begin{figure} %\centering %\includegraphics[width=.7\textwidth]{figure/Figure_Supp3_variableSubGroups-1} %\caption{ % \textbf{Differentially variable genes, TRA positive cell fraction (FACS)}} %\label{fig:deGenesMarkers} %\end{figure} %Numbers of variable genes within the Tspan8 and Ceacam1 positive (FACS) cells <>= length(deGenesCeacam1) length(deGenesTspan8) @ \section{Number of TRA genes detected per cell compared to the number of genes.} We explore the number of TRA genes (as defined by~\cite{pinto2013}) and compare this number with the the total number of genes detected per cell. The results can be seen in Plot~\ref{fig:genesvstras}. <>= data("tras") tras <- unique( tras$`gene.ids` ) data("geneNames") data("biotypes") proteinCoding <- names( which( biotype == "protein_coding" ) ) isTRA <- rownames( dxd ) %in% tras & rownames(dxd) %in% proteinCoding isProteinCoding <- rownames(dxd) %in% proteinCoding sum(isProteinCoding) par(mar=c(5, 5, 1, 1)) unselectedCells <- colData(dxd)$SurfaceMarker=="None" sum(unselectedCells) plot( colSums(counts(dxd)[isProteinCoding,unselectedCells] > 0), colSums(counts(dxd)[isTRA,unselectedCells] > 0), pch=19, cex=.7, col="#00000080", # col=lscol, cex.lab=1.3, cex.axis=1.3, xlab="Number of genes detected", ylab="Number of TRA genes detected") @ \begin{figure} \centering \includegraphics[width=.4\textwidth]{figure/Figure_1A_trasvsgenes-1} \caption{ \textbf{Number of detected genes vs number of TRA genes.} Corresponds to Figure 1a.} \label{fig:genesvstras} \end{figure} The two numbers seem very correlated and seem to follow a linear relation, indicating that the number of TRA genes that are detected within a cell is proportional to the number of detected genes. The distribution of the fraction of detected genes classified as TRA is depicted in Plot~\ref{fig:percentages}. <>= par(mar=c(5, 5, 1, 1)) hist( colSums(counts(dxd)[isTRA,unselectedCells] > 0)/ colSums(counts(dxd)[isProteinCoding,unselectedCells] > 0), col="white", cex.axis=1.4, xlab="", cex.lab=1.4, pch=19, cex=.5, main="", xlim=c(0, .5)) title(xlab="Fraction of detected genes\n classified as TRA", line=4, cex.lab=1.4) @ <>= rng <- range( colSums(counts(dxd)[isTRA,unselectedCells] > 0)/ colSums(counts(dxd)[isProteinCoding,unselectedCells] > 0) ) rng <- round(rng*100, 2) summary(rng) sd(rng) cat(sprintf("The proportion of TRAs expressed per cell with respect to the number of protein coding genes ranges from %s to %s percent\n", rng[1], rng[2] ) ) @ \begin{figure} \centering \includegraphics[width=.4\textwidth]{figure/Figure_Supp1_percentageTRAs-1} \caption{ \textbf{Histogram of the distribution of percentage of TRAs detected per cell.} Corresponds to Figure S1.} \label{fig:percentages} \end{figure} Next, we explore the saturation of genes and TRA genes detection by counting how many genes are detected when progressively adding one cell at the time. To remain our estimates as conservative as possible, we sort the cells based on the number of detected genes (increasingly from left to right in the Plot~\ref{fig:saturation}). <>= detectedTRAs <- 0+( counts(dxd)[isTRA,unselectedCells] > 0 ) cellOrder <- order( colSums(counts(dxd)[isProteinCoding,unselectedCells] > 0) ) cumFreqTRAs <- sapply( seq_along(cellOrder), function(x){ sum(!rowSums( detectedTRAs[,cellOrder[seq_len(x)], drop=FALSE] ) == 0) }) detectedGenes <- 0+( counts(dxd)[isProteinCoding,unselectedCells] > 0 ) cumFreqGenes <- sapply( seq_along(cellOrder), function(x){ sum(!rowSums( detectedGenes[,cellOrder[seq_len(x)], drop=FALSE] ) == 0) }) par(mar=c(5, 6, 2, 1)) plot(cumFreqTRAs/nrow(detectedTRAs), type="l", lwd=4, col="#1b7837", cex.axis=1.4, cex.lab=1.4, ylab="Cumulative fraction\nof genes detected", ylim=c(0, 1), xlab="Number of mature mTECs") lines(cumFreqGenes/nrow(detectedGenes), type="l", lwd=4, col="#762a83") legend(x=10, y=.3, legend=c("TRA-encoding genes", "Protein coding genes"), fill=c("#1b7837", "#762a83"), cex=1.2, bty="n") @ \begin{figure} \centering \includegraphics[width=.4\textwidth]{figure/Figure_1B_saturation-1} \caption{ \textbf{Cumulative frequency of detected protein coding genes and TRA genes detected per cell.} Corresponds to Figure 1B.} \label{fig:saturation} \end{figure} Pooling our set of 203 mature mTECs, we are able to detect above 85\% percent of both TRA genes and the rest of the protein coding genes (Plot~\ref{fig:saturation}). <>= cat(sprintf("Total fraction of TRA genes detected: %s", round( max(cumFreqTRAs/nrow(detectedTRAs)),2) ) ) cat(sprintf("Total number of TRA genes: %s", nrow(detectedTRAs) ) ) cat(sprintf("Total fraction of protein coding genes detected: %s\n", round( max(cumFreqGenes/nrow(detectedGenes)), 2) ) ) cat(sprintf("Total number of detected protein coding genes: %s", max(cumFreqGenes))) cat(sprintf("Total number of protein coding genes: %s", nrow(detectedGenes) ) ) @ \section{ Variable genes are enriched for TRA genes } We create a contigency table comparing the highly variable protein coding genes with respect to all protein coding genes that were not detected as highly variable. <>= background <- rownames(dxd)[!rownames(dxd) %in% deGenesNone] background <- names( which( rowSums( counts(dxd)[background,] > 0 ) != 0 ) ) background <- intersect(background, proteinCoding) foreground <- intersect(deGenesNone, proteinCoding ) allTras <- intersect( tras, proteinCoding ) mat <- rbind( `variable genes`=table( !foreground %in% allTras ), `background`=table( !background %in% allTras ) ) colnames( mat ) <- c("is TRA", "is not TRA") mat[,1]/rowSums(mat) par(mar=c(4, 6, 1, 1)) barplot( mat[,1]/rowSums(mat), names.arg=c("variable", "not-variable"), las=1, col="black", cex.axis=1.2, cex.lab=1.3, cex=1.3, ylab="Fraction of\nTRA-encoding genes") @ \begin{figure} \centering \includegraphics[width=.4\textwidth]{figure/Figure_Supp2_traenrichment-1} \caption{ \textbf{Highly variable genes are enriched for TRA-encoding genes.} Corresponds to Fig. S2.} \label{fig:fisher1} \end{figure} Then, we run the Fisher test on the contigency table, which indicates that the variable genes within the 203 mature mTECs are significantly enriched for TRA genes. <>= fisher.test(mat) @ \section{Both Aire-dependent and Aire-independent TRAs are expressed at low frequencies in single mTECs} This section investigates the number of TRA genes with respect to Aire dependency as defined by \cite{sansom2014}. We also use data from the FANTOM consortium (91 manually selected tissues, manually excluding samples that were composed of more than one tissue, e.g. whole body)~\cite{fantom}. For the definition of a gene being detected for a tissues, a threshold of at least five counts was selected, nevertheless this is not a critical parameter for our analysis (One can vary it and the result remains the same). <>= data("fantom") data("aireDependentSansom") aireDependent <- aireDependentSansom countTable <- counts(dxd)[,colData(dxd)$SurfaceMarker == "None"] meansFANTOM <- sapply( split(seq_len(ncol(dxdFANTOM)), colData( dxdFANTOM )$tissue), function(x){ rowMeans( counts(dxdFANTOM, normalized=TRUE)[,x, drop=FALSE] ) }) meansFANTOM <- sapply( split( seq_len( nrow(meansFANTOM) ), sapply( strsplit( rownames( meansFANTOM ), "," ), "[[", 1 )), function(x){ colMeans( meansFANTOM[x,,drop=FALSE] ) }) meansFANTOM <- t( meansFANTOM ) cat( sprintf("The total number of tissues used from the FANTOM dataset was:%s\n", length( unique( colnames(meansFANTOM) ) ) ) ) matchedIndexes <- match( geneNames, rownames(meansFANTOM)) stopifnot( rownames(meansFANTOM)[matchedIndexes[!is.na(matchedIndexes)]] == geneNames[!is.na(matchedIndexes)] ) rownames(meansFANTOM)[matchedIndexes[!is.na(matchedIndexes)]] <- names( geneNames[!is.na(matchedIndexes)] ) meansFANTOM <- meansFANTOM[grep("ENS", rownames(meansFANTOM)),] numbersOfTissues <- rowSums( meansFANTOM > 5 ) numbersOfTissues <- numbersOfTissues[names(numbersOfTissues) %in% deGenesNone] aireDependent <- aireDependent[aireDependent %in% deGenesNone] numbersOfCells <- rowSums( countTable[names(numbersOfTissues),] > 0 ) table( numbersOfTissues < 10 ) FifteenPercent <- round( sum(colData(dxd)$SurfaceMarker == "None") * .15 ) tableResults <- table( lessThan10Cells=numbersOfCells < FifteenPercent, isAireDependent=names(numbersOfTissues) %in% aireDependent, isLessThan10Tissues=numbersOfTissues < 10) cat( sprintf("Total number of genes detected in less than 10 tissues: %s\n", sum( tableResults[,,"TRUE"]) ) ) cat(sprintf("From which %s are Aire dependent and %s are Aire-independent\n", colSums( tableResults[,,"TRUE"] )["TRUE"], colSums( tableResults[,,"TRUE"] )["FALSE"] ) ) tableResults["TRUE",,"TRUE"] percentsLessThan15 <- round( tableResults["TRUE",,"TRUE"]/colSums( tableResults[,,"TRUE"] ), 2) cat(sprintf("And %s and %s were detected in less than 15 percent of the cells, respectively\n", percentsLessThan15[2], percentsLessThan15[1])) @ The Plot~\ref{fig:histogram} replicates the results from~\cite{sansom2014}, that shows that Aire dependent genes are transcribed at low frequencies in mTECs. Additionally, our analysis shows that transcription at low frequencies is a common property of both Aire-dependent and Aire-independent TRA-encoding genes. <>= df <- data.frame( numberOfCells=numbersOfCells, numberOfTissues=numbersOfTissues, aireDependent=ifelse( names(numbersOfCells) %in% aireDependent, "Aire-dependent genes", "Aire-independent genes"), TRAs=ifelse( names(numbersOfCells) %in% tras, "TRA", "not TRA" ) ) histogramList <- lapply(c("Aire-independent genes", "Aire-dependent genes"), function(x){ dfAD <- df[df$aireDependent == x,] m <- ggplot( dfAD, aes( x=numberOfCells ) ) m <- m + geom_histogram(colour = "black", fill = "white", binwidth = 5) m <- m + facet_grid( ~ aireDependent ) m <- m + xlab("Mature mTECs with\ngene detection") + ylab("Number of genes") + scale_x_continuous(limits=c(0, ncol(countTable)+0), expand=c(.01,0) ) + ylim(0, 600) m <- m + theme(axis.text.x = element_text(size=14, colour="black"), panel.border = element_rect(colour = "white", fill=NA), panel.background = element_rect(colour="white", fill="white"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line=element_line(colour="black"), axis.text.y = element_text(size=14, colour="black"), axis.title=element_text(size=14, vjust=.2), strip.text.x = element_text(size = 14), axis.ticks=element_line(colour="black"), strip.background = element_rect(colour="white", fill="white")) m }) names(histogramList) <- c("Aire-independent genes", "Aire-dependent genes") scatterList <- lapply(c("Aire-independent genes", "Aire-dependent genes"), function(x){ dfAD <- df[df$aireDependent == x,] m2 <- ggplot(dfAD, aes(x=numberOfCells,y=numberOfTissues)) + geom_point(colour="#15151540", size=1.2) + facet_grid( ~ aireDependent ) + xlab("Mature mTECs with\n gene detection") + ylab("Tissues with gene detection") + geom_hline(yintercept=10, size=1.1, colour="darkred") + #linetype="dashed") + scale_y_continuous(limits=c(0, max(df$numberOfTissues)+3), expand=c(0,0) ) + scale_x_continuous(limits=c(0, ncol(countTable)+0), expand=c(0.01,0) ) m2 <- m2 + theme( strip.background = element_rect(colour="white", fill="white"), strip.text.x =element_text(size = 14), panel.border = element_rect(colour = "white", fill=NA), axis.line=element_line(colour="black"), axis.text = element_text(size=14, colour="black"), axis.title=element_text(size=14, vjust=.2), legend.position="none", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()) m2}) names(scatterList) <- c("Aire-independent genes", "Aire-dependent genes") @ <>= print( histogramList[["Aire-independent genes"]]) @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_1D1_histogramAire-1} \caption{ \textbf{Histogram and heatscatter of genes stratified by Aire regulation.}} \label{fig:histogram} \end{figure} <>= print( histogramList[["Aire-dependent genes"]]) @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_1D2_histogramAire-1} \caption{ \textbf{Histogram and heatscatter of genes stratified by Aire regulation.}} \label{fig:histogram2} \end{figure} <>= print( scatterList[["Aire-dependent genes"]]) @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_1D3_histogramAire-1} \caption{ \textbf{Histogram and heatscatter of genes stratified by Aire regulation.}} \label{fig:histogram3} \end{figure} <>= print( scatterList[["Aire-independent genes"]]) @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_1D4_histogramAire-1} \caption{ \textbf{Histogram and heatscatter of genes stratified by Aire regulation.}} s\label{fig:histogram4} \end{figure} \section{ Gene-gene correlation analyses } Since cell cycle is a potential confounding factor for gene-gene correlations, we regress out cell cycle variation by using the method \emph{scLVM} described in~\cite{scLVM}. Then, in order to explore potential patterns of co-regulation in mTEC cells, we use k-medoids clustering. Importantly, k-medoids minimizes the eucledian distance between the data points labeled to be in a cluster and a point designated as the center of that cluster (centroid). We then assess the stability of the clusters using the \emph{R} package ``clue'', we resampled 1,000 times and estimate the consensus clustering. As we are performing 1,000 permutations to asses the stability of the clusters, the code below does not run during the compilation of this document. Instead, the results were pre-saved in an object of the data package. To continue with the vignette, we load the object that contains the pre-calculated results of the k-medoids clustering. <>= library(clue) library(cluster) data("scLVM_output") defineGeneConsensusClustering <- function( expressionMatrix=Ycorr, selectGenes="", nClusters=12, B=40, prob=0.8, nCores=20, ...){ mat <- expressionMatrix[rownames(expressionMatrix) %in% selectGenes,] ce <- cl_ensemble( list=mclapply( seq_len(B), function(dummy){ subSamples <- sample(colnames(mat), round( ncol(mat) * prob ) ) pamSub <- pam( mat[,subSamples], nClusters ) pred <- cl_predict( pamSub, mat[,subSamples], "memberships" ) as.cl_partition(pred) }, mc.cores=nCores )) cons <- cl_consensus( ce ) ag <- sapply( ce, cl_agreement, y=cons ) return(list(consensus=cons, aggrementProbabilities=ag)) } nomarkerCellsClustering <- defineGeneConsensusClustering( expressionMatrix=Ycorr[,colData(dxd)$SurfaceMarker == "None"], selectGenes=intersect( deGenesNone, aireDependent ), nClusters=12, B=1000, prob=0.8, nCores=10 ) save( nomarkerCellsClustering, file="../data/nomarkerCellsClustering.RData") @ We visualize the result of the clustering next to the gene-gene Spearman correlation matrix. The resulting heatmap reveals that most of the defined gene clusters, as expected, tend to have higher correlations with the genes grouped in the same the cluster compared to genes from other clusters (Figure~\ref{fig:geneNoMark}). <>= ### definition of heatmap3 function heatmap.3 <- function(x, Rowv = TRUE, Colv = if (symm) "Rowv" else TRUE, distfun = dist, hclustfun = hclust, dendrogram = c("both","row", "column", "none"), symm = FALSE, scale = c("none","row", "column"), na.rm = TRUE, revC = identical(Colv,"Rowv"), add.expr, breaks, symbreaks = max(x < 0, na.rm = TRUE) || scale != "none", col = "heat.colors", colsep, rowsep, sepcolor = "white", sepwidth = c(0.05, 0.05), cellnote, notecex = 1, notecol = "cyan", na.color = par("bg"), trace = c("none", "column","row", "both"), tracecol = "cyan", hline = median(breaks), vline = median(breaks), linecol = tracecol, margins = c(5,5), ColSideColors, RowSideColors, side.height.fraction=0.3, cexRow = 0.2 + 1/log10(nr), cexCol = 0.2 + 1/log10(nc), labRow = NULL, labCol = NULL, key = TRUE, keysize = 1.5, density.info = c("none", "histogram", "density"), denscol = tracecol, symkey = max(x < 0, na.rm = TRUE) || symbreaks, densadj = 0.25, main = NULL, xlab = NULL, ylab = NULL, lmat = NULL, lhei = NULL, lwid = NULL, NumColSideColors = 1, NumRowSideColors = 1, KeyValueName="Value",...){ invalid <- function (x) { if (missing(x) || is.null(x) || length(x) == 0) return(TRUE) if (is.list(x)) return(all(sapply(x, invalid))) else if (is.vector(x)) return(all(is.na(x))) else return(FALSE) } x <- as.matrix(x) scale01 <- function(x, low = min(x), high = max(x)) { x <- (x - low)/(high - low) x } retval <- list() scale <- if (symm && missing(scale)) "none" else match.arg(scale) dendrogram <- match.arg(dendrogram) trace <- match.arg(trace) density.info <- match.arg(density.info) if (length(col) == 1 && is.character(col)) col <- get(col, mode = "function") if (!missing(breaks) && (scale != "none")) warning("Using scale=\"row\" or scale=\"column\" when breaks are", "specified can produce unpredictable results.", "Please consider using only one or the other.") if (is.null(Rowv) || is.na(Rowv)) Rowv <- FALSE if (is.null(Colv) || is.na(Colv)) Colv <- FALSE else if (Colv == "Rowv" && !isTRUE(Rowv)) Colv <- FALSE if (length(di <- dim(x)) != 2 || !is.numeric(x)) stop("`x' must be a numeric matrix") nr <- di[1] nc <- di[2] if (nr <= 1 || nc <= 1) stop("`x' must have at least 2 rows and 2 columns") if (!is.numeric(margins) || length(margins) != 2) stop("`margins' must be a numeric vector of length 2") if (missing(cellnote)) cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x)) if (!inherits(Rowv, "dendrogram")) { if (((!isTRUE(Rowv)) || (is.null(Rowv))) && (dendrogram %in% c("both", "row"))) { if (is.logical(Colv) && (Colv)) dendrogram <- "column" else dedrogram <- "none" warning("Discrepancy: Rowv is FALSE, while dendrogram is `", dendrogram, "'. Omitting row dendogram.") } } if (!inherits(Colv, "dendrogram")) { if (((!isTRUE(Colv)) || (is.null(Colv))) && (dendrogram %in% c("both", "column"))) { if (is.logical(Rowv) && (Rowv)) dendrogram <- "row" else dendrogram <- "none" warning("Discrepancy: Colv is FALSE, while dendrogram is `", dendrogram, "'. Omitting column dendogram.") } } if (inherits(Rowv, "dendrogram")) { ddr <- Rowv rowInd <- order.dendrogram(ddr) } else if (is.integer(Rowv)) { hcr <- hclustfun(distfun(x)) ddr <- as.dendrogram(hcr) ddr <- reorder(ddr, Rowv) rowInd <- order.dendrogram(ddr) if (nr != length(rowInd)) stop("row dendrogram ordering gave index of wrong length") } else if (isTRUE(Rowv)) { Rowv <- rowMeans(x, na.rm = na.rm) hcr <- hclustfun(distfun(x)) ddr <- as.dendrogram(hcr) ddr <- reorder(ddr, Rowv) rowInd <- order.dendrogram(ddr) if (nr != length(rowInd)) stop("row dendrogram ordering gave index of wrong length") } else { rowInd <- nr:1 } if (inherits(Colv, "dendrogram")) { ddc <- Colv colInd <- order.dendrogram(ddc) } else if (identical(Colv, "Rowv")) { if (nr != nc) stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") if (exists("ddr")) { ddc <- ddr colInd <- order.dendrogram(ddc) } else colInd <- rowInd } else if (is.integer(Colv)) { hcc <- hclustfun(distfun(if (symm) x else t(x))) ddc <- as.dendrogram(hcc) ddc <- reorder(ddc, Colv) colInd <- order.dendrogram(ddc) if (nc != length(colInd)) stop("column dendrogram ordering gave index of wrong length") } else if (isTRUE(Colv)) { Colv <- colMeans(x, na.rm = na.rm) hcc <- hclustfun(distfun(if (symm) x else t(x))) ddc <- as.dendrogram(hcc) ddc <- reorder(ddc, Colv) colInd <- order.dendrogram(ddc) if (nc != length(colInd)) stop("column dendrogram ordering gave index of wrong length") } else { colInd <- 1:nc } retval$rowInd <- rowInd retval$colInd <- colInd retval$call <- match.call() x <- x[rowInd, colInd] x.unscaled <- x cellnote <- cellnote[rowInd, colInd] if (is.null(labRow)) labRow <- if (is.null(rownames(x))) (1:nr)[rowInd] else rownames(x) else labRow <- labRow[rowInd] if (is.null(labCol)) labCol <- if (is.null(colnames(x))) (1:nc)[colInd] else colnames(x) else labCol <- labCol[colInd] if (scale == "row") { retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm) x <- sweep(x, 1, rm) retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm) x <- sweep(x, 1, sx, "/") } else if (scale == "column") { retval$colMeans <- rm <- colMeans(x, na.rm = na.rm) x <- sweep(x, 2, rm) retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm) x <- sweep(x, 2, sx, "/") } if (missing(breaks) || is.null(breaks) || length(breaks) < 1) { if (missing(col) || is.function(col)) breaks <- 16 else breaks <- length(col) + 1 } if (length(breaks) == 1) { if (!symbreaks) breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm), length = breaks) else { extreme <- max(abs(x), na.rm = TRUE) breaks <- seq(-extreme, extreme, length = breaks) } } nbr <- length(breaks) ncol <- length(breaks) - 1 if (class(col) == "function") col <- col(ncol) min.breaks <- min(breaks) max.breaks <- max(breaks) x[x < min.breaks] <- min.breaks x[x > max.breaks] <- max.breaks if (missing(lhei) || is.null(lhei)) lhei <- c(keysize, 4) if (missing(lwid) || is.null(lwid)) lwid <- c(keysize, 4) if (missing(lmat) || is.null(lmat)) { lmat <- rbind(4:3, 2:1) if (!( missing(ColSideColors) | is.null(ColSideColors) )) { #if (!is.matrix(ColSideColors)) #stop("'ColSideColors' must be a matrix") if (!is.character(ColSideColors) || nrow(ColSideColors) != nc) stop("'ColSideColors' must be a matrix of nrow(x) rows") lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 1) #lhei <- c(lhei[1], 0.2, lhei[2]) lhei=c(lhei[1], side.height.fraction*NumColSideColors, lhei[2]) } if (!missing(RowSideColors)) { #if (!is.matrix(RowSideColors)) #stop("'RowSideColors' must be a matrix") if (!is.character(RowSideColors) || ncol(RowSideColors) != nr) stop("'RowSideColors' must be a matrix of ncol(x) columns") lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 1), 1), lmat[,2] + 1) #lwid <- c(lwid[1], 0.2, lwid[2]) lwid <- c(lwid[1], side.height.fraction*NumRowSideColors, lwid[2]) } lmat[is.na(lmat)] <- 0 } if (length(lhei) != nrow(lmat)) stop("lhei must have length = nrow(lmat) = ", nrow(lmat)) if (length(lwid) != ncol(lmat)) stop("lwid must have length = ncol(lmat) =", ncol(lmat)) op <- par(no.readonly = TRUE) on.exit(par(op)) layout(lmat, widths = lwid, heights = lhei, respect = FALSE) if (!missing(RowSideColors)) { if (!is.matrix(RowSideColors)){ par(mar = c(margins[1], 0, 0, 0.5)) image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE ) } else { par(mar = c(margins[1], 0, 0, 0.5)) rsc = t(RowSideColors[,rowInd, drop=FALSE]) rsc.colors = matrix() rsc.names = names(table(rsc)) rsc.i = 1 for (rsc.name in rsc.names) { rsc.colors[rsc.i] = rsc.name rsc[rsc == rsc.name] = rsc.i rsc.i = rsc.i + 1 } rsc = matrix(as.numeric(rsc), nrow = dim(rsc)[1]) image(t(rsc), col = as.vector(rsc.colors), axes = FALSE) if (length(colnames(RowSideColors)) > 0) { axis(1, 0:(dim(rsc)[2] - 1)/(dim(rsc)[2] - 1), colnames(RowSideColors), las = 2, tick = FALSE) } } } if (!( missing(ColSideColors) | is.null(ColSideColors))) { if (!is.matrix(ColSideColors)){ par(mar = c(0.5, 0, 0, margins[2])) image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE, cex.lab=1.5) } else { par(mar = c(0.5, 0, 0, margins[2])) csc = ColSideColors[colInd, , drop=FALSE] csc.colors = matrix() csc.names = names(table(csc)) csc.i = 1 for (csc.name in csc.names) { csc.colors[csc.i] = csc.name csc[csc == csc.name] = csc.i csc.i = csc.i + 1 } csc = matrix(as.numeric(csc), nrow = dim(csc)[1]) image(csc, col = as.vector(csc.colors), axes = FALSE) if (length(colnames(ColSideColors)) > 0) { axis(2, 0:(dim(csc)[2] - 1)/max(1,(dim(csc)[2] - 1)), colnames(ColSideColors), las = 2, tick = FALSE, cex=1.5, cex.axis=1.5) } } } par(mar = c(margins[1], 0, 0, margins[2])) x <- t(x) cellnote <- t(cellnote) if (revC) { iy <- nr:1 if (exists("ddr")) ddr <- rev(ddr) x <- x[, iy] cellnote <- cellnote[, iy] } else iy <- 1:nr image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col, breaks = breaks, ...) retval$carpet <- x if (exists("ddr")) retval$rowDendrogram <- ddr if (exists("ddc")) retval$colDendrogram <- ddc retval$breaks <- breaks retval$col <- col if (!invalid(na.color) & any(is.na(x))) { # load library(gplots) mmat <- ifelse(is.na(x), 1, NA) image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "", col = na.color, add = TRUE) } axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, cex.axis = cexCol) if (!is.null(xlab)) mtext(xlab, side = 1, line = margins[1] - 1, cex=1.2, padj=-.1) axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, cex.axis = cexRow) if (!is.null(ylab)) mtext(ylab, side = 4, line = margins[2] - 1.2, cex=1.2, adj=-.1) if (!missing(add.expr)) eval(substitute(add.expr)) if (!missing(colsep)) for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0, length(csep)), xright = csep + 0.5 + sepwidth[1], ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1, col = sepcolor, border = sepcolor) if (!missing(rowsep)) for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) + 1 - rsep) - 0.5, xright = nrow(x) + 1, ytop = (ncol(x) + 1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1, col = sepcolor, border = sepcolor) min.scale <- min(breaks) max.scale <- max(breaks) x.scaled <- scale01(t(x), min.scale, max.scale) if (trace %in% c("both", "column")) { retval$vline <- vline vline.vals <- scale01(vline, min.scale, max.scale) for (i in colInd) { if (!is.null(vline)) { abline(v = i - 0.5 + vline.vals, col = linecol, lty = 2) } xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5 xv <- c(xv[1], xv) yv <- 1:length(xv) - 0.5 lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") } } if (trace %in% c("both", "row")) { retval$hline <- hline hline.vals <- scale01(hline, min.scale, max.scale) for (i in rowInd) { if (!is.null(hline)) { abline(h = i + hline, col = linecol, lty = 2) } yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5 yv <- rev(c(yv[1], yv)) xv <- length(yv):1 - 0.5 lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") } } if (!missing(cellnote)) text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote), col = notecol, cex = notecex) par(mar = c(margins[1], 0, 0, 0)) if (dendrogram %in% c("both", "row")) { plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") } else plot.new() par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2])) if (dendrogram %in% c("both", "column")) { plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") } else plot.new() if (!is.null(main)) title(main, cex.main = 1.5 * op[["cex.main"]]) if (key) { par(mar = c(5, 4, 2, 1), cex = 0.75) tmpbreaks <- breaks if (symkey) { max.raw <- max(abs(c(x, breaks)), na.rm = TRUE) min.raw <- -max.raw tmpbreaks[1] <- -max(abs(x), na.rm = TRUE) tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE) } else { min.raw <- min(x, na.rm = TRUE) max.raw <- max(x, na.rm = TRUE) } z <- seq(min.raw, max.raw, length = length(col)) image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks, xaxt = "n", yaxt = "n") par(usr = c(0, 1, 0, 1)) lv <- pretty(breaks) xv <- scale01(as.numeric(lv), min.raw, max.raw) axis(3, at = xv, labels = lv, cex.axis=1.5, padj=.4) if (scale == "row") mtext(side = 1, "Row Z-Score", line = 2) else if (scale == "column") mtext(side = 1, "Column Z-Score", line = 2) else mtext(side = 1, KeyValueName, line = 2, cex=1.2) if (density.info == "density") { dens <- density(x, adjust = densadj, na.rm = TRUE) omit <- dens$x < min(breaks) | dens$x > max(breaks) dens$x <- dens$x[-omit] dens$y <- dens$y[-omit] dens$x <- scale01(dens$x, min.raw, max.raw) lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol, lwd = 1) axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y)) title("Color Key\nand Density Plot") par(cex = 0.5) mtext(side = 2, "Density", line = 2) } else if (density.info == "histogram") { h <- hist(x, plot = FALSE, breaks = breaks) hx <- scale01(breaks, min.raw, max.raw) hy <- c(h$counts, h$counts[length(h$counts)]) lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s", col = denscol) axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy)) title("Color Key\nand Histogram") par(cex = 0.5) mtext(side = 2, "Count", line = 2) } else title("") } else plot.new() retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)], high = retval$breaks[-1], color = retval$col) invisible(retval) } @ <>= library(clue) library(cluster) data("nomarkerCellsClustering") data("scLVM_output") data("corMatsNoMarker") #source( file.path( # system.file(package="Single.mTec.Transcriptomes"), # "extfunction", "heatmap3.R" ) ) nClusters <- 12 colClusters <- brewer.pal(9, "Set1") colClusters[9] <- colClusters[3] colClusters[3] <- "black" colClusters[10] <- "darkgray" colClusters[11] <- "#000078" colClusters[12] <- "#AA0078" nonAssignedCluster <- 10 specialSort <- function(cons=nomarkerCellsClustering[["consensus"]], sendLast=10){ rt <- sort( cl_class_ids(cons) ) wLast <- rt == sendLast c( rt[!wLast], rt[wLast]) } specialOrder <- function(cons=nomarkerCellsClustering[["consensus"]], sendLast=10){ or <- order(cl_class_ids(cons)) rt <- sort( cl_class_ids(cons) ) wLast <- rt == sendLast c( or[!wLast], or[wLast]) } geneGeneCorHeatmap <- function( cons=allCellsClustering[["consensus"]], corMat=corMatSp, expressionMatrix=Ycorr, selectGenes=intersect(deGenes, aireDependent), colorClusters=""){ mat <- expressionMatrix[rownames(expressionMatrix) %in% selectGenes,] or <- rownames(mat)[specialOrder( cl_class_ids(cons), nonAssignedCluster)] orderRows <- specialSort( cl_class_ids(cons), nonAssignedCluster) corMat <- corMat[rownames(corMat) %in% rownames(mat), colnames(corMat) %in% rownames(mat)] rowCols <- matrix( colClusters[orderRows], nrow=1 ) br <- seq(-1, 1, length.out=101) ** 3 cols <- colorRampPalette( brewer.pal(9, "RdBu"), interpolate="spline", space="Lab")(100) heatmap.3( corMat[or,or], symm=TRUE, Colv=FALSE, Rowv=FALSE, dendrogram="none", trace="none", breaks=br, col=cols, RowSideColors=rowCols, ColSideColors=NULL, labCol=rep("", nrow(corMat) ), labRow=rep("", nrow(corMat) ), margins = c(4,4), KeyValueName="Spearman\nCorrelation", keysize=1.5, xlab="Aire dependent genes", ylab="\t\tAire dependent genes", NumColSideColors=1.8, NumRowSideColors=1.2) } par(xpd=TRUE) geneGeneCorHeatmap(cons=nomarkerCellsClustering[["consensus"]], corMat=corMatSpNoMarker, expressionMatrix=Ycorr[,colData(dxd)$SurfaceMarker == "None"], selectGenes=intersect( deGenesNone, aireDependent ), colorClusters=colClusters) freqs <- rle( specialSort( cl_class_ids(nomarkerCellsClustering[["consensus"]]), nonAssignedCluster) )$lengths freqs <- c(0, freqs) freqs <- cumsum( freqs ) / max(cumsum(freqs)) freqs <- 1-freqs freqs <- sapply( seq_len(length(freqs)-1), function(x){ (freqs[x] + freqs[x+1])/2 }) freqs <- ( freqs-.125 ) / 1.085 text(LETTERS[seq_len(nClusters)], x=.12, y=(freqs), cex=1.5) length(intersect( deGenesNone, aireDependent ) ) @ \begin{figure} \centering \includegraphics[width=.6\textwidth]{figure/Figure_2A_geneGeneCor_NoMarkCells-1} \caption{ \textbf{Gene-gene correlation matrix. Rows and columns are ordered based on the k-medoids clustering results.} Corresponds to Figure 2A.} \label{fig:geneNoMark} \end{figure} When plotting the expression matrix in a heatmap representation, we can see that most of the gene clusters are particularly pronounced in a small fraction of the cells (Plot~\ref{fig:exprNoMark}). <>= colTspanRNA <- "#b2df8a" geneExprHeatmap <- function(cons=allCellsClustering[["consensus"]], corMat=corMatSp, expressionMatrix=Ycorr, selectGenes=intersect(deGenes, aireDependent), colorClusters="", ylab="\t\tAire dependent genes"){ mat <- expressionMatrix[rownames(expressionMatrix) %in% selectGenes,] or <- rownames(mat)[specialOrder( cl_class_ids(cons), nonAssignedCluster)] orderRows <- specialSort( cl_class_ids(cons), nonAssignedCluster) cols2 <- colorRampPalette( brewer.pal(9, name="Blues"), interpolate="spline", space="Lab")(100) br2 <- seq(0.01, 5, length.out=101) mat <- expressionMatrix[rownames(expressionMatrix) %in% selectGenes,] rowCols <- matrix( colClusters[orderRows], nrow=1 ) colCols1 <- matrix( ifelse( colData(dxd)[colnames(mat),]$SurfaceMarker == "Tspan8", "#c51b7d", "white"), ncol=1) colCols2 <- matrix( ifelse( colData(dxd)[colnames(mat),]$SurfaceMarker == "Ceacam1", "#6a3d9a", "white"), ncol=1) colCols= cbind(colCols1, colCols2) colnames(colCols) <- c("Tspan8 +", "Ceacam1 +") if(all(colCols == "white")){ colCols=NULL ranks <- rank( counts(dxd, normalized=TRUE)[ names( geneNames[geneNames %in% "Tspan8"] ), colnames(mat)], ties.method="min") cols <- colorRampPalette(c("white", colTspanRNA))(length(unique(ranks))) names(cols) <- as.character( sort(unique( ranks ) )) colCols <- matrix( cols[as.character(ranks)], ncol=1) mat <- mat[,order( ranks )] colCols <- colCols[order(ranks),, drop=FALSE] } heatmap.3( mat[or,], symm=FALSE, Colv=FALSE, Rowv=FALSE, dendrogram="none", trace="none", breaks=br2, col=cols2, RowSideColors=rowCols, ColSideColors=colCols, labCol=rep("", nrow(mat) ), labRow=rep("", nrow(mat) ), margins=c(4,4), keysize=1.45, NumColSideColors=1.2, NumRowSideColors=1.2, KeyValueName="Expression level\n(log10)", xlab="Cells ordered by Tspan8 expression", ylab=ylab) } genesForClustering <- intersect( deGenesNone, aireDependent ) genesForClustering <- rownames(Ycorr)[rownames(Ycorr) %in% genesForClustering ] geneClusters <- split(genesForClustering, cl_class_ids( nomarkerCellsClustering[["consensus"]] )) geneClusters <- c( geneClusters[-nonAssignedCluster], geneClusters[nonAssignedCluster]) grep(names( geneNames[geneNames %in% "Tspan8"] ),geneClusters) par(xpd=TRUE) geneExprHeatmap(cons=nomarkerCellsClustering[["consensus"]], corMat=corMatSpNoMarker, expressionMatrix=Ycorr[,colData(dxd)$SurfaceMarker == "None"], selectGenes=intersect(deGenesNone, aireDependent) ) legend( x=.3, y=1.01, legend="Tspan8 mRNA detected", fill=colTspanRNA, cex=1.3, bty="n") freqs <- rle( specialSort( cl_class_ids(nomarkerCellsClustering[["consensus"]]) ) )$lengths freqs <- c(0, freqs) freqs <- cumsum( freqs ) / max(cumsum(freqs)) freqs <- 1-freqs freqs <- sapply( seq_len(length(freqs)-1), function(x){ (freqs[x] + freqs[x+1])/2 }) freqs <- ( freqs-.135 ) / 1.155 text(LETTERS[seq_len(nClusters)], x=.12, y=(freqs), cex=1.5) @ \begin{figure} \centering \includegraphics[width=.6\textwidth]{figure/Figure_2B_geneExpr_NoMarkCells-1} \caption{ \textbf{Gene expression heatmap. The rows are ordered based on clustering results and the columns are ordered by the \emph{Tspan8} expression levels.} Corresponds to Figure 2B.} \label{fig:exprNoMark} \end{figure} \subsubsection{Correlations in \emph{Samson et al.} dataset} As an additional check, we processed the \emph{Sansom et al.} single-cell dataset in the same manner as our data. We estimated a gene-gene correlation matrix using the \emph{Samson et al.} dataset. Then, for each gene group defined using our data, we test if we see high correlations within the gene groups compared to the genes outside the gene group groups on the \emph{Samson et al.} dataset. <>= data("corMatsSansom") data("deGenesSansom") geneClustersDESansom <- lapply( geneClusters, function(x){ intersect( x, rownames(corMatSp) ) }) matToUse <- corMatSp densities <- lapply( seq_along(geneClusters), function(x){ inClust <- rownames(matToUse) %in% geneClustersDESansom[[x]] as.numeric(matToUse[inClust,inClust][upper.tri(matToUse[inClust,inClust])]) }) backgroundDensity <- sample( unlist(geneClustersDESansom), 500 ) inBack <- rownames(matToUse) %in% backgroundDensity backgroundDensity <- as.numeric(matToUse[inBack,inBack][upper.tri(matToUse[inBack,inBack])]) densitiesComp <- list() for(i in seq_along(densities)){ densities[[i]] <- densities[[i]][!is.na(densities[[i]])] densitiesComp[[i]] <- backgroundDensity } t.test(unlist(densities[1:11]), backgroundDensity) pvals <- sapply( seq_along(geneClusters), function(i){ t.test( densities[[i]], densitiesComp[[i]], alternative="greater")$p.value }) significant <- which( p.adjust( pvals, method="BH") < 0.05 ) LETTERS[significant] significant significant <- 1:12 library(geneplotter) dfDensities <- do.call(rbind, lapply(significant, function(i){ dfDensities <- data.frame( correlations=c( densities[[i]], densitiesComp[[i]] ), between=rep( c("within cluster", "background"), time=c(length(densities[[i]]), length(densitiesComp[[i]]) ) )) dfDensities <- dfDensities[!is.nan(dfDensities$correlations),] dfDensities$cluster <- LETTERS[i] dfDensities }) ) ggplot( dfDensities, aes(correlations, colour=between)) + stat_ecdf(lwd=1.2) + facet_wrap(~cluster) + coord_cartesian(xlim = c(0, .49), ylim=c(.5, 1.01)) @ \begin{figure} \centering \includegraphics[width=\textwidth]{figure/Figure_R3_sansomcor-1} \caption{ \textbf{Correlation distribution from the \emph{Sansom et al.} data within our 11 gene clusters.} We estimate the gene-gene correlation coefficients from the \emph{Sansom et al.} data.} \label{fig:sansomCors} \end{figure} \section{Data analysis for TRA-selected cells} In order to validate the presence of co-regulated groups of genes based on single mTECs, we perform a second analytical approach inspired on the analysis by~\cite{pinto2013}. \subsection{\emph{Tspan8}} We select \emph{Tspan8}, that is one of the genes belonging to the Group ``B'' of the k-medoids clustering. <>= expressionMat <- as.matrix(Ycorr[,colData(dxd)$SurfaceMarker=="None"]) whichCluster <- grep( names( geneNames[geneNames %in% "Tspan8"] ), geneClusters ) cat(sprintf("Tspan8 belonged to cluster %s\n", whichCluster)) maxCorTspan8 <- which.max( sapply(seq_len(nClusters), function(i){ cor( expressionMat[names( geneNames[geneNames %in% "Tspan8"] ),], colMeans( expressionMat[geneClusters[[i]],]) , method="spearman") }) ) maxCorTspan8 cat(sprintf("Tspan8 displayed the highest correlation with Cluster %s\n", maxCorTspan8)) @ Next, we screen the cells for which the expression of Tspan8 is detected. <>= expressingTspan8 <- counts(dxd)[names(geneNames[geneNames %in% "Tspan8"]), colData(dxd)$SurfaceMarker == "None"] > 0 sum( expressingTspan8 ) table( expressingTspan8 )[["TRUE"]]/sum(table(expressingTspan8)) expressingCeacam1 <- counts(dxd)[names(geneNames[geneNames %in% "Ceacam1"]), colData(dxd)$SurfaceMarker == "None"] > 0 sum( expressingCeacam1 ) table( expressingCeacam1 )[["TRUE"]]/sum(table(expressingCeacam1)) @ Then, for each of the highly variable genes, we test for instances that are up-regulated in the cells detecting expression of \emph{Tspan8} with respect to the cells where the Tspan8 expression is not detected (co-expression). We do this using a Wilcoxon test. As expected, the set of \emph{Tspan8} co-expressed genes is highly enriched for genes from Cluster ``B'' when compared to the rest of the clusters. <>= dxdUnselected <- dxd[,colData( dxd )$SurfaceMarker == "None"] rowwilcoxtests <- function(x, fac){ sp <- split( seq_len(ncol(x)), fac ) true <- sp[["TRUE"]] false <- sp[["FALSE"]] rs <- ( mclapply( seq_len(nrow(x)), function(i){ wt <- wilcox.test( x[i,true], x[i,false] ) c(meanA=mean(x[i,true]), meanB=mean(x[i,false]), pval=wt$p.value) }, mc.cores=numCores) ) as.data.frame( do.call(rbind, rs) ) } deGenesNoneCorrected <- deGenesNone[deGenesNone %in% rownames(Ycorr)] getCoexpressionGroup <- function(gene){ cellGroups <- counts(dxdUnselected, normalized=TRUE)[gene,] > 0 res <- rowwilcoxtests( x= as.matrix(Ycorr[deGenesNoneCorrected, colData(dxd)$SurfaceMarker=="None"] ), fac=cellGroups ) indFilter <- apply( counts(dxdUnselected, normalized=TRUE)[deGenesNoneCorrected,], 1, function(x){ mean(x[x != 0]) }) > 150 & rowSums(counts(dxdUnselected)[deGenesNoneCorrected,] > 0) > 5 res$pval[!indFilter] <- NA coexpressionGroup <- deGenesNoneCorrected[which(p.adjust( res$pval, method="BH" ) < 0.1 & res$meanA - res$meanB > 0)] return(coexpressionGroup) } gene <- names( geneNames[geneNames %in% "Tspan8"] ) names(gene) <- "Tspan8" tspan8CoexpressionGroup <- getCoexpressionGroup(gene) cat( sprintf("Number of differentially expressed genes at a FDR of .05: %s\n", length(tspan8CoexpressionGroup)-1) ) coexpressedAndAireDependent <- tspan8CoexpressionGroup[tspan8CoexpressionGroup %in% aireDependent] mat <- rbind( table( geneClusters[[whichCluster]] %in% coexpressedAndAireDependent ), table( unlist(geneClusters[!seq_len(nClusters) %in% whichCluster] ) %in% coexpressedAndAireDependent ))[,2:1] fisher.test(mat) dfPrint <- data.frame( `ensembl_gene_name`=tspan8CoexpressionGroup, `gene_name`=geneNames[tspan8CoexpressionGroup], `aire_dependent`=as.numeric(tspan8CoexpressionGroup %in% aireDependent), `cluster_2`=as.numeric( tspan8CoexpressionGroup %in% geneClusters[[whichCluster]]) ) write.table(dfPrint, quote=FALSE, sep="\t", row.names=FALSE, file="figure/tspan8CoexpressionGroup.txt") @ <>= par(mar=c(4, 6, 1, 1)) barplot( mat[,1]/rowSums(mat), names.arg=c("Cluster B", "All others"), las=1, col="black", cex.axis=1.2, cex.lab=1.3, cex=1.3, ylab="Fraction of genes") @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_Supp3_tspan8enrichment-1} \caption{ \textbf{Enrichment of Tspan8 co-expressed gene set in cluster B.} Corresponds to S3.} \label{fig:tspan8enrichment} \end{figure} Next, we used FACS to select mTECs that were expressing Tspan8 in the cell surface. We evaluate the consistency of the fold changes between the \emph{Tspan8} unselected positive cells and the Tspan8 FACS positive cells, both with respect to the \emph{Tspan8} unselected negative cells. We observe that the fold changes are highly consistent. <>= getFoldChangesForViolin <- function(gene, coexpressedGenes){ cellGroups <- counts(dxdUnselected, normalized=TRUE)[gene,] > 0 cellGroups <- split( names(cellGroups), cellGroups) outGroup <- rowMeans( Ycorr[deGenesNoneCorrected, cellGroups[["FALSE"]]] ) inGroupSelected <- rowMeans( Ycorr[deGenesNoneCorrected, colData(dxd)$SurfaceMarker==names(gene)] ) names(outGroup) <- deGenesNoneCorrected names(inGroupSelected) <- deGenesNoneCorrected geneIndexes <- deGenesNoneCorrected %in% coexpressedGenes l2fc <- (inGroupSelected - outGroup)/log10(2) toRet <- list() toRet[["coexpressed"]] <- l2fc[geneIndexes] toRet[["background"]] <- l2fc[!geneIndexes] toRet } foldChangesTRApos <- list() foldChangesTRApos[["Tspan8"]] <- getFoldChangesForViolin( gene, tspan8CoexpressionGroup) getFoldChangesForScatter <- function(gene, coexpressedGenes){ cellGroups <- counts(dxdUnselected, normalized=TRUE)[gene,] > 0 cellGroups <- split( names(cellGroups), cellGroups) outGroup <- rowMeans( Ycorr[deGenesNoneCorrected, cellGroups[["FALSE"]]] ) inGroup <- rowMeans( Ycorr[deGenesNoneCorrected, cellGroups[["TRUE"]]] ) inGroupSelected <- rowMeans( Ycorr[deGenesNoneCorrected, colData(dxd)$SurfaceMarker==names(gene)] ) names(outGroup) <- deGenesNoneCorrected names(inGroupSelected) <- deGenesNoneCorrected names(inGroup) <- deGenesNoneCorrected toRet <- list() toRet[["preenriched"]] <- (inGroupSelected - outGroup)/log10(2) toRet[["unselected"]] <- (inGroup - outGroup)/log10(2) toRet } foldChangesAll <- list() foldChangesAll[["Tspan8"]] <- getFoldChangesForScatter(gene, tspan8CoexpressionGroup) havePosFoldChange <- table( foldChangesTRApos[["Tspan8"]][["coexpressed"]] > 0 ) cat(sprintf("%s percent (%s out of %s) of the up-regulated genes based on the unselected mTECs have a consistent fold change in the Tspan8+ cells\n", round( havePosFoldChange["TRUE"] / sum( havePosFoldChange ), 2 ) * 100, havePosFoldChange["TRUE"], sum(havePosFoldChange))) @ We can observe the validation of the \emph{Tspan8} co-expression group in a heatmap representation of expression Z-scores, defined as, \begin{equation} Z_{ij}=\frac{X_{ij} - \operatornamewithlimits{Median}_{j}{(X_{ij})}}{\sqrt{\operatornamewithlimits{Var}_{j}{(X_{ij})}}} \end{equation} where the genes are indexed by $i$ and the cells are indexed by $j$. $X_{ij}$ represents the expression levels after regressing the cell cycle variation. <>= library(matrixStats) colTspanPos <- "#33a02c" makeHeatmapForGene <- function(gene, coexpressionGroup, colPos, colRNA, enrichMethod, legendLabs){ cellsToUse <- colData(dxd)$SurfaceMarker %in% c("None", names(gene)) goodOrder <- order( counts(dxd, normalized=TRUE)[gene,cellsToUse] ) colPca <- ifelse( colData(dxd)[cellsToUse,"SurfaceMarker"] == names(gene), colPos, "white") nonZero <- counts(dxd, normalized=TRUE)[gene,cellsToUse] > 0 colPca2 <- rep("white", sum(cellsToUse)) colPca2[nonZero] <- colRNA cols <- colorRampPalette( brewer.pal(11, name="RdBu"), interpolate="spline", space="Lab")(100) expr <- as.matrix(Ycorr[coexpressionGroup,cellsToUse] / log10(2)) rowCols <- ifelse( rownames(expr) %in% aireDependent, "black", "white" ) br <- seq( min(expr), 4, length.out=101 ) rowCols <- matrix(rowCols, nrow=1) colCols <- matrix(colPca[goodOrder], ncol=1) colCols <- as.matrix(cbind(colPca, colPca2))[goodOrder,] colnames(colCols) <- NULL heatmapMatrix <- (expr[,goodOrder] - rowMedians(expr[,goodOrder])) / rowSds(expr[,goodOrder]) br <- seq(-4, 4, length.out=101) par(xpd=TRUE) heatmap.3( heatmapMatrix, trace="none", ColSideColors=colCols, Colv=FALSE, col=cols, dendrogram="none", labCol=rep("", ncol(expr)), labRow=rep("", nrow(expr) ), RowSideColor=rowCols, margins = c(4,4), KeyValueName="Expression\n(Z-score)", keysize=1.9, NumColSideColors=2, NumRowSideColors=1.2, breaks=br, xlab=sprintf("Cells ordered by\n%s expression", names(gene)), ylab=sprintf("Genes co-expressed with \n%s in single mature mTECs", names(gene)) ) legend( x=.15, y=1.01, legend=legendLabs, fill=c(colRNA, colPos), cex=1.1, bty="n") legend( x=-.3, y=.6, legend="Aire\ndependent\ngenes", fill="black", cex=1.2, bty="n") } makeHeatmapForGene(gene, tspan8CoexpressionGroup, colTspanPos, colTspanRNA, legendLabs=c("Tspan8 mRNA detected", expression(Tspan8^"pos"~(FACS)))) @ \begin{figure} \centering \includegraphics[width=.6\textwidth]{figure/Figure_3B_tspan8Heatmap-1} \caption{ \textbf{Heatmap of the expression of co-expressed genes with Tspan8.} Corresponds to Figure 3B.} \label{fig:tspan8Heatmap} \end{figure} \subsection{Ceacam1} Next, we perform the same analysis as with \emph{Tspan8}, but this time we select an Aire-independent TRA. <>= nonZeros=!counts(dxd)[names( geneNames[geneNames %in% "Ceacam1"] ), colData(dxd)$SurfaceMarker == "None"] == 0 nonZeros <- colData(dxd)$SurfaceMarker == "None" & counts(dxd)[names( geneNames[geneNames %in% "Ceacam1"]),] != 0 cor.test( as.numeric(Ycorr[names(geneNames[geneNames %in% "Ceacam1"]),nonZeros][1,]), colMeans( Ycorr[geneClusters[[whichCluster]],nonZeros]), method="spearman" ) @ We do the testing as we did with \emph{Tspan8} previously, but now for \emph{Ceacam1}. We observe a signifcant overlap with the \emph{Tspan8} co-expressed gene set. <>= gene <- names( geneNames[geneNames %in% "Ceacam1"] ) names(gene) <- "Ceacam1" ceacam1CoexpressionGroup <- getCoexpressionGroup(gene) table( ceacam1CoexpressionGroup %in% aireDependent ) cat( sprintf("Number of differentially expressed genes at a FDR of .1: %s\n", length(ceacam1CoexpressionGroup) -1) ) cat(sprintf("The number of genes overlapping between co-expression groups is %s\n", table( ceacam1CoexpressionGroup %in% tspan8CoexpressionGroup )["TRUE"])) percentageOverlap <- table( ceacam1CoexpressionGroup %in% tspan8CoexpressionGroup )[["TRUE"]] / length( ceacam1CoexpressionGroup ) cat(sprintf("In percentage of Ceacam1 genes %s\n", round( percentageOverlap * 100, 2))) mat <- rbind( table(ceacam1CoexpressionGroup %in% tspan8CoexpressionGroup), table( deGenesNone[!deGenesNone %in% ceacam1CoexpressionGroup] %in% tspan8CoexpressionGroup ) )[,2:1] rownames( mat ) <- c("coexpressed", "not coexpressed") colnames( mat ) <- c("overlaps with Tspan8", "does not overlap") fisher.test( mat ) mat[1,] /sum(mat[1,]) table( ceacam1CoexpressionGroup %in% tspan8CoexpressionGroup ) table( ceacam1CoexpressionGroup %in% tspan8CoexpressionGroup )[["FALSE"]] / length( ceacam1CoexpressionGroup ) table( tspan8CoexpressionGroup %in% ceacam1CoexpressionGroup ) table( tspan8CoexpressionGroup %in% ceacam1CoexpressionGroup )[["FALSE"]] / length(tspan8CoexpressionGroup) length( union( tspan8CoexpressionGroup, ceacam1CoexpressionGroup ) ) ceacam1Aire <- ceacam1CoexpressionGroup[ceacam1CoexpressionGroup %in% aireDependent] mat <- rbind( table( geneClusters[[whichCluster]] %in% ceacam1Aire ), table( unlist(geneClusters[!seq_len(nClusters) %in% whichCluster] ) %in% ceacam1Aire ))[,2:1] fisher.test(mat, alternative="greater") dfPrint <- data.frame( `ensembl_gene_name`=ceacam1CoexpressionGroup, `gene_name`=geneNames[ceacam1CoexpressionGroup], `aire_dependent`=as.numeric(ceacam1CoexpressionGroup %in% aireDependent), `cluster_2`=as.numeric( ceacam1CoexpressionGroup %in% geneClusters[[whichCluster]]) ) write.table(dfPrint, quote=FALSE, sep="\t", row.names=FALSE, file="figure/ceacam1CoexpressionGroup.txt") @ We also observe a very good agreement of co-expression between the ad hoc \emph{Ceacam1} positive cells and the FACS Ceacam1 positive cells with respect to the unselected Ceacam1 negative cells. Reflecting the validation of our second co-expression group and confirming that the co-expression phenomena is not restricted to Aire-dependent genes. <>= foldChangesTRApos[["Ceacam1"]] <- getFoldChangesForViolin( gene, ceacam1CoexpressionGroup) foldChangesAll[["Ceacam1"]] <- getFoldChangesForScatter(gene, ceacam1CoexpressionGroup) havePosFoldChange <- table( foldChangesTRApos[["Ceacam1"]][["coexpressed"]] > 0 ) cat(sprintf("%s percent (%s out of %s) of the up-regulated genes based on the unselected mTECs have a consistent fold change in the Ceacam1+ cells\n", round( havePosFoldChange["TRUE"] / (sum(havePosFoldChange)), 2 ) * 100, havePosFoldChange["TRUE"], (sum(havePosFoldChange)))) @ In a similar manner, we plot the heatmap as for the \emph{Tspan8} co-expression group, but using the respective \emph{Ceacam1} FACS data (Plot~\ref{fig:ceacam1Heatmap}). <>= colCeacamPos <- "#c51b7d" colCeacamRNA <- "#edbad8" makeHeatmapForGene(gene, ceacam1CoexpressionGroup, colCeacamPos, colCeacamRNA, legendLabs=c("Ceacam1 mRNA detected", expression(Ceacam1^"pos"~(FACS)))) @ \begin{figure} \centering \includegraphics[width=.6\textwidth]{figure/Figure_3C_ceacam1Heatmap-1} \caption{ \textbf{Heatmap of expression of genes detected as being co-expressed with Ceacam1.} Corresponds to Figure 3D} \label{fig:ceacam1Heatmap} \end{figure} We perform the same validation, now with the TRA Klk5. Plot~\ref{fig:klk5Heatmap}. \subsection{Klk5} <>= gene <- names( geneNames[geneNames %in% "Klk5"] ) names(gene) <- "Klk5" cellGroups <- counts(dxdUnselected, normalized=TRUE)[gene,] > 0 #dxdUnselected2 <- estimateSizeFactors(dxdUnselected[deGenesNone,]) cat( sprintf("The number of unselected mTECs detecting the expression of Klk5 is %s\n", table(cellGroups)["TRUE"] )) klk5CoexpressionGroup <- getCoexpressionGroup(gene) wCluster <- grep( names( geneNames[geneNames %in% "Klk5"] ), geneClusters) wCluster matKlk <- rbind( table( geneClusters[[wCluster]] %in% klk5CoexpressionGroup ), table( unlist(geneClusters[-wCluster]) %in% klk5CoexpressionGroup ) )[,2:1] matKlk fisher.test( matKlk ) foldChangesTRApos[["Klk5"]] <- getFoldChangesForViolin( gene, klk5CoexpressionGroup) foldChangesAll[["Klk5"]] <- getFoldChangesForScatter(gene, klk5CoexpressionGroup) havePosFoldChange <- table( foldChangesTRApos[["Klk5"]][["coexpressed"]] > 0 ) table( klk5CoexpressionGroup %in% aireDependent ) cat(sprintf("%s percent (%s out of %s) of the up-regulated genes based on the unselected mTECs have a consistent fold change in the Klk5+ cells\n", round( havePosFoldChange["TRUE"] / (sum(havePosFoldChange)), 2 ) * 100, havePosFoldChange["TRUE"], (sum(havePosFoldChange)))) colKlkRNA <- "#cab2d6" colKlkPos <- "#6a3d9a" makeHeatmapForGene(gene, klk5CoexpressionGroup, colKlkPos, colKlkRNA, legendLabs=c("Klk5 mRNA detected", expression(Klk5^"pos"~(qPCR)))) dfPrint <- data.frame( `ensembl_gene_name`=klk5CoexpressionGroup, `gene_name`=geneNames[klk5CoexpressionGroup], `aire_dependent`=as.numeric(klk5CoexpressionGroup %in% aireDependent), `cluster_4`=as.numeric( klk5CoexpressionGroup %in% geneClusters[[whichCluster]]) ) write.table(dfPrint, quote=FALSE, sep="\t", row.names=FALSE, file="figure/klk5CoexpressionGroup.txt") @ \begin{figure} \centering \includegraphics[width=.6\textwidth]{figure/Figure_3D_klk5Heatmap-1} \caption{ \textbf{Heatmap of expression of genes detected as being co-expressed with Klk5.} Corresponds to 3F} \label{fig:klk5Heatmap} \end{figure} <>= par(mar=c(4, 6, 1, 1)) barplot( mat[,1]/rowSums(mat), names.arg=c("Cluster D", "All others"), las=1, col="black", cex.axis=1.2, cex.lab=1.3, cex=1.3, ylab="Fraction of genes") @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_Supp5_klk4-1} \caption{ \textbf{Enrichment of Klk5 co-expressed gene set of genes in cluster D.} Corresponds to S5} \label{fig:fisher3} \end{figure} Distribution of fold changes significantly skewed towards positive values for the three validation experiments, confirming the co-expression phenomena (~\ref{fig:violinVal1, fig:violinVal2, fig:violinVal3}). <>= dfValidations <- data.frame( marker= rep( names(foldChangesTRApos), sapply(foldChangesTRApos, function(x) length(unlist(x)))), gene = unlist( lapply(foldChangesTRApos, function(x){ rep(names(x), listLen(x)) }) ), foldChange=unlist(foldChangesTRApos) ) dfValidations$gene <- relevel(factor(dfValidations$gene), 2) dfValidations$marker <- relevel(factor(dfValidations$marker), 3) levels( dfValidations$gene ) <- c("Co-expressed", "All others") #print( ggplot( dfValidations, aes(x=gene, y=foldChange, fill=gene)) + # geom_violin() + # geom_boxplot(width=0.15, outlier.shape=NA, notch=.2) + # facet_grid(~marker) + scale_y_continuous(limits = c(-3, 5)) + # geom_abline(intercept = 0, slope = 0, col="#ff00ff60", lwd=1.3) + # theme(legend.position="top", legend.title=element_blank(), # strip.text.x = element_text(size = 12, face="bold"), # axis.ticks.x = element_blank(), axis.text.x = element_blank(), # axis.ticks.y = element_line(colour="black"), # legend.text=element_text(size=13), # axis.line = element_blank(), # legend.key = element_blank(), # axis.text.y = element_text(size=12, colour="black"), # axis.title=element_text(size=14), # panel.background = element_rect(fill='white', colour='white'), # panel.border = element_rect(colour = "black", fill=NA), # axis.line = element_line(colour = "black")) + #ylab("Log fold change (base 2) between #TRA enriched cells (FACS or qPCR) #and TRA negative cells (unselected)") + xlab("") + # scale_fill_manual( values = c("#fc8d62", "#b3b3b3") ) ) plotViolin <- function(geneName="Tspan8", ylab="Tspan8 expression (log2 fold)\nflow cytometry TRA+ vs\nunselected TRA-"){ dfOp <- dfValidations[dfValidations$marker == geneName,] print( ggplot( dfOp, aes(x=gene, y=foldChange, fill=gene)) + geom_violin() + geom_boxplot(width=0.15, outlier.shape=NA, notch=.2) + scale_y_continuous(limits = c(-3, 5)) + geom_abline(intercept = 0, slope = 0, col="#ff00ff60", lwd=1.3) + theme(legend.position="none", legend.title=element_blank(), strip.text.x = element_text(size = 12, face="bold"), axis.ticks.x = element_line(colour="black"), axis.text.x = element_text(size=14, colour="black", angle=25, hjust=1), axis.ticks.y = element_line(colour="black"), legend.text=element_text(size=13), legend.key = element_blank(), axis.text.y = element_text(size=13, colour="black"), axis.title=element_text(size=14), panel.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_rect(colour = "white", fill=NA), axis.line = element_line(colour = "black")) + ylab(ylab) + xlab("") + scale_fill_manual( values = c("#fc8d62", "#b3b3b3") ) ) } @ <>= plotViolin() @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_3A1_violinPlotValidation-1} \caption{ \textbf{Violin plot depicting distribution of fold changes.} Corresponds to Figure 3A. } \label{fig:violinVal1} \end{figure} <>= plotViolin(geneName="Ceacam1", ylab="Ceacam1 expression (log2 fold)\nflow cytometry TRA+ vs\nunselected TRA-") @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_3A2_violinPlotValidation-1} \caption{ \textbf{Violin plot depicting distribution of fold changes.} Corresponds to Figure 3C. } \label{fig:violinVal2} \end{figure} <>= plotViolin(geneName="Klk5", ylab="Klk5 expression (log2 fold)\nqPCR TRA+ vs\nunselected TRA-") @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_3A3_violinPlotValidation-1} \caption{ \textbf{Violin plot depicting distribution of fold changes.} Corresponds to Figure 3E. } \label{fig:violinVal3} \end{figure} The p-values confirming the validation experiments <>= sapply(names(foldChangesTRApos), function(x){ t.test( foldChangesTRApos[[x]][["coexpressed"]], foldChangesTRApos[["background"]]) }) @ Scatterplot of fold changes (Plot~\ref{fig:violin2p}). <>= coexpressionGroupList <- list( tspan8=tspan8CoexpressionGroup, ceacam1=ceacam1CoexpressionGroup, klk5=klk5CoexpressionGroup ) save(coexpressionGroupList, file="../data/coexpressionGroupList.RData") foldChangesDf <- lapply( foldChangesAll, function(x){ as.data.frame(do.call(cbind, x)) } ) names(foldChangesDf) <- names(foldChangesAll) for(x in seq_along( foldChangesDf )){ foldChangesDf[[x]]$marker <- names(foldChangesDf)[x] } stopifnot( all( names(coexpressionGroupList) == tolower(names( foldChangesDf)))) for(x in seq_along( foldChangesDf )){ foldChangesDf[[x]]$marker <- names(foldChangesDf)[x] foldChangesDf[[x]]$coexpressed <- rownames(foldChangesDf[[x]]) %in% coexpressionGroupList[[x]] } foldChangesDf <-do.call(rbind, foldChangesDf) foldChangesDf$marker <- relevel(factor( foldChangesDf$marker ), 3) foldChangesDf$coexpressed <- factor( foldChangesDf$coexpressed ) levels( foldChangesDf$coexpressed ) <- c("rest of the genes", "co-expressed") #png("prueba.png", res=300, width=9, height=9, units="in") print( ggplot( foldChangesDf, aes(x=preenriched, y=unselected)) + geom_point(shape=19, size=1, colour="#00000070") + # stat_density2d(geom="tile", aes(fill = ..density..), contour = FALSE) + facet_grid(coexpressed~marker) + theme(legend.position="none", legend.title=element_blank(), strip.text = element_text(size = 14, face="bold"), axis.ticks.x = element_line(colour="black"), axis.text.x = element_text(size=16, colour="black"), axis.ticks.y = element_line(colour="black"), legend.text=element_blank(), # legend.key = element_blank(), axis.text.y = element_text(size=16, colour="black"), axis.title=element_text(size=16),# panel.background = element_blank(), panel.background = element_rect(fill='white', colour='black'), panel.border = element_rect(colour = "black", fill=NA), axis.line = element_line(colour = "black")) + ylim(c(-5, 5)) + xlim(c(-4.9,4.9)) + xlab("Log fold change (base 2) between TRA enriched cells (TRA or qPCR) and TRA negative cells (unselected)") + ylab("Log fold change (base 2) between TRA positive cells (unselected) and TRA negative cells (unselected)") + geom_abline(intercept=0, slope=0, col="red") + geom_vline(xintercept=0, col="red") + coord_fixed() ) #dev.off() @ \begin{figure} \centering \includegraphics[width=.6\textwidth]{figure/Figure_Supp4_violinPlotValidation-1} \caption{ \textbf{Violin plot depicting distribution of fold changes.} Corresponds to Figure S4. } \label{fig:violin2p} \end{figure} <>= back <- table(isTRA) lapply(coexpressionGroupList, function(x){ fore <- table( x %in% rownames(dxd)[isTRA] ) fisher.test( rbind( fore, back - fore)[,2:1]) }) @ \subsection{PCA} In order to explore the hypothesis of sliding windows of co-regulated groups of genes~\cite{pinto2013}, we perform a Principal Component Analysis on the gene expression levels of both the \emph{Ceacam1} and the \emph{Tspan8} co-expression groups. <>= tspan8 <- names( geneNames[geneNames %in% "Tspan8"] ) ceacam1 <- names( geneNames[geneNames %in% "Ceacam1"] ) genesForPca <- union( tspan8CoexpressionGroup, ceacam1CoexpressionGroup ) cellsForPca <- names( which( colSums( counts( dxd, normalized=TRUE )[c(tspan8, ceacam1),] ) >= 0 ) ) length( cellsForPca ) dataForPca <- Ycorr[rownames(Ycorr) %in% genesForPca,cellsForPca] pr <- prcomp(t(dataForPca)) colUnselected <- "#b1592850" colors <- c(colUnselected, colCeacamPos, colTspanPos, colKlkPos) names(colors) <- c("None", "Ceacam1", "Tspan8", "Klk5") colsForPca <- colors[as.character(colData(dxd)[rownames(pr$x),"SurfaceMarker"])] spCellNames <- split( colnames(dataForPca), colData(dxd)[cellsForPca,"SurfaceMarker"]) par(mar=c(5, 4.8, 7, 1), xpd=FALSE) plot( pr$x[spCellNames[["None"]],"PC1"], pr$x[spCellNames[["None"]],"PC2"], col=colors["None"], pch=19, cex=1.2, ylim=c(-10, 10), asp=1, cex.axis=1.45, cex.lab=1.5, xlab="Principal component 1", ylab="Principal component 2") points( pr$x[spCellNames[["Tspan8"]],"PC1"], pr$x[spCellNames[["Tspan8"]],"PC2"], col=colors["Tspan8"], pch=19, cex=1.2) points( pr$x[spCellNames[["Ceacam1"]],"PC1"], pr$x[spCellNames[["Ceacam1"]],"PC2"], col=colors["Ceacam1"], pch=19, cex=1.2) points( pr$x[spCellNames[["Klk5"]],"PC1"], pr$x[spCellNames[["Klk5"]],"PC2"], col=colors["Klk5"], pch=19, cex=1.2) abline(v=10, lwd=3, col="#1a1a1a", lty="dashed") legend( x=-10, y=28, legend=c(expression(Tspan8^"pos"~cells~(FACS)), expression(Ceacam1^"pos"~cells~(FACS)), expression(Klk5^"pos"~cells~(qPCR)), "Unselected cells"), fill=c(colTspanPos, colCeacamPos, colKlkPos, colUnselected), xpd=TRUE, cex=1.3, horiz=FALSE, bty="n") @ \begin{figure} \centering \includegraphics[width=.6\textwidth]{figure/Figure_4A_PCA-1} \caption{ \textbf{Principal component analysis of cells using the union of the \emph{Tspan8} and the \emph{Ceacam1} co-expression groups.} Corresponds to Figure 4A.} \label{fig:pca} \end{figure} We observe that the Principal Component 1 is correlated with the transcript levels of \emph{Tspan8} and \emph{Ceacam1}, meaning that most of the variation is explained by the co-expression phenomena. <>= cor( as.vector(as.matrix(dataForPca[tspan8,cellsForPca])), pr$x[,"PC1"], method="spearman") cor( as.vector(as.matrix(dataForPca[ceacam1,cellsForPca])), pr$x[,"PC1"], method="spearman") klk5 <- names( geneNames[geneNames %in% "Klk5"] ) @ However, the Tspan8$^{pos}$ cells tend to be more separated from the unselected cells with respect to the Ceacam1$^{pos}$. For example, considering a theshold of 10 on PC1, this is the fraction of the cells of each group that were above this threshold: <>= more10InPC1 <- sapply( split( pr$x[,"PC1"] > 10, as.character(colData(dxd)[rownames(pr$x),"SurfaceMarker"] )), table) if(is.na(more10InPC1[["Klk5"]]["TRUE"])){ more10InPC1[["Klk5"]]["TRUE"] <- 0 } more10InPC1 <- do.call(cbind, more10InPC1) round( more10InPC1["TRUE",] / colSums(more10InPC1), 2) @ <>= plotPCAHeatmap <- function(whichCells=colData(dxd)$SurfaceMarker!="None", showMarkers=FALSE){ pr <- prcomp(t(dataForPca)) expr <- asinh( counts(dxd, normalized=TRUE) ) expr <- ( expr - rowMedians(expr) )/ rowSds(expr) expr <- expr[rownames(dataForPca),whichCells] colors["None"] <- "white" colors["Tspan8"] <- colTspanRNA colors["Ceacam1"] <- colCeacamRNA colMiddle <- "#fdbf6f" colRows <- ifelse( rownames(expr) %in% tspan8CoexpressionGroup, colors["Tspan8"], "black" ) colRows <- ifelse( rownames(expr) %in% ceacam1CoexpressionGroup, colors["Ceacam1"], colRows ) colRows <- ifelse( rownames(expr) %in% intersect( tspan8CoexpressionGroup, ceacam1CoexpressionGroup ), colMiddle, colRows) colRows <- matrix(colRows, nrow=1) colors["None"] <- "white" colors["Tspan8"] <- colTspanPos colors["Ceacam1"] <- colCeacamPos br <- seq(-4, 4, length.out=101) cols <- colorRampPalette( brewer.pal(9, "RdBu"), interpolate="spline", space="Lab")(100) colCols1 <- colors[as.character(colData(dxd)[colnames(expr),"SurfaceMarker"])] colCols2 <- rep("white", ncol(expr)) nonZeros <- counts(dxd)[ names( geneNames[geneNames %in% "Tspan8"] ), colnames(expr)] > 0 rankOrder <- order( expr[names( geneNames[geneNames %in% "Tspan8"] ), nonZeros] ) colCols2[nonZeros][rankOrder] <- colorRampPalette( c("white", colors["Tspan8"]))(sum(nonZeros)) colCols3 <- rep("white", ncol(expr)) nonZeros <- counts(dxd)[ names( geneNames[geneNames %in% "Ceacam1"] ), colnames(expr)] > 0 rankOrder <- order( expr[names( geneNames[geneNames %in% "Ceacam1"] ), nonZeros] ) colCols3[nonZeros][rankOrder] <- colorRampPalette( c("white", colors["Ceacam1"]))(sum(nonZeros)) if( showMarkers ){ colCols <- cbind( colCols1, colCols2, colCols3) colnames(colCols) <- c("Surface Marker", "Tspan8 expression", "Ceacam1 expression") }else{ colCols <- cbind( colCols2, colCols3) colnames(colCols) <- c("Tspan8 expression", "Ceacam1 expression") } pr$x <- pr$x[colnames(expr),] par(xpd=TRUE) heatmap.3( expr[rev(order(colRows[1,])),order( pr$x[,"PC1"] )], trace="none", col=cols, dendrogram="none", labCol=rep("", ncol(expr)), labRow=rep("", nrow(expr) ), ColSideColors=colCols[order(pr$x[,"PC1"]),], RowSideColors=colRows[,rev(order( colRows[1,]) ), drop=FALSE], Rowv=FALSE, Colv=FALSE, NumColSideColors=2.5, breaks=br, margins = c(4,4), KeyValueName="Expression\n(Z-scores)", keysize=1.7, NumRowSideColors=1.1, xlab="Cells ordered by\nprincipal component 1", ylab="Co-expressed genes in mature mTECs\n") if( showMarkers ){ legend( x=.45, y=1.1, legend=c(expression(Tspan8^"pos"~(FACS)), expression(Ceacam1^"pos"~(FACS)), expression(Klk5^"pos"~(qPCR))), fill=c(colTspanPos, colCeacamPos, colKlkPos), cex=1.2, bty="n") } legend( x=-.1, y=.4, title="co-expressed\nwith:", legend=c("Tspan8", "Ceacam1", "both"), fill=c(colTspanRNA, colCeacamRNA, colMiddle), cex=1.2, bty="n") } @ Moreover, when focusing only on the cells that were selected for the protein expression of Tspan8 or Ceacam1, the \emph{Tspan8} transcript levels show a strong correlation with both \emph{Ceacam1} and \emph{Tspan8} co-expression groups. <>= expr <- asinh( counts(dxd, normalized=TRUE) ) expr <- ( expr - rowMedians(expr) )/ rowSds(expr) expr <- expr[rownames(dataForPca), !colData(dxd)$SurfaceMarker %in% c( "None", "Klk5")] factorForSplit <- rep("", nrow(expr) ) factorForSplit[rownames(expr) %in% tspan8CoexpressionGroup] <- "Tspan8" factorForSplit[rownames(expr) %in% ceacam1CoexpressionGroup] <- "Ceacam1" factorForSplit[rownames(expr) %in% intersect( tspan8CoexpressionGroup, ceacam1CoexpressionGroup)] <- "Both" meanExpressionPerGroup <- lapply( split( as.data.frame(expr), factorForSplit), function(x){ colMeans(x) }) sapply( c(`ceacam1`=ceacam1, tspan8=`tspan8`), function(y){ sapply( meanExpressionPerGroup, function(x){ nonZeros <- rep(TRUE, ncol(expr)) # nonZeros <- expr[y,] != 0 cor( expr[y,nonZeros], x[nonZeros], method="spearman") }) }) sapply( c( `ceacam1`=ceacam1, `tspan8`=tspan8), function(y){ nonZeros <- rep(TRUE, ncol(expr)) # nonZeros <- expr[y,] != 0 cor( colMeans(expr)[nonZeros], expr[y,nonZeros], method="spearman") }) @ Now, we calculate the same correlations only in the Ceacam1$^{pos}$ cells, where we observe a positive correlation between the co-expressed genes and the transcript levels of \emph{Tspan8}. <>= expr <- asinh( counts(dxd, normalized=TRUE) ) expr <- ( expr - rowMedians(expr) )/ rowSds(expr) expr <- expr[rownames(dataForPca),colData(dxd)$SurfaceMarker=="Ceacam1"] factorForSplit <- rep("", nrow(expr) ) factorForSplit[rownames(expr) %in% tspan8CoexpressionGroup] <- "Tspan8" factorForSplit[rownames(expr) %in% ceacam1CoexpressionGroup] <- "Ceacam1" factorForSplit[rownames(expr) %in% intersect( ceacam1CoexpressionGroup, tspan8CoexpressionGroup)] <- "Both" meanExpressionPerGroup <- lapply( split( as.data.frame(expr), factorForSplit), function(x){ colMeans(x) }) sapply( c(`ceacam1`=ceacam1, tspan8=`tspan8`), function(y){ sapply( meanExpressionPerGroup, function(x){ nonZeros <- rep(TRUE, ncol(expr)) # nonZeros <- expr[y,] != 0 cor( expr[y,nonZeros], x[nonZeros], method="spearman") }) }) sapply( c( `ceacam1`=ceacam1, `tspan8`=tspan8), function(y){ nonZeros <- rep(TRUE, ncol(expr)) # nonZeros <- expr[tspan8,] != 0 cor( colMeans(expr)[nonZeros], expr[y,nonZeros], method="spearman") }) @ We plot the Ceacam1$^{pos}$ cells (Plot~\ref{fig:coexpressionheatmapceacam1}). Where we see an increase of \emph{Tspan8} levels in the Ceacam1$^{pos}$ cells that is accompanied with an increase in the expression of the co-expressed genes and increasing similarity to the Tspan8$^{pos}$ cells. <>= plotPCAHeatmap(colData(dxd)$SurfaceMarker=="Ceacam1") @ \begin{figure} \centering \includegraphics[width=.6\textwidth]{figure/Figure_4B_coexpressionHeatmapCeacam1-1} \caption{ \textbf{Heatmap of co-expressed genes only from the Ceacam1$^{pos}$ cells.} Corresponds to Figure 4B} \label{fig:coexpressionheatmapceacam1} \end{figure} <>= data("gseaReactome") deGenesTspan8Coding <- deGenesTspan8[deGenesTspan8 %in% proteinCoding] deGenesCeacam1Coding <- deGenesTspan8[deGenesCeacam1 %in% proteinCoding] testTspan8 <- testForReactome( foreground=deGenesTspan8Coding, background=proteinCoding[!proteinCoding %in% deGenesTspan8Coding], processesDF=processesDF, uniprots=uniprots, geneName=geneNames, cores=numCores ) testTspan8 <- testTspan8[which(testTspan8$padj < 0.05),] testTspan8$geneNames testCeacam1 <- testForReactome( foreground=deGenesCeacam1Coding, background=proteinCoding[!proteinCoding %in% deGenesCeacam1Coding], processesDF=processesDF, uniprots=uniprots, geneName=geneNames, cores=numCores ) testCeacam1 <- testCeacam1[which(testCeacam1$padj < 0.05),] testCeacam1$geneNames @ \section{Co-expressed genes tend to be clustered in the genome} As data preparation, we first convert the ensembl gene identifiers to gene names from our 11 groups of co-regulated genes resulting from the k-medoids clustering. <>= geneClusterNames <- sapply( geneClusters, function(x){ geneNames[names( geneNames ) %in% x] }) @ In order to explore the localization of co-expressed genes in the genome. We load the transcript model annotation and estimate the genomic ranges for each gene. <>= #library(GenomicFeatures) #transcriptDb <- loadDb( system.file("extdata", # "Mus_musculus.GRCm38.75.primary_assembly.sqlite", # package="Single.mTec.Transcriptomes") ) #exonsByGene <- exonsBy( transcriptDb, "gene" ) #save(geneRanges, file="../data/geneRanges.RData") #geneRanges <- unlist( range( exonsByGene ) ) #geneRanges <- keepSeqlevels(geneRanges, paste0( c(1:19, "X"))) data("geneRanges") dn <- geneRanges[names(geneRanges) %in% names( which( biotype == "protein_coding" ) ),] @ The code below was used to generate the null models of the expected genomic distance between genes when selecting a random set of genes (of the same size as the group of genes to test) for 1,000 permutations. For computational running times, the code below was run and its output was saved into the \emph{R} object ``permutationResults.RData''. <>= dnAire <- dn[names(dn) %in% aireDependentSansom] dnTested <- dn[names(dn) %in% deGenesNone] permutationsForCluster <- function( groupSize, numPerm=1000){ distancePermuted <- mclapply( seq_len(numPerm), function(x){ sampleGenes <- sample(seq_len(length(dn)), groupSize ) median( distanceToNearest( dn[sampleGenes] )@elementMetadata$distance, na.rm=TRUE) }, mc.cores=10) unlist( distancePermuted ) } numberOfPermutations <- 1000 permsAllClusters <- lapply( seq_len(length(geneClusters)), function(i){ permutationsForCluster( length(geneClusters[[i]]), numberOfPermutations ) }) realAllClusters <- sapply(seq_len(length(geneClusters)), function(i){ median( distanceToNearest( dn[names(dn) %in% geneClusters[[i]],])@elementMetadata$distance, na.rm=TRUE ) }) names( permsAllClusters ) <- LETTERS[seq_len(length(permsAllClusters))] names( realAllClusters ) <- LETTERS[seq_len(length(permsAllClusters))] permsAllClusters <- permsAllClusters[!names(permsAllClusters) %in% "L"] realAllClusters <- realAllClusters[!names(realAllClusters) %in% "L"] save(permsAllClusters, realAllClusters, file="../data/permutationResults.RData") @ Based on this permutations, we can estimate the p-values for each of the 11 groups of genes, testing the hypothesis if the median distance between genes tend to be closer in genomic distance than the expected median distributions (obtained from the null models). <>= names(colClusters) <- LETTERS[1:12] data("permutationResults") numberOfPermutations <- 1000 pvals <- sapply( names(permsAllClusters), function(x){ sum( realAllClusters[x] > permsAllClusters[[x]] ) }) / numberOfPermutations p.adjust(pvals, method="BH") sum( p.adjust( pvals, method="BH") < 0.1 ) adjusted <- p.adjust( pvals, method="BH" ) < 0.1 names(permsAllClusters)[adjusted] @ We create a figure based on the expected value distribution and the observed value for each group of co-regulated genes from Plot~\ref{fig:geneNoMark}. <>= par(mfrow=c(3, 4), mar=c(5, 5, 2, 2)) for( i in names(permsAllClusters) ){ coexpressedCoordinates <- dn[names(dn) %in% geneClusters[[i]]] distancePermuted <- permsAllClusters[[i]] xmin <- min(distancePermuted, realAllClusters[i]) xmax <- max(distancePermuted, realAllClusters[i]) hist( distancePermuted, 15, xlab="Expected genomic distance (Mb)", cex.lab=1.4, #main="", cex.axis=1.8, xaxt="n", xlim=c(xmin, xmax), main=paste("Group", i ), cex.main=1.8) sq <- seq(0, 20000000, 1000000) axis(1, at=sq, label=sq/1000000, cex.axis=1.7) md <- realAllClusters[i] abline( v=md, col=colClusters[i], lwd=4) } @ \begin{figure} \centering \includegraphics[width=.9\textwidth]{figure/Figure_Supp6_clusterPval-1} \caption{ \textbf{expected vs observed distances.} Corresponds to Figure S6} \label{fig:clusterPval} \end{figure} We also plot karyograms for each of the gene group from Figure~\ref{fig:geneNoMark}. <>= names( geneClusters ) <- LETTERS[1:12] library(ggbio) for(x in names(permsAllClusters)){ coexpressedCoordinates <- dn[names(dn) %in% geneClusters[[x]]] kr <- autoplot(coexpressedCoordinates, layout="karyogram", col=colClusters[x], fill=colClusters[x]) + theme( panel.background = element_blank(), strip.text.y = element_text(size = 16), axis.text.x = element_text(size=14, colour="black") ) print(kr) } @ We also plot the karyogram for the Group ``D'', that is used for further examples. The resulting figure is the Plot~\ref{fig:karyogram}. <>= clusterNumber <- LETTERS[wCluster] coexpressedCoordinates <- geneRanges[names(geneRanges) %in% geneClusters[[clusterNumber]]] dn2 <- GRanges("7", IRanges(start=18474583, end=18656725)) dn4 <- GRanges("9", IRanges(start=14860210, end=14903949)) library(ggbio) print( autoplot(coexpressedCoordinates, layout="karyogram", col=colClusters[clusterNumber], fill=colClusters[clusterNumber]) + theme( panel.background = element_blank(), strip.text.y = element_text(size = 16), axis.text.x = element_text(size=14, colour="black") ) ) @ \begin{figure} \centering \includegraphics[width=.4\textwidth]{figure/Figure_5A_karyogram-1} \caption{ \textbf{Karyogram of cluster D.} Corresponds to Figure 5a.} \label{fig:karyogram} \end{figure} For the same Group ``D'', we plot the expected vs observed genomic distance in Plot~\ref{fig:exprDistance} <>= i <- clusterNumber distancePermuted <- permsAllClusters[[i]] xmin <- min(distancePermuted, realAllClusters[i]) xmax <- max(distancePermuted, realAllClusters[i]) par(mar=c(4.2, 4.5, 1, 1)) hist( distancePermuted, 30, xlab="Expected genomic distance (Mb)", cex.lab=1.4, #main="", cex.axis=1.8, xaxt="n", xlim=c(xmin, xmax), main="", cex.main=1.8) sq <- seq(0, 20000000, 1000000) axis(1, at=sq, label=sq/1000000, cex.axis=1.7) md <- realAllClusters[i] abline( v=md, col=colClusters[i], lwd=4) @ \begin{figure} \centering \includegraphics[width=.4\textwidth]{figure/Figure_5B_expectedGenomicDistance-1} \caption{ \textbf{Expected vs observed genomic distance for gene group D.} Corresponds to Figure 5b.} \label{fig:exprDistance} \end{figure} Now, we plot the region of the genomic region of unrelated genes, as an example of neighboring genes different families of genes (Plot~\ref{fig:psg}). <>= library(Gviz) colVector <- rep(colClusters, sapply(geneClusters, length) ) names(colVector) <- unlist(geneClusters) range <- dn2 data("geneNames") length(unlist(geneClusters[1:11])) / length(unlist(geneClusters)) sum( unlist(geneClusters[1:11]) %in% tras )/ sum( unlist(geneClusters) %in% tras ) makePlotForRegion <- function(range, left=100000, right=100000, colClusterNumber=NULL){ start(range) <- start( range ) - left end(range) <- end( range ) + right geneRanges <- geneRanges[names( geneRanges ) %in% proteinCoding] overlap <- findOverlaps( range, geneRanges, ignore.strand=TRUE ) toPlot <- geneRanges[names(geneRanges[subjectHits(overlap)])] toPlot <- keepSeqlevels( toPlot, as.character(1:19)) ch <- unique( as.character(seqnames(toPlot)) ) if( !is.null(colClusterNumber)){ cols <- ifelse( names(toPlot) %in% geneClusters[[colClusterNumber]], colClusters[[colClusterNumber]], "white") }else{ cols <- colVector[names( toPlot )] cols[is.na(cols)] <- "white" } labels <- sprintf( "%s (%s)", geneNames[names(toPlot)], as.character(strand(toPlot))) plotTracks( list( GeneRegionTrack( toPlot, symbol=labels, name=paste("chr", ch)), GenomeAxisTrack()), showId=TRUE, geneSymbol=TRUE, fill=cols, fontsize=20, fontcolor="black", detailsBorder.col="black", col.line="black", col="black", sizes=c(5, 1)) } dn2 <- GRanges("6", IRanges(start=122447296,end=122714633)) makePlotForRegion(dn2, left=1000, right=1500, colClusterNumber=clusterNumber) @ \begin{figure} \centering \includegraphics[width=.5\textwidth]{figure/Figure_Supp10_unrelated-1} \caption{ \textbf{Unrelated genes.} Supplementary 8C.} \label{fig:psg} \end{figure} Example of the \emph{Gmst} gene family (Plot~\ref{fig:psg2}). <>= dn2 <- GRanges("3", IRanges(start=107923453,end=107999678)) makePlotForRegion(dn2, left=1000, right=1500, colClusterNumber=clusterNumber) @ \begin{figure} \centering \includegraphics[width=.5\textwidth]{figure/Figure_Supp9_related-1} \caption{ \textbf{Other related genes.} Supplementary 8B.} \label{fig:psg2} \end{figure} The example of the \emph{Bpfib} gene family (Plot~\ref{fig:psg3}). <>= dn2 <- GRanges("2", IRanges(start=154049564,end=154479003)) makePlotForRegion(dn2, left=100, right=100, colClusterNumber=clusterNumber) @ \begin{figure} \centering \includegraphics[width=.5\textwidth]{figure/Figure_Supp8_related-1} \caption{ \textbf{Other related genes.} Supplementary 8A.} \label{fig:psg3} \end{figure} A high fraction of the genes of the \emph{Klk} loci belong to the same gene group defined in the k-medoids clustering (Plot~\ref{fig:KlkLoci}). <>= dn5 <- GRanges("7", IRanges(start=43690418,end=44229617)) makePlotForRegion(dn5, left=20000, right=3500, colClusterNumber=clusterNumber) @ \begin{figure} \centering \includegraphics[width=.5\textwidth]{figure/Figure_5C_Klkloci-1} \caption{ \textbf{Klk gene loci in chromosome 7.} Figure 5C.} \label{fig:KlkLoci} \end{figure} We plot the expression of the Klk loci in a heatmap representation for both the unselected cells and Klk5 cells. The rows are ordered based on Klk5 expression levels. The columns are ordered according to their position on the genome. The expression of Klk5 seems to be a proxy for the expression of the neighboring genes~\ref{fig:KlkLociExpression} <>= range <- dn5 start(range) <- start( range ) - 20000 end(range) <- end( range ) + 3500 overlap <- findOverlaps( range, geneRanges, ignore.strand=TRUE ) toPlot <- geneRanges[names(geneRanges[subjectHits(overlap)])] toPlot <- toPlot[names( toPlot ) %in% proteinCoding,] coexpressedCoordinates <- geneRanges[names( toPlot ),] coexpressedCoordinates <- sort( coexpressedCoordinates, ignore.strand=TRUE ) klkGenes <- names(coexpressedCoordinates) library(pheatmap) cellOrder <- names( sort(counts(dxd,normalized=TRUE)[gene, colData(dxd)$SurfaceMarker %in% c("Klk5", "None")], decreasing=TRUE) ) annotationGenes <- data.frame( kmeans=1*klkGenes %in% geneClusters[[clusterNumber]], wilcox=1*klkGenes %in% klk5CoexpressionGroup, aireDependent= 1*(klkGenes %in% aireDependent), row.names=klkGenes) klkMat <- asinh( counts(dxd, normalized=TRUE)[klkGenes,cellOrder]) klkMat <- ( klkMat - rowMedians(klkMat) ) / rowSds(klkMat) klkMat <- pmin(klkMat, 3) colsKlk <- colorRampPalette(brewer.pal(9, "Blues"))(50) colsKlk <- colorRampPalette(brewer.pal(9, "RdBu"))(50) annotationCells <- data.frame( droplevels(colData(dxd)[colnames(klkMat),"SurfaceMarker"])) rownames(annotationCells) <- colnames(klkMat) colnames(annotationCells) <- "cell" ann_colors <- list(cell=c( Klk5=colClusters[[clusterNumber]], None="lightgray") ) pheatmap(( t(klkMat) )*1, col=colsKlk, cluster_rows=FALSE, cluster_cols=FALSE, labels_row="", labels_col=geneNames[klkGenes], annotation_row=annotationCells, annotation_colors=ann_colors, breaks = seq(-3, 3, length.out=51), legend=TRUE, fontsize = 12) @ \begin{figure} \centering \includegraphics[width=.5\textwidth]{figure/Figure_5D_KlkExpression-1} \caption{ \textbf{Klk expression.} Figure 5D.} \label{fig:KlkLociExpression} \end{figure} Another representation of the same result, but ploting the fraction of cells that detect the expression of the Klk genes. Stratified both by the qPCR-selected Klk5 positive cells or the Klk5 negative unselected cells (Plot~\ref{fig:KlkLociFraction}). <>= cellGroups <- split( names(cellGroups), cellGroups) klkMat <- klkMat[,!colnames(klkMat) %in% cellGroups[["TRUE"]]] cellSplit <- split(colnames(klkMat), droplevels(colData(dxd)[colnames(klkMat),"SurfaceMarker"])) klkPercentages <- apply( klkMat, 1, function(x){ c(None=sum( x[cellSplit[["None"]]] > 0 ), Klk5=sum( x[cellSplit[["Klk5"]]] > 0 )) }) klkPercentages["None",] <- klkPercentages["None",] / length(cellSplit[["None"]]) klkPercentages["Klk5",] <- klkPercentages["Klk5",] / length(cellSplit[["Klk5"]]) dfKlkFractions <- data.frame( fraction=as.vector(klkPercentages), cell=rep(rownames(klkPercentages), ncol(klkPercentages)), gene=factor( rep(geneNames[colnames(klkPercentages)], each=nrow(klkPercentages)), levels=geneNames[colnames(klkPercentages)])) levels(dfKlkFractions$cell) <- c("Klk5 pos (qPCR)", "Klk5 neg (ad hoc)") print(ggplot( dfKlkFractions, aes( x=gene, y=fraction, fill=cell, stat="bar")) + geom_bar(stat="identity") + facet_grid(cell~.) + ylab("fraction of cells") + xlab("") + theme( axis.text.x=element_text(size=12, color="black", angle=90, hjust = 1, vjust=0 ) ) ) @ \begin{figure} \centering \includegraphics[width=.7\textwidth]{figure/Figure_Supp11_KlkExpression-1} \caption{ \textbf{Klk expression.} Supplementary S9.} \label{fig:KlkLociFraction} \end{figure} <>= dfKlk <- as.data.frame( geneRanges[names( geneNames[grepl("Klk", geneNames )] ),] ) dfKlk <- dfKlk[dfKlk$seqnames == 7,] nms <- sapply( geneClusterNames, function(x){ sum(grepl("Klk", x)) }) names(nms) <- LETTERS[as.numeric(names(nms))] nms @ We print the cluster information into a nicely formatted table. <>= clusterInfo <- data.frame( `ensemblID`=unlist( geneClusters ), `geneNames`=geneNames[unlist( geneClusters )], `clusterNumber`=rep(names(geneClusters), sapply(geneClusters, length) ), `clusterColor`=rep(colClusters, sapply(geneClusters, length) ) ) rownames( clusterInfo ) <- NULL write.table( clusterInfo, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, file="figure/Table_Supp1_CoexpressionGroupsTable.txt" ) @ \section{ATAC-seq data} Next, we use ATAC-seq data to measure chromatin accessibility. Because the number of cells needed for the ATAC-seq data protocol were not possible to obtain from Mouse samples, we use Human samples for two co-expression groups defined by a previous study~\cite{pinto2013}. To reproduce this results, we load the ATAC-seq data for the co-expression groups and load the data objets. We prepared a \Robject{DESeqDataSet} object containing the number of ATAC-seq read fragments aligned to the promoter sequences (4Kb window around TSS) of each protein coding gene. Then, based on the microarray data from~\cite{pinto2013}, we define the set of genes that are co-expressed with \emph{CEACAM5} and with \emph{MUC1} (FDR of 10\% and fold change of larger than 2). We run DESeq2 to get moderated fold changes between the TRA positive fractions with respect to the TRA negative fractions. <>= data("geneNamesHuman") data("biotypesHuman") data(muc1Coexpression) data(cea1Coexpression) data(dxdATAC) dxdCEACAM5 <- dxdATAC[,colData(dxdATAC)$TRA == "CEACAM5"] colData(dxdCEACAM5) <- droplevels(colData(dxdCEACAM5)) dxdCEACAM5 <- estimateSizeFactors(dxdCEACAM5) dxdCEACAM5 <- estimateDispersions(dxdCEACAM5) dxdCEACAM5 <- nbinomWaldTest(dxdCEACAM5) CEACAM5Group <- as.character( cea1Coexpression$SYMBOL[ cea1Coexpression$`adj.P.Val` < 0.1 & cea1Coexpression$logFC > 2] ) CEACAM5Group <- names( geneNames[geneNames %in% CEACAM5Group] ) dxdMUC1 <- dxdATAC[,colData(dxdATAC)$TRA == "MUC1"] colData(dxdMUC1) <- droplevels(colData(dxdMUC1)) dxdMUC1 <- estimateSizeFactors(dxdMUC1) dxdMUC1 <- estimateDispersions(dxdMUC1) dxdMUC1 <- nbinomWaldTest(dxdMUC1) MUC1Group <- as.character( muc1Coexpression$SYMBOL[ muc1Coexpression$`adj.P.Val` < 0.1 & muc1Coexpression$logFC > 2] ) MUC1Group <- names( geneNames[geneNames %in% MUC1Group] ) length(CEACAM5Group) length(MUC1Group) @ We observe significant differences in chromatin accessibility between the groups of co-expressed genes compared to the rest of the genes, as assesed by a t-test. <>= t.test( results(dxdCEACAM5)$log2FoldChange[rownames(dxdCEACAM5) %in% CEACAM5Group], results(dxdCEACAM5)$log2FoldChange, alternative="greater") t.test( results(dxdMUC1)$log2FoldChange[rownames(dxdMUC1) %in% MUC1Group], results(dxdMUC1)$log2FoldChange, alternative="greater") @ We visualize the results in violin plots, showing the differences in chromatin accessibility (Plot~\ref{fig:atac}). <>= dim(results(dxdCEACAM5)) dim(results(dxdMUC1)) df <- data.frame( signal = c( results(dxdCEACAM5)$log2FoldChange, results(dxdMUC1)$log2FoldChange), group = rep(c("CEACAM5", "MUC1"), each=dim(results(dxdMUC1))[1]), genes = c( ifelse( rownames(dxdCEACAM5) %in% CEACAM5Group, "Co-expressed", "All others"), ifelse( rownames(dxdMUC1) %in% MUC1Group, "Co-expressed", "All others") )) df$genes <- relevel( factor(df$genes), 2 ) atacViolin <- function(gene, ylab="", ylim=c(-2, 2)){ dfOp <- df[df$group == gene,] p <- ggplot(dfOp, aes(factor(genes), signal, fill=genes)) p <- p + geom_violin() + geom_boxplot(width=.4, notch=.2, outlier.shape=NA) + # facet_grid(~group) + scale_y_continuous(limits = ylim) + geom_abline(intercept = 0, slope = 0, col="#ff00ff60", lwd=1.3) + theme(legend.position="none", legend.title=element_blank(), strip.text.x = element_text(size = 12, face="bold"), axis.ticks.x = element_blank(), axis.text.x = element_text(size=14, colour="black", angle=25, hjust=1), axis.ticks.y = element_line(colour="black"), legend.text=element_text(size=13), legend.key = element_blank(), axis.text.y = element_text(size=14, colour="black"), axis.title=element_text(size=14), axis.line=element_line(colour="black"), panel.background = element_rect(colour="white", fill="white"), panel.border = element_rect(colour = "white", fill=NA), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + xlab("") + ylab(ylab) + scale_fill_manual( values = c("#af8dc3", "#7fbf7b") ) p } @ <>= #pdf("prueba.pdf", height=3.3, width=2.8) atacViolin("CEACAM5", "Promoter accesibility\n(log2 fold)\nCEACAM5+ vs CEACAM5-") #dev.off() @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_6A1_Atacseq-1} \caption{ \textbf{Chromatin accessibility.} Corresponds to Figure 6a.} \label{fig:atac} \end{figure} <>= atacViolin("MUC1", "Promoter accesibility\n(log2 fold)\nMUC+ vs MUC-", c(-1.1, 1.1)) @ \begin{figure} \centering \includegraphics[width=.3\textwidth]{figure/Figure_6A2_Atacseq-1} \caption{ \textbf{Chromatin accessibility.} Corresponds to Figure 6b.} \label{fig:atac2} \end{figure} \section{Session Information} This is the information from the \emph{R} session used to generate this document: <>= sessionInfo() @ \newpage %\bibliographystyle{plainnat} \bibliography{mTECs} \end{document}