## ----style, echo=FALSE, results='hide', message=FALSE----------------------
library(BiocStyle)
library(knitr)

# Global chunk options: silence errors/messages/warnings, square figures.
opts_chunk$set(
  error = FALSE,
  message = FALSE,
  warning = FALSE,
  fig.asp = 1
)

## ---- echo=FALSE, results='hide'-------------------------------------------
# Remote locations of the data files fetched by this script.
all.urls <- c(
  "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE61533&format=file&file=GSE61533%5FHTSEQ%5Fcount%5Fresults%2Exls%2Egz",
  "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE29087&format=file&file=GSE29087%5FL139%5Fexpression%5Ftab%2Etxt%2Egz",
  "http://www.ebi.ac.uk/teichmann-srv/espresso/static/counttable_es.csv",
  "http://www.nature.com/nbt/journal/v33/n2/extref/nbt.3102-S7.xlsx"
)

# Local file names: taken from the URL, except for the first two entries,
# which are overridden with explicit names.
all.basenames <- basename(all.urls)
all.basenames[1] <- "GSE61533_HTSEQ_count_results.xls.gz"
all.basenames[2] <- "GSE29087_L139_expression_tab.txt.gz"

# Text mode ("w") for names ending in txt/csv, binary mode ("wb") otherwise.
all.modes <- ifelse(grepl("(txt|csv)$", all.basenames), "w", "wb")

# Fetch each file unless a local copy is already present.
for (x in seq_along(all.urls)) {
  if (file.exists(all.basenames[x])) {
    next
  }
  download.file(
    all.urls[x],
    all.basenames[x],
    mode = all.modes[x],
    cacheOK = FALSE
  )
}

## ----discardplot416b, fig.cap="Average counts across all discarded and retained cells in the 416B dataset.
## Each point represents a gene, with spike-in and mitochondrial transcripts in red and blue respectively."----
library(SingleCellExperiment)
sce.full.416b <- readRDS("416B_preQC.rds")

library(scater)

# Per-gene average counts, computed separately for the cells where PassQC
# is FALSE (lost) and TRUE (kept).
suppressWarnings({
  lost <- calcAverage(counts(sce.full.416b)[, !sce.full.416b$PassQC])
  kept <- calcAverage(counts(sce.full.416b)[, sce.full.416b$PassQC])
})

# Log2-ratio of discarded to retained averages, with a pseudo-count of 1;
# show the 20 largest values.
logfc <- log2((lost + 1) / (kept + 1))
head(sort(logfc, decreasing = TRUE), 20)

# Gene subsets to highlight on top of the scatter plot.
is.spike <- isSpike(sce.full.416b)
is.mito <- rowData(sce.full.416b)$is_feature_control_Mt

plot(
  lost, kept,
  xlab = "Average count (discarded)",
  ylab = "Average count (retained)",
  log = "xy", pch = 16
)
points(lost[is.spike], kept[is.spike], col = "red", pch = 16)
points(lost[is.mito], kept[is.mito], col = "dodgerblue", pch = 16)

## ----discardplotpbmc, fig.cap="Average counts across all discarded and retained cells in the PBMC dataset. Each point represents a gene, with platelet-related genes highlighted in orange."----
sce.pbmc <- readRDS("pbmc_data.rds")

# Cells are split on a fixed total-count threshold of 1000.
wrong.keep <- sce.pbmc$total_counts >= 1000

suppressWarnings({
  lost <- calcAverage(counts(sce.pbmc)[, !wrong.keep])
  kept <- calcAverage(counts(sce.pbmc)[, wrong.keep])
})

logfc <- log2((lost + 1) / (kept + 1))
head(sort(logfc, decreasing = TRUE), 20)

platelet <- c("PF4", "PPBP", "SDPR")

plot(
  lost, kept,
  xlab = "Average count (discarded)",
  ylab = "Average count (retained)",
  log = "xy", pch = 16
)
points(lost[platelet], kept[platelet], col = "orange", pch = 16)

## --------------------------------------------------------------------------
# Obtaining the dataset.
library(scRNAseq)
data(allen)

# Setting up the data.
sce.allen <- as(allen, "SingleCellExperiment")
assayNames(sce.allen) <- "counts"
isSpike(sce.allen, "ERCC") <- grep("ERCC", rownames(sce.allen))

# Computing the QC metrics and running PCA.
library(scater)
sce.allen <- calculateQCMetrics(sce.allen)
sce.allen <- runPCA(
  sce.allen,
  use_coldata = TRUE,
  detect_outliers = TRUE
)
table(sce.allen$outlier)

## --------------------------------------------------------------------------
library(SingleCellExperiment)

# Read the count table: row names come from the first column, the first six
# lines of the file are skipped, and the last 96 columns are read as integers.
counts <- read.table(
  "GSE29087_L139_expression_tab.txt.gz",
  colClasses = c(
    list("character", NULL, NULL, NULL, NULL, NULL, NULL),
    rep("integer", 96)
  ),
  skip = 6,
  sep = '\t',
  row.names = 1
)

# Rows whose names contain "SPIKE" are registered as the "spike" set.
is.spike <- grep("SPIKE", rownames(counts))
sce.islam <- SingleCellExperiment(list(counts = as.matrix(counts)))
isSpike(sce.islam, "spike") <- is.spike
dim(sce.islam)

## --------------------------------------------------------------------------
library(scater)
sce.islam <- calculateQCMetrics(sce.islam)

# Column-wise group labels: 48 mESC, then 44 MEF, then 4 Neg.
sce.islam$grouping <- rep(c("mESC", "MEF", "Neg"), c(48, 44, 4))

# Outlier calls on each QC metric, computed within each group.
libsize.drop <- isOutlier(
  sce.islam$total_counts,
  nmads = 3, type = "lower", log = TRUE,
  batch = sce.islam$grouping
)
feature.drop <- isOutlier(
  sce.islam$total_features,
  nmads = 3, type = "lower", log = TRUE,
  batch = sce.islam$grouping
)
spike.drop <- isOutlier(
  sce.islam$pct_counts_spike,
  nmads = 3, type = "higher",
  batch = sce.islam$grouping
)

# Drop any cell flagged by a metric, along with every "Neg" column.
sce.islam <- sce.islam[
  ,
  !(libsize.drop | feature.drop | spike.drop | sce.islam$grouping == "Neg")
]
data.frame(
  ByLibSize = sum(libsize.drop),
  ByFeature = sum(feature.drop),
  BySpike = sum(spike.drop),
  Remaining = ncol(sce.islam)
)

## --------------------------------------------------------------------------
library(scran)
sce.islam <- computeSpikeFactors(sce.islam, general.use = TRUE)

## --------------------------------------------------------------------------
sce.islam <- normalize(sce.islam)

## ----normplotspikemef, fig.cap="Size factors from spike-in normalization, plotted against the size factors from deconvolution for all cells in the mESC/MEF dataset. Axes are shown on a log-scale, and cells are coloured according to their identity.
## Deconvolution size factors were computed with small pool sizes owing to the low number of cells of each type."----
# One colour per group label, looked up by name when plotting.
colours <- c(mESC = "red", MEF = "grey")

deconv.sf <- computeSumFactors(
  sce.islam,
  sf.out = TRUE,
  cluster = sce.islam$grouping
)

plot(
  sizeFactors(sce.islam), deconv.sf,
  col = colours[sce.islam$grouping],
  pch = 16, log = "xy",
  xlab = "Size factor (spike-in)",
  ylab = "Size factor (deconvolution)"
)
legend("bottomleft", col = colours, legend = names(colours), pch = 16)

## --------------------------------------------------------------------------
library(R.utils)
# Decompress next to the archive, keeping the .gz and replacing any old copy.
gunzip(
  "GSE61533_HTSEQ_count_results.xls.gz",
  remove = FALSE,
  overwrite = TRUE
)

library(gdata)
all.counts <- read.xls(
  'GSE61533_HTSEQ_count_results.xls',
  sheet = 1,
  header = TRUE
)

# Use the ID column as row names, then drop the first column and convert
# the remainder to a matrix.
rownames(all.counts) <- all.counts$ID
all.counts <- as.matrix(all.counts[, -1])

## --------------------------------------------------------------------------
sce.hsc <- SingleCellExperiment(list(counts = all.counts))
dim(sce.hsc)

# Rows whose names start with "ERCC" form the "ERCC" spike-in set.
is.spike <- grepl("^ERCC", rownames(sce.hsc))
isSpike(sce.hsc, "ERCC") <- is.spike
summary(is.spike)

## --------------------------------------------------------------------------
sce.hsc <- calculateQCMetrics(sce.hsc)

libsize.drop <- isOutlier(
  sce.hsc$total_counts,
  nmads = 3, type = "lower", log = TRUE
)
feature.drop <- isOutlier(
  sce.hsc$total_features,
  nmads = 3, type = "lower", log = TRUE
)
spike.drop <- isOutlier(
  sce.hsc$pct_counts_ERCC,
  nmads = 3, type = "higher"
)

# Keep only the cells not flagged by any of the three metrics.
sce.hsc <- sce.hsc[, !(libsize.drop | feature.drop | spike.drop)]
data.frame(
  ByLibSize = sum(libsize.drop),
  ByFeature = sum(feature.drop),
  BySpike = sum(spike.drop),
  Remaining = ncol(sce.hsc)
)

## --------------------------------------------------------------------------
# Retain rows for which nexprs() reports a non-zero value.
to.keep <- nexprs(sce.hsc, byrow = TRUE) > 0
sce.hsc <- sce.hsc[to.keep, ]
summary(to.keep)

## ---- warning=FALSE--------------------------------------------------------
sce.hsc <- computeSumFactors(sce.hsc)
summary(sizeFactors(sce.hsc))
sce.hsc <- computeSpikeFactors(sce.hsc, type = "ERCC", general.use = FALSE)
summary(sizeFactors(sce.hsc, "ERCC"))
sce.hsc <- normalize(sce.hsc)

## ----hvgplothsc, fig.cap="Variance of normalized log-expression values for each gene in the HSC dataset, plotted against the mean log-expression. The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."----
# Fit the mean-variance trend and decompose each gene's variance with it.
var.fit <- trendVar(sce.hsc, parametric=TRUE, loess.args=list(span=0.3))
var.out <- decomposeVar(sce.hsc, var.fit)

plot(var.out$mean, var.out$total, pch=16, cex=0.6,
    xlab="Mean log-expression", ylab="Variance of log-expression")
curve(var.fit$trend(x), col="dodgerblue", lwd=2, add=TRUE)

# Overlay the spike-in rows in red.
cur.spike <- isSpike(sce.hsc)
points(var.out$mean[cur.spike], var.out$total[cur.spike], col="red", pch=16)

## --------------------------------------------------------------------------
# Keep genes with FDR <= 0.05; which() drops rows where the FDR is NA.
hvg.out <- var.out[which(var.out$FDR <= 0.05),]
nrow(hvg.out)

## --------------------------------------------------------------------------
# Order by decreasing biological component and save the table to disk.
hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),]
write.table(file="hsc_hvg.tsv", hvg.out, sep="\t", quote=FALSE, col.names=NA)
head(hvg.out)

## ----hvgvioplothsc, fig.cap="Violin plots of normalized log-expression values for the top 10 genes with the largest biological components in the HSC dataset. Each point represents the log-expression value in a single cell."----
fontsize <- theme(axis.text=element_text(size=12), axis.title=element_text(size=16))

# head() returns at most 10 names, so no NA feature names are produced when
# fewer than 10 genes pass the FDR threshold (unlike indexing with [1:10]).
plotExpression(sce.hsc, features=head(rownames(hvg.out), 10)) + fontsize

## --------------------------------------------------------------------------
# Same trend fit, but without using the spike-in transcripts.
var.fit.nospike <- trendVar(sce.hsc, parametric=TRUE,
    use.spikes=FALSE, loess.args=list(span=0.2))
var.out.nospike <- decomposeVar(sce.hsc, var.fit.nospike)

# NOTE(review): the caption below refers to the 416B dataset, but
# var.fit.nospike and var.out.nospike are computed from sce.hsc -- confirm
# which dataset is intended.
## ----hvgplot416b2, fig.cap="Variance of normalized log-expression values for each gene in the 416B dataset, plotted against the mean log-expression.
## The blue line represents the mean-dependent trend fitted to the variances of the endogenous genes (black), with spike-in transcripts shown in red."----
plot(
  var.out.nospike$mean, var.out.nospike$total,
  pch = 16, cex = 0.6,
  xlab = "Mean log-expression",
  ylab = "Variance of log-expression"
)
curve(var.fit.nospike$trend(x), col = "dodgerblue", lwd = 2, add = TRUE)
points(
  var.out.nospike$mean[cur.spike],
  var.out.nospike$total[cur.spike],
  col = "red", pch = 16
)

## ----trendplotblock-416b, fig.cap="Plate-specific variance estimates for all spike-in transcripts in the 416B dataset, plotted against the plate-specific means. Each point represents a spike-in transcript, numbered by the plate from which the values were estimated."----
# Loading the saved object.
sce.416B <- readRDS("416B_data.rds")

# Repeating the trendVar() call.
var.fit <- trendVar(
  sce.416B,
  parametric = TRUE,
  block = sce.416B$Plate,
  loess.args = list(span = 0.3)
)
matplot(var.fit$means, var.fit$vars, col = c("darkorange", "forestgreen"))

## ----sizefacplot-416b, fig.cap="Plate-specific distribution of the size factors for endogenous genes."----
# Work on a copy so the extra column is not added to sce.416B itself.
tmp.416B <- sce.416B
tmp.416B$log_size_factor <- log(sizeFactors(sce.416B))
plotColData(tmp.416B, x = "Plate", y = "log_size_factor")

## --------------------------------------------------------------------------
sce.416B.2 <- normalize(sce.416B, size_factor_grouping = sce.416B$Plate)
comb.out <- multiBlockVar(
  sce.416B.2,
  block = sce.416B.2$Plate,
  trend.args = list(parametric = TRUE, loess.args = list(span = 0.4))
)

## --------------------------------------------------------------------------
head(comb.out[, 1:6])

## ----hvgplotbatch416b, fig.width=10, fig.asp=0.5, fig.cap="Variance of normalized log-expression values for each gene in each plate of the 416B dataset, plotted against the mean log-expression.
## The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."----
# Two side-by-side panels, one per plate. The previous layout is saved and
# restored after the loop so that later base-graphics plots in this script
# are not squeezed into the two-panel layout.
old.par <- par(mfrow=c(1,2))
is.spike <- isSpike(sce.416B.2)
for (plate in levels(sce.416B.2$Plate)) {
    cur.out <- comb.out$per.block[[plate]]
    plot(cur.out$mean, cur.out$total, pch=16, cex=0.6,
        xlab="Mean log-expression", ylab="Variance of log-expression",
        main=plate)
    # The fitted trend function is read from the metadata of each table.
    curve(metadata(cur.out)$trend(x), col="dodgerblue", lwd=2, add=TRUE)
    points(cur.out$mean[is.spike], cur.out$total[is.spike], col="red", pch=16)
}
par(old.par)

## ---- echo=FALSE, results="hide"-------------------------------------------
gc()

## --------------------------------------------------------------------------
# Trend fit with the plate supplied through a design matrix.
lfit <- trendVar(sce.416B, design=model.matrix(~sce.416B$Plate))

## --------------------------------------------------------------------------
# Seed set before correlatePairs() so its output is reproducible.
set.seed(100)
var.cor <- correlatePairs(sce.hsc, subset.row=grep("^H2-", rownames(sce.hsc)))
head(var.cor)

## --------------------------------------------------------------------------
sig.cor <- var.cor$FDR <= 0.05
summary(sig.cor)

## --------------------------------------------------------------------------
correlatePairs(sce.hsc, subset.row=cbind("Fos", "Jun"))

## ----fosjuncorplot, fig.cap="Expression of _Fos_ plotted against the expression of _Jun_ for all cells in the HSC dataset."----
plotExpression(sce.hsc, features="Fos", x="Jun")

## --------------------------------------------------------------------------
set.seed(1000)
npcs <- parallelPCA(sce.416B, assay.type="corrected",
    subset.row=comb.out$bio > 0, value="n")
as.integer(npcs)

## --------------------------------------------------------------------------
library(readxl)
incoming <- as.data.frame(read_excel("nbt.3102-S7.xlsx", sheet=1))
# The first column supplies the row names and is then dropped.
rownames(incoming) <- incoming[,1]
incoming <- incoming[,-1]
incoming <- incoming[,!duplicated(colnames(incoming))] # Remove duplicated genes.
# The table is transposed before being stored as the "logcounts" assay.
sce.th2 <- SingleCellExperiment(list(logcounts = t(incoming)))

## ----phaseplotth2, message=FALSE, fig.cap="Cell cycle phase scores from applying the pair-based classifier on the T~H~2 dataset, where each point represents a cell."----
library(org.Mm.eg.db)

# Map the row names (treated as gene symbols) to Ensembl identifiers.
ensembl <- mapIds(
  org.Mm.eg.db,
  keys = rownames(sce.th2),
  keytype = "SYMBOL",
  column = "ENSEMBL"
)

set.seed(100)
mm.pairs <- readRDS(
  system.file("exdata", "mouse_cycle_markers.rds", package = "scran")
)
assignments <- cyclone(
  sce.th2, mm.pairs,
  gene.names = ensembl,
  assay.type = "logcounts"
)
plot(
  assignments$score$G1, assignments$score$G2M,
  xlab = "G1 score", ylab = "G2/M score",
  pch = 16
)

## --------------------------------------------------------------------------
# Design matrix with the G1 and G2M scores as covariates.
design <- model.matrix(~ G1 + G2M, assignments$score)
fit.block <- trendVar(
  sce.th2,
  design = design,
  parametric = TRUE,
  use.spikes = NA
)
dec.block <- decomposeVar(sce.th2, fit.block)

library(limma)
sce.th2.block <- sce.th2
# The first design column (the intercept) is excluded from the covariates.
assay(sce.th2.block, "corrected") <- removeBatchEffect(
  logcounts(sce.th2),
  covariates = design[, -1]
)

sce.th2.block <- denoisePCA(
  sce.th2.block,
  technical = dec.block,
  assay.type = "corrected"
)
dim(reducedDim(sce.th2.block, "PCA"))

## ----pcaplotth2, fig.width=12, fig.asp=0.5, fig.cap="PCA plots before (left) and after (right) removal of the cell cycle effect in the T~H~2 dataset. Each cell is represented by a point with colour and size determined by the G1 and G2/M scores, respectively."----
# Store the phase scores in both objects so either can be plotted with them.
sce.th2.block$G1score <- assignments$score$G1
sce.th2$G1score <- assignments$score$G1
sce.th2.block$G2Mscore <- assignments$score$G2M
sce.th2$G2Mscore <- assignments$score$G2M

# Without blocking on phase score.
fit <- trendVar(sce.th2, parametric = TRUE, use.spikes = NA)
sce.th2 <- denoisePCA(sce.th2, technical = fit$trend)
out <- plotReducedDim(
  sce.th2,
  use_dimred = "PCA",
  ncomponents = 2,
  colour_by = "G1score",
  size_by = "G2Mscore"
) +
  fontsize +
  ggtitle("Before removal")

# After blocking on the phase score.
out2 <- plotReducedDim(
  sce.th2.block,
  use_dimred = "PCA",
  ncomponents = 2,
  colour_by = "G1score",
  size_by = "G2Mscore"
) +
  fontsize +
  ggtitle("After removal")

# Show the two PCA plots side by side.
multiplot(out, out2, cols = 2)

## ----diffusionth2, fig.cap="A diffusion map for the T~H~2 dataset, where each cell is coloured by its expression of _Gata3_. A larger `sigma` is used compared to the default value to obtain a smoother plot."----
plotDiffusionMap(
  sce.th2.block,
  colour_by = "Gata3",
  run_args = list(use_dimred = "PCA", sigma = 25)
) +
  fontsize

## --------------------------------------------------------------------------
sessionInfo()