## ----style, echo = FALSE, results = 'asis'----------------
BiocStyle::markdown()
options(width=60, max.print=1000)
knitr::opts_chunk$set(
    eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")),
    cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE")), 
    tidy.opts=list(width.cutoff=60), tidy=TRUE)

## ----setup, echo=FALSE, message=FALSE, warning=FALSE, eval=FALSE----
#  suppressPackageStartupMessages({
#      library(systemPipeR)
#  })

## ----create_workflow, message=FALSE, eval=FALSE-----------
#  library(systemPipeR)
#  sal <- SPRproject()
#  sal

## ----load_packages, eval=FALSE, spr=TRUE------------------
#  cat(crayon::blue$bold("To use this workflow, following R packages are expected:\n"))
#  cat(c("'Seurat", "ggplot2", "ggpubr", "patchwork", "dplyr", "tibble", "readr'\n"), sep = "', '")
#  ###pre-end
#  appendStep(sal) <- LineWise(
#      code = {
#          library(systemPipeR)
#          library(Seurat)
#          library(dplyr)
#          library(ggplot2)
#          library(ggpubr)
#          library(patchwork)
#      },
#      step_name = "load_packages"
#  )

## ----load_data, eval=FALSE, spr=TRUE----------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          # unzip the data
#          untar("data/pbmc3k_filtered_gene_bc_matrices.tar.gz", exdir = "data")
#          # load data
#          pbmc.data <- Read10X(data.dir = "data/filtered_gene_bc_matrices/hg19/")
#          # Use dim to see the size of dataset, example data has 2700 cells x 32738 genes
#          dim(pbmc.data)
#      },
#      step_name = "load_data",
#      dependency = "load_packages"
#  )

## ----count_plot, eval=FALSE, spr=TRUE---------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          at_least_one <- apply(pbmc.data, 2, function(x) sum(x>0))
#          count_p1 <- tibble::as_tibble(at_least_one) %>%
#              ggplot() +
#                  geom_histogram(aes(x = value), binwidth = floor(nrow(pbmc.data)/400), fill = "#6b97c2", color = "white") +
#                  theme_pubr(16) +
#                  scale_y_continuous(expand = c(0, 0)) +
#                  scale_x_continuous(expand = c(0, 0)) +
#                  labs(title = "Distribution of detected genes", x = "Genes with at least one tag")
#  
#          count_p2 <- tibble::as_tibble(BiocGenerics::colSums(pbmc.data)) %>%
#              ggplot() +
#                  geom_histogram(aes(x = value), bins = floor(ncol(pbmc.data)/50), fill = "#6b97c2", color = "white") +
#                  theme_pubr(16) +
#                  scale_y_continuous(expand = c(0, 0)) +
#                  scale_x_continuous(expand = c(0, 0)) +
#                  labs(title = "Expression sum per cell", x = "Sum expression")
#  
#          png("results/count_plots.png", 1000, 700)
#          count_p1 + count_p2 +
#              patchwork::plot_annotation(tag_levels = "A")
#          dev.off()
#      },
#      step_name = "count_plot",
#      dependency = "load_data"
#  )

## ----create_seurat, eval=FALSE, spr=TRUE------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          sce <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
#          # calculate mitochondria gene ratio
#          sce[['percent.mt']] <- PercentageFeatureSet(sce,pattern='^MT-')
#      },
#      step_name = "create_seurat",
#      dependency = "load_data"
#  )

## ----qc_seurat, eval=FALSE, spr=TRUE----------------------
#  appendStep(sal) <- LineWise(
#      code = {
#  
#          png("results/qc1.png", 700, 700)
#          VlnPlot(sce, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
#          dev.off()
#  
#          qc_p1 <- FeatureScatter(sce, feature1 = "nCount_RNA", feature2 = "percent.mt")
#          qc_p2 <- FeatureScatter(sce, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
#          png("results/qc2.png", 700, 450)
#          qc_p1 + qc_p2 + patchwork::plot_annotation(tag_levels = "A")
#          dev.off()
#      },
#      step_name = "qc_seurat",
#      dependency = "create_seurat"
#  )

## ----filter_cells, eval=FALSE, spr=TRUE-------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          # Based on the QC plots, please change the settings accordingly
#          sce <- subset(
#              sce, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 &
#                  nCount_RNA < 10000 & percent.mt <= 5
#          )
#      },
#      step_name = "filter_cells",
#      dependency = "create_seurat"
#  )

## ----normalize, eval=FALSE, spr=TRUE----------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          # scale.factor = 10000 is a convenient number for plotting, so the
#          # normalized counts is ranged between 0.xx to 10.
#          sce <- NormalizeData(sce, normalization.method = "LogNormalize", scale.factor = 10000)
#  
#          # compare counts before and after
#          count_p_norm <- tibble::as_tibble(BiocGenerics::colSums(sce$RNA@data)) %>%
#              ggplot() +
#                  geom_histogram(aes(x = value), bins = floor(ncol(pbmc.data)/50), fill = "#6b97c2", color = "white") +
#                  theme_pubr(16) +
#                  scale_y_continuous(expand = c(0, 0)) +
#                  scale_x_continuous(expand = c(0, 0)) +
#                  labs(title = "Total expression after normalization", x = "Sum expression")
#          png("results/normalize_count_compare.png", 1000, 700)
#          count_p2 + count_p_norm + patchwork::plot_annotation(tag_levels = "A")
#          dev.off()
#      },
#      step_name = "normalize",
#      dependency = "filter_cells"
#  )

## ----find_var_genes, eval=FALSE, spr=TRUE-----------------
#  appendStep(sal) <- LineWise(
#      code = {
#          # 2000 is default
#          sce <- FindVariableFeatures(sce, selection.method = "vst", nfeatures = 2000)
#          # top 10 variable genes
#          top10_var <- head(VariableFeatures(sce), 10)
#          # plot the top 2000 variable genes and mark top 10
#          png("results/variable_genes.png", 700, 600)
#          VariableFeaturePlot(sce) %>%
#              LabelPoints(points = top10_var, repel = TRUE) +
#              theme_pubr(16)
#          dev.off()
#      },
#      step_name = "find_var_genes",
#      dependency = "normalize"
#  )

## ----scaling, eval=FALSE, spr=TRUE------------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          sce <- ScaleData(sce, features = rownames(sce))
#      },
#      step_name = "scaling",
#      dependency = "find_var_genes"
#  )

## ----pca, eval=FALSE, spr=TRUE----------------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          # only use the top 2000 genes, first 50 PCs (default)
#          sce <- RunPCA(sce,features = VariableFeatures(object = sce), npcs = 50)
#          # we can use following command to see first 5 genes in each PC
#          print(sce$pca, dims = 1:5, nfeatures = 5)
#      },
#      step_name = "pca",
#      dependency = "scaling"
#  )

## ----pca_plots, eval=FALSE, spr=TRUE----------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          # plot PCA overview
#          png("results/pca_overview.png", 500, 500)
#          DimPlot(sce , reduction = "pca")
#          dev.off()
#          # plot top contributed genes in PC 1 and 2
#          png("results/pca_loadings.png", 700, 550)
#          VizDimLoadings(sce , dims = 1:2, reduction = "pca")
#          dev.off()
#          # we can also use heatmap to show top genes in different PCs
#          png("results/pca_heatmap.png", 700, 700)
#          DimHeatmap(sce,dims = 1:6, cells = ncol(sce)/5, balanced = TRUE, slot = "scale.data")
#          dev.off()
#      },
#      step_name = "pca_plots",
#      dependency = "pca"
#  )

## ----choose_pcs, eval=FALSE, spr=TRUE---------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          # for demo purposes, only a few replicates are used to speed up the calculation
#          # in your real data, use larger number like 100, etc.
#          sce <- JackStraw(sce, num.replicate = 30)
#          sce <- ScoreJackStraw(sce, dims = 1:20)
#          png("results/jackstraw.png", 660, 750)
#          JackStrawPlot(sce, dims = 1:20)
#          dev.off()
#  
#          png("results/elbow.png", 500, 500)
#          ElbowPlot(sce)
#          dev.off()
#      },
#      step_name = "choose_pcs",
#      dependency = "pca"
#  )

## ----find_clusters, eval=FALSE, spr=TRUE------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          sce<- FindNeighbors(sce, dims = 1:10)
#          # resolution 0.4-1.2 good for 3000 cells, if you have more cells, increase
#          # the number will give you more clusters
#          sce<- FindClusters(sce, resolution = 0.5)
#      },
#      step_name = "find_clusters",
#      dependency = "pca"
#  )

## ----plot_cluster, eval=FALSE, spr=TRUE-------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          sce <- RunUMAP(sce,dims = 1:20)
#          png("results/")
#          p_umap <- DimPlot(sce,reduction = 'umap', label = TRUE)
#          sce <- RunTSNE(sce,dims = 1:20)
#          p_tsne <- DimPlot(sce,reduction = 'tsne', label = TRUE)
#          png("results/plot_clusters.png", 1000, 570)
#          p_umap + p_tsne
#          dev.off()
#      },
#      step_name = "plot_cluster",
#      dependency = "find_clusters"
#  )

## ----find_markers, eval=FALSE, spr=TRUE-------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          # find markers for every cluster compared to all remaining cells, report only the positive ones
#          sce.markers <- FindAllMarkers(sce, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
#          sce.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
#      },
#      step_name = "find_markers",
#      dependency = "find_clusters"
#  )

## ----plot_markers, eval=FALSE, spr=TRUE-------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          png("results/vlnplot.png", 600, 600)
#          VlnPlot(sce, features = c("MS4A1", "CD79A"))
#          dev.off()
#  
#          png("results/marker_features.png", 700, 500)
#          FeaturePlot(sce, features = c("MS4A1", "GNLY", "CD3E", "CD14"))
#          dev.off()
#  
#          # plot top 10 DEG genes in each cluster as a heatmap
#          top10_markers <- sce.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
#          png("results/marker_heatmap.png", 1100, 700)
#          DoHeatmap(sce, features = top10_markers$gene) + NoLegend()
#          dev.off()
#      },
#      step_name = "plot_markers",
#      dependency = "find_markers"
#  )

## ----label_cell_type, eval=FALSE, spr=TRUE----------------
#  appendStep(sal) <- LineWise(
#      code = {
#          new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B",
#                               "CD8 T", "FCGR3A+ Mono","NK", "DC", "Platelet")
#          names(new.cluster.ids) <- levels(sce)
#          sce<- RenameIdents(sce, new.cluster.ids)
#          png("results/marker_labels.png", 700, 700)
#          DimPlot(sce, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
#          dev.off()
#  
#      },
#      step_name = "label_cell_type",
#      dependency = c("plot_cluster", "find_markers")
#  )

## ----wf_session, eval=FALSE, spr=TRUE---------------------
#  appendStep(sal) <- LineWise(
#      code = {
#          sessionInfo()
#      },
#      step_name = "wf_session",
#      dependency = "label_cell_type")

## ----runWF, eval=FALSE------------------------------------
#  sal <- runWF(sal, run_step = "mandatory") # remove `run_step` to run all steps to include optional steps

## ----list_tools-------------------------------------------
if(file.exists(file.path(".SPRproject", "SYSargsList.yml"))) {
    local({
        sal <- systemPipeR::SPRproject(resume = TRUE)
        systemPipeR::listCmdTools(sal)
        systemPipeR::listCmdModules(sal)
    })
} else {
    cat(crayon::blue$bold("Tools and modules required by this workflow are:\n"))
    cat(c("NA"), sep = "\n")
}

## ----report_session_info, eval=TRUE-----------------------
sessionInfo()