## ----Installation, eval = FALSE-----------------------------------------------
#  if(!require("tidyverse")) install.packages("tidyverse")
#  if(!require("ggpubr")) install.packages("ggpubr")
#  BiocManager::install("CelliD")

## ----library load, message=FALSE----------------------------------------------
library(CelliD)
library(tidyverse) # general purpose library for data handling
library(ggpubr) #library for plotting

## ----download data------------------------------------------------------------
#read data
BaronMatrix   <- readRDS(url("https://storage.googleapis.com/cellid-cbl/BaronMatrix.rds"))
BaronMetaData <- readRDS(url("https://storage.googleapis.com/cellid-cbl/BaronMetaData.rds"))
SegerMatrix   <- readRDS(url("https://storage.googleapis.com/cellid-cbl/SegerstolpeMatrix.rds"))
SegerMetaData <- readRDS(url("https://storage.googleapis.com/cellid-cbl/SegerstolpeMetaData2.rds"))

## ----preprocessing1, message=FALSE--------------------------------------------
# Restricting to protein-coding genes
data("HgProteinCodingGenes")
BaronMatrixProt <- BaronMatrix[rownames(BaronMatrix) %in% HgProteinCodingGenes,]
SegerMatrixProt <- SegerMatrix[rownames(SegerMatrix) %in% HgProteinCodingGenes,]

## ----preprocessing2, message=FALSE--------------------------------------------
# Create Seurat object  and remove remove low detection genes
Baron <- CreateSeuratObject(counts = BaronMatrixProt, project = "Baron", min.cells = 5, meta.data = BaronMetaData)
Seger <- CreateSeuratObject(counts = SegerMatrixProt, project = "Segerstolpe", min.cells = 5, meta.data = SegerMetaData)

# Library-size normalization, log-transformation, and centering and scaling of gene expression values
Baron <- NormalizeData(Baron)
Baron <- ScaleData(Baron, features = rownames(Baron))

## ----MCA1---------------------------------------------------------------------
Baron <- RunMCA(Baron)

## ----MCA2---------------------------------------------------------------------
DimPlotMC(Baron, reduction = "mca", group.by = "cell.type", features = c("CTRL", "INS", "MYZAP", "CDH11"), as.text = TRUE) + ggtitle("MCA with some key gene markers")

## ----Std Seurat---------------------------------------------------------------

Baron <- RunPCA(Baron, features = rownames(Baron))
Baron <- RunUMAP(Baron, dims = 1:30)
Baron <- RunTSNE(Baron, dims = 1:30)
PCA  <- DimPlot(Baron, reduction = "pca", group.by = "cell.type")  + ggtitle("PCA") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
tSNE <- DimPlot(Baron, reduction = "tsne", group.by = "cell.type")+ ggtitle("tSNE") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
UMAP <- DimPlot(Baron, reduction = "umap", group.by = "cell.type") + ggtitle("UMAP") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
MCA <- DimPlot(Baron, reduction = "mca", group.by = "cell.type")  + ggtitle("MCA") + theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(PCA, MCA, common.legend = TRUE, legend = "top")
ggarrange(tSNE, UMAP, common.legend = TRUE, legend = "top")

## ----pancreas_gs--------------------------------------------------------------
# download all cell-type gene signatures from panglaoDB
panglao <- read_tsv("https://panglaodb.se/markers/PanglaoDB_markers_27_Mar_2020.tsv.gz")

# restricting the analysis to pancreas specific gene signatues
panglao_pancreas <- panglao %>% filter(organ == "Pancreas")

# restricting to human specific genes
panglao_pancreas <- panglao_pancreas %>%  filter(str_detect(species,"Hs"))

# converting dataframes into a list of vectors, which is the format needed as input for CelliD
panglao_pancreas <- panglao_pancreas %>%  
  group_by(`cell type`) %>%  
  summarise(geneset = list(`official gene symbol`))
pancreas_gs <- setNames(panglao_pancreas$geneset, panglao_pancreas$`cell type`)

## ----all gs-------------------------------------------------------------------
#filter to get human specific genes
panglao_all <- panglao %>%  filter(str_detect(species,"Hs"))

# convert dataframes to a list of named vectors which is the format for CelliD input
panglao_all <- panglao_all %>%  
  group_by(`cell type`) %>%  
  summarise(geneset = list(`official gene symbol`))
all_gs <- setNames(panglao_all$geneset, panglao_all$`cell type`)

#remove very short signatures
all_gs <- all_gs[sapply(all_gs, length) >= 10]

## ----predictions using pancreas preestablished geneset------------------------

# Performing per-cell hypergeometric tests against the gene signature collection
HGT_pancreas_gs <- RunCellHGT(Baron, pathways = pancreas_gs, dims = 1:50, n.features = 200)

# For each cell, assess the signature with the lowest corrected p-value (max -log10 corrected p-value)
pancreas_gs_prediction <- rownames(HGT_pancreas_gs)[apply(HGT_pancreas_gs, 2, which.max)]

# For each cell, evaluate if the lowest p-value is significant
pancreas_gs_prediction_signif <- ifelse(apply(HGT_pancreas_gs, 2, max)>2, yes = pancreas_gs_prediction, "unassigned")

# Save cell type predictions as metadata within the Seurat object
Baron$pancreas_gs_prediction <- pancreas_gs_prediction_signif

## ----plottting cell type predictions based on pancreas-specific gene signatures----
# Comparing the original labels with CelliD cell-type predictions based on pancreas-specific gene signatures
color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(Baron$cell.type)), "unassigned"))
OriginalPlot <- DimPlot(Baron, reduction = "tsne", group.by = "cell.type") + 
  scale_color_manual(values = ggcolor) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
Predplot1 <- DimPlot(Baron, reduction = "tsne", group.by = "pancreas_gs_prediction") + 
  scale_color_manual(values = ggcolor) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(OriginalPlot, Predplot1, legend = "top",common.legend = TRUE)

## ----predictions using all preestablished geneset-----------------------------
HGT_all_gs <- RunCellHGT(Baron, pathways = all_gs, dims = 1:50)
all_gs_prediction <- rownames(HGT_all_gs)[apply(HGT_all_gs, 2, which.max)]
all_gs_prediction_signif <- ifelse(apply(HGT_all_gs, 2, max)>2, yes = all_gs_prediction, "unassigned")

# For the sake of visualization, we group under the label "other" diverse cell types for which significant enrichments were found:
Baron$all_gs_prediction <- ifelse(all_gs_prediction_signif %in% c(names(pancreas_gs), "Schwann cells", "Endothelial cells", "Macrophages", "Mast cells", "T cells","Fibroblasts", "unassigned"), all_gs_prediction_signif,"other")

color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "#D575FE", "#F962DD", "grey", "black")
ggcolor <- setNames(color,c(sort(unique(Baron$cell.type)), "Fibroblasts", "Schwann cells", "unassigned", "other"))
Baron$pancreas_gs_prediction <- factor(Baron$pancreas_gs_prediction,c(sort(unique(Baron$cell.type)), "Fibroblasts", "Schwann cells", "unassigned", "other"))
PredPlot2 <- DimPlot(Baron, group.by = "all_gs_prediction", reduction = "tsne") + 
  scale_color_manual(values = ggcolor, drop = FALSE) + 
  theme(legend.text = element_text(size = 10), aspect.ratio = 1)
ggarrange(OriginalPlot, PredPlot2, legend = "top", common.legend = TRUE)

## ----geneset extraction-------------------------------------------------------
# Extracting per-cell gene signatures from the Baron dataset with CelliD(c)
Baron_cell_gs <- GetCellGeneSet(Baron, dims = 1:50, n.features = 200)

# Extracting per-group gene signatures from the Baron dataset with CelliD(g)
Baron_group_gs <- GetGroupGeneSet(Baron, dims = 1:50, n.features = 200, group.by = "cell.type")

## ----segerstolpe preprocessing, message = FALSE-------------------------------
# Normalization, basic preprocessing and MCA dimensionality reduction assessment
Seger <- NormalizeData(Seger)
Seger <- FindVariableFeatures(Seger)
Seger <- ScaleData(Seger)
Seger <- RunMCA(Seger, nmcs = 50)
Seger <- RunPCA(Seger)
Seger <- RunUMAP(Seger, dims = 1:30)
Seger <- RunTSNE(Seger, dims = 1:30)
tSNE <- DimPlot(Seger, reduction = "tsne", group.by = "cell.type", pt.size = 0.1) + ggtitle("tSNE") + theme(aspect.ratio = 1)
UMAP <- DimPlot(Seger, reduction = "umap", group.by = "cell.type", pt.size = 0.1) + ggtitle("UMAP") + theme(aspect.ratio = 1)
ggarrange(tSNE, UMAP, common.legend = TRUE, legend = "top")

## ----predictions using extracted geneset cell---------------------------------
HGT_baron_cell_gs <- RunCellHGT(Seger, pathways = Baron_cell_gs, dims = 1:50)
baron_cell_gs_match <- rownames(HGT_baron_cell_gs)[apply(HGT_baron_cell_gs, 2, which.max)]
baron_cell_gs_prediction <- Baron$cell.type[baron_cell_gs_match]
baron_cell_gs_prediction_signif <- ifelse(apply(HGT_baron_cell_gs, 2, max)>2, yes = baron_cell_gs_prediction, "unassigned")
Seger$baron_cell_gs_prediction <- baron_cell_gs_prediction_signif

color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(Baron$cell.type)), "unassigned"))

ggPredictionsCellMatch <- DimPlot(Seger, group.by = "baron_cell_gs_prediction", pt.size = 0.2, reduction = "tsne") + ggtitle("Predicitons") + scale_color_manual(values = ggcolor, drop =FALSE) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggOriginal <- DimPlot(Seger, group.by = "cell.type", pt.size = 0.2, reduction = "tsne") + ggtitle("Original") + scale_color_manual(values = ggcolor) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(ggOriginal, ggPredictionsCellMatch, legend = "top", common.legend = TRUE)

## ----predictions using extracted geneset group--------------------------------
HGT_baron_group_gs <- RunCellHGT(Seger, pathways = Baron_group_gs, dims = 1:50)
baron_group_gs_prediction <- rownames(HGT_baron_group_gs)[apply(HGT_baron_group_gs, 2, which.max)]
baron_group_gs_prediction_signif <- ifelse(apply(HGT_baron_group_gs, 2, max)>2, yes = baron_group_gs_prediction, "unassigned")
Seger$baron_group_gs_prediction <- baron_group_gs_prediction_signif

color <- c("#F8766D", "#E18A00", "#BE9C00", "#8CAB00", "#24B700", "#00BE70", "#00C1AB", "#00BBDA", "#00ACFC", "#8B93FF", "#D575FE", "#F962DD", "#FF65AC", "grey")
ggcolor <- setNames(color,c(sort(unique(Baron$cell.type)), "unassigned"))

ggPredictions <- DimPlot(Seger, group.by = "baron_group_gs_prediction", pt.size = 0.2, reduction = "tsne") + ggtitle("Predicitons") + scale_color_manual(values = ggcolor, drop =FALSE) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggOriginal <- DimPlot(Seger, group.by = "cell.type", pt.size = 0.2, reduction = "tsne") + ggtitle("Original") + scale_color_manual(values = ggcolor) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)
ggarrange(ggOriginal, ggPredictions, legend = "top", common.legend = TRUE)

## ---- Functionnal Enrichment--------------------------------------------------
# Downloading functional gene sets:
# For computational reasons, we just developed here the assessment on Hallmark MSigDB gene sets that is provided in the package
# Other functionnal pathways can be obtained using the msigdbr package

# library(msigdbr)
# msigdf <- msigdbr::msigdbr(species = "Homo sapiens")
# msigdf_filtered <- msigdf %>% select(gs_cat, gs_subcat, gs_name, gene_symbol, gs_description) %>%
#   filter(gs_subcat == "CP:KEGG") %>%
#   mutate(gs_subcat = ifelse(gs_subcat == "", "Hallmark", gs_subcat))
# KEGGgenesetDF <- msigdf_filtered %>%  mutate(gs_name = str_replace(sub("^.*?_{1}", "", gs_name), "_", "\\-")) %>%
#   group_by(gs_subcat, gs_name) %>%
#   summarise(geneset = list(gene_symbol)) %>%
#   group_by(gs_subcat) %>%
#   summarise(all_geneset = list(setNames(geneset, gs_name))) 
# KEGGgeneset <- setNames(KEGGgenesetDF$all_geneset, KEGGgenesetDF$gs_subcat)

# Assessing per-cell functional enrichment analyses
HGT_Hallmark <- RunCellHGT(Seger, pathways = Hallmark, dims = 1:50)

#Integrating functional annotations as an "assay" slot in the Seurat objects

Seger[["Hallmark"]] <- CreateAssayObject(HGT_Hallmark)

# Visualizing per-cell functional enrichment annotations in a dimensionality-reduction representation of choice (e.g. MCA, PCA, tSNE, UMAP)
ggG2Mcell <- FeaturePlot(Seger, "G2M-CHECKPOINT", order = TRUE, reduction = "tsne", min.cutoff = 2) + 
  theme(legend.text = element_text(size =10), aspect.ratio = 1)

ggG2Mcell

## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()