## ----setup, include=FALSE-----------------------------------------------------
options(width = 80)
knitr::opts_chunk$set(collapse = TRUE,
                      message = FALSE,
                      comment = "#>",
                      dpi = 250)

suppressPackageStartupMessages({
    library(TMSig)
})

## ----read-gmt-----------------------------------------------------------------
# Path to GMT file - MSigDB Gene Ontology sets
pathToGMT <- system.file(
    "extdata", "c5.go.v2023.2.Hs.symbols.gmt.gz",
    package = "TMSig"
)

geneSets <- readGMT(path = pathToGMT)

length(geneSets) # 10461 gene sets

head(names(geneSets)) # first 6 gene set names

geneSets[[1]] # genes in first set

## ----filter-sets--------------------------------------------------------------
# Filter by size - no background
filt <- filterSets(x = geneSets, 
                   min_size = 10L, 
                   max_size = 500L)
length(filt) # 7096 gene sets remain (down from 10461)

## ----create-background--------------------------------------------------------
# Top 2000 most common genes
topGenes <- table(unlist(geneSets))
topGenes <- head(sort(topGenes, decreasing = TRUE), 2000L)
head(topGenes)

background <- names(topGenes)

## ----filter-sets-bg-----------------------------------------------------------
# Restrict genes sets to background of genes
geneSetsFilt <- filterSets(
    x = geneSets, 
    background = background, 
    min_size = 10L # min. overlap of each set with background
)

length(geneSetsFilt) # 4629 gene sets pass

## ----overlap-prop-opts, include=FALSE-----------------------------------------
cap <- "Scatterplot of the original set size vs. overlap proportion."

## ----overlap-prop, fig.cap=cap------------------------------------------------
# Calculate proportion of overlap with background
setSizes <- lengths(geneSetsFilt)
setSizesOld <- lengths(geneSets)[names(geneSetsFilt)]
overlapProp <- setSizes / setSizesOld

plot(setSizesOld, overlapProp, main = "Set Size vs. Overlap")

## ----filter-by-overlap--------------------------------------------------------
table(overlapProp >= 0.7)

geneSetsFilt <- geneSetsFilt[overlapProp >= 0.7]
length(geneSetsFilt) # 1015 gene sets pass

## ----incidence-matrix---------------------------------------------------------
imat <- sparseIncidence(x = geneSetsFilt)
dim(imat) # 1015 sets, 1914 genes

imat[seq_len(8L), seq_len(5L)] # first 8 sets, first 5 genes

## ----incidence-products-------------------------------------------------------
# Calculate sizes of all pairwise intersections
tcrossprod(imat)[1:3, 1:3] # first 3 gene sets

# Calculate number of sets and pairs of sets to which each gene belongs
crossprod(imat)[1:3, 1:3] # first 3 genes

## ----sim-matrix---------------------------------------------------------------
# Jaccard similarity (default)
jaccard <- similarity(x = geneSetsFilt)
dim(jaccard) # 1015 1015`
class(jaccard)

## ----sim-jaccard--------------------------------------------------------------
# 6 sets with highest Jaccard for a specific term
idx <- order(jaccard[, "GOBP_CARDIAC_ATRIUM_DEVELOPMENT"], 
             decreasing = TRUE)
idx <- head(idx)

jaccard[idx, "GOBP_CARDIAC_ATRIUM_DEVELOPMENT", drop = FALSE]

## ----sim-simpson--------------------------------------------------------------
overlap <- similarity(x = geneSetsFilt, type = "overlap")

overlap[idx, "GOBP_CARDIAC_ATRIUM_DEVELOPMENT", drop = FALSE]

## ----sim-otsuka---------------------------------------------------------------
otsuka <- similarity(x = geneSetsFilt, type = "otsuka")

otsuka[idx, "GOBP_CARDIAC_ATRIUM_DEVELOPMENT", drop = FALSE]

## ----cluster-sets-------------------------------------------------------------
# clusterSets with default arguments
clusterDF <- clusterSets(x = geneSetsFilt, 
                         type = "jaccard", 
                         cutoff = 0.85, 
                         method = "complete",
                         h = 0.9)

# First 4 clusters with 2 or more sets
subset(clusterDF, subset = cluster <= 4L)

## ----add-clusterDF-columns----------------------------------------------------
## Use clusterSets output to select sets to retain for analysis
clusterDF$overlap_prop <- overlapProp[clusterDF$set] # overlap proportion
clusterDF$n_char <- nchar(clusterDF$set) # length of set description

# Reorder rows so that the first set in each cluster is the one to keep
o <- with(clusterDF, 
          order(cluster, set_size, overlap_prop, n_char, set,
                decreasing = c(FALSE, TRUE, TRUE, FALSE, TRUE),
                method = "radix"))
clusterDF <- clusterDF[o, ]

subset(clusterDF, cluster <= 4L) # show first 4 clusters

## ----remove-similar-sets------------------------------------------------------
# Sets to keep for analysis
keepSets <- with(clusterDF, set[!duplicated(cluster)])
head(keepSets, 4L)

# Subset geneSetsFilt to those sets
geneSetsFilt <- geneSetsFilt[keepSets]
length(geneSetsFilt) # 986 (down from 1015)

## ----venn-diagram-chunk-opts, include=FALSE-----------------------------------
h <- w <- "80%"
cap <- "Decomposition of sets."

## ----venn-diagram, echo=FALSE, out.height=h, out.width=w, fig.cap=cap---------
knitr::include_graphics("images/Venn_Diagram.png")

## ----decompose-sets-----------------------------------------------------------
# Limit maximum set size for this example
geneSetsFilt2 <- filterSets(geneSetsFilt, max_size = 50L)

# Decompose all pairs of sets with at least 10 genes in common
decompSets <- decomposeSets(x = geneSetsFilt2, overlap = 10L)

# Last 3 components
tail(decompSets, 3L)

## ----invert-sets--------------------------------------------------------------
invertedSets <- invertSets(x = geneSetsFilt)

length(invertedSets) # 1914 genes

head(names(invertedSets)) # names are genes

invertedSets["FOXO1"] # all gene sets containing FOXO1

## -----------------------------------------------------------------------------
similarity(x = invertedSets[1:5]) # first 5 genes

## ----simulate-expression-data-------------------------------------------------
# Control and 2 treatment groups, 3 replicates each
group <- rep(c("ctrl", "treat1", "treat2"), 
             each = 3)
design <- model.matrix(~ 0 + group) # design matrix
contrasts <- makeContrasts(
    contrasts = c("grouptreat1 - groupctrl",
                  "grouptreat2 - groupctrl"),
    levels = colnames(design)
)

# Shorten contrast names
colnames(contrasts) <- gsub("group", "", colnames(contrasts))

ngenes <- length(background) # 2000 genes
nsamples <- length(group) # 9 samples

set.seed(0)
y <- matrix(data = rnorm(ngenes * nsamples),
            nrow = ngenes, ncol = nsamples,
            dimnames = list(background, make.unique(group)))
head(y)

## ----perturb-gene-sets--------------------------------------------------------
cardiacGenes <- geneSetsFilt[["GOBP_CARDIAC_ATRIUM_DEVELOPMENT"]]
tcellGenes <- geneSetsFilt[["GOBP_ACTIVATED_T_CELL_PROLIFERATION"]]

# Indices of treatment group samples
trt1 <- which(group == "treat1")
trt2 <- which(group == "treat2")

# Cardiac genes: higher in control relative to treatment
y[cardiacGenes, trt1] <- y[cardiacGenes, trt1] - 2
y[cardiacGenes, trt2] <- y[cardiacGenes, trt2] - 0.7

# T cell proliferation genes: higher in treatment relative to control
y[tcellGenes, trt1] <- y[tcellGenes, trt1] + 2
y[tcellGenes, trt2] <- y[tcellGenes, trt2] + 1

## ----differential-analysis----------------------------------------------------
# Differential analysis with LIMMA
fit <- lmFit(y, design)
fit.contr <- contrasts.fit(fit, contrasts = contrasts)
fit.smooth <- eBayes(fit.contr)

# Matrix of z-score equivalents of moderated t-statistics
modz <- with(fit.smooth, zscoreT(x = t, df = df.total))
head(modz)

## ----reformat-DA-results------------------------------------------------------
# Reformat differential analysis results for enrichmap
resDA <- lapply(colnames(contrasts), function(contrast_i) {
    res_i <- topTable(fit.smooth, 
                      coef = contrast_i, 
                      number = Inf, 
                      sort.by = "none")
    res_i$contrast <- contrast_i
    res_i$gene <- rownames(res_i)
    res_i$df.total <- fit.smooth$df.total
    
    return(res_i)
})

resDA <- data.table::rbindlist(resDA)

# Add z-statistic column
resDA$z <- with(resDA, zscoreT(x = t, df = df.total))

# Reorder rows
resDA <- resDA[with(resDA, order(contrast, P.Value, z)), ]

head(resDA)

## ----count-signif-genes-------------------------------------------------------
# Count number of significant (P adj. < 0.05) genes
table(resDA$contrast, resDA$adj.P.Val < 0.05)

## ----camera-PR----------------------------------------------------------------
# CAMERA-PR (matrix method)
res <- cameraPR(statistic = modz, 
                index = geneSetsFilt)
head(res)

## ----count-signif-sets--------------------------------------------------------
# Number of sets passing FDR threshold
table(res$Contrast, res$FDR < 0.05)

## ----echo=FALSE---------------------------------------------------------------
j1 <- round(jaccard["GOBP_CARDIAC_ATRIUM_DEVELOPMENT", 
                    "GOBP_ATRIAL_SEPTUM_DEVELOPMENT"], 
            digits = 3)
o1 <- overlap["GOBP_CARDIAC_ATRIUM_DEVELOPMENT", 
              "GOBP_ATRIAL_SEPTUM_DEVELOPMENT"]

## ----DA-heatmap-chunk-opts, include=FALSE-------------------------------------
h <- w <- "70%"
cap <- "Bubble heatmap of differential expression analysis results."

## ----DA-bubble-heatmap, out.height=h, out.width=w, fig.cap=cap----------------
# Differential analysis bubble heatmap
enrichmap(x = resDA, 
          scale_by = "max",
          n_top = 15L, # default
          plot_sig_only = TRUE, # default
          set_column = "gene", 
          statistic_column = "z", 
          contrast_column = "contrast", 
          padj_column = "adj.P.Val", 
          padj_legend_title = "BH Adjusted\nP-Value", 
          padj_cutoff = 0.05,
          cell_size = grid::unit(20, "pt"),
          # Additional arguments passed to ComplexHeatmap::Heatmap. Used to
          # modify default appearance.
          heatmap_args = list(
              name = "Z-Score",
              column_names_rot = 60,
              column_names_side = "top"
          ))

## ----camera-heatmap-chunk-opts, include=FALSE---------------------------------
h <- w <- "100%"
cap <- "Bubble heatmap of CAMERA-PR results."

## ----CAMERA-bubble-heatmap, out.height=h, out.width=w, fig.cap=cap------------
# CAMERA-PR bubble heatmap
enrichmap(x = res, 
          scale_by = "row", # default
          n_top = 20L,
          set_column = "GeneSet", 
          statistic_column = "ZScore", 
          contrast_column = "Contrast",
          padj_column = "FDR",
          padj_legend_title = "BH Adjusted\nP-Value",
          padj_cutoff = 0.05,
          cell_size = grid::unit(13, "pt"),
          # Additional arguments passed to ComplexHeatmap::Heatmap. Used to
          # modify default appearance.
          heatmap_args = list(
              name = "Z-Score", 
              column_names_rot = 60,
              column_names_side = "top"
          ))

## ----session-info-------------------------------------------------------------
print(sessionInfo(), locale = FALSE)