---
title: "Multi-sample analysis (10x Visium Human DLPFC)"
output: BiocStyle::html_document
# output: pdf_document
vignette: >
  %\VignetteIndexEntry{Multi-sample analysis (10x Visium Human DLPFC)}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    fig.path = "figures/",
    dpi = 36
)
```

Here, we demonstrate BANKSY analysis on 10x Visium data of the human 
dorsolateral prefrontal cortex from Maynard et al (2018). The data comprise 12 
samples obtained from 3 subjects, with manual annotation of the layers in each 
sample. We will focus on 4 of the 12 samples from subject 3, demonstrating 
multi-sample analysis with BANKSY.

```{r, eval=TRUE, include=FALSE}
start.time <- Sys.time()
```

```{r, eval=TRUE, message=FALSE, warning=FALSE}
library(Banksy)
library(SummarizedExperiment)
library(SpatialExperiment)
library(Seurat)

library(scater)
library(cowplot)
library(ggplot2)
```

# Loading the data

We fetch the data for all 12 DLPFC samples with the [*spatialLIBD*](https://bioconductor.org/packages/release/data/experiment/html/spatialLIBD.html) 
package. This might take awhile.

```{r, eval=TRUE, message=FALSE, warning=FALSE}
library(spatialLIBD)
library(ExperimentHub)

ehub <- ExperimentHub::ExperimentHub()
spe <- spatialLIBD::fetch_data(type = "spe", eh = ehub)
```

After the download is completed, we trim the *SpatialExperiment* object, 
retaining only the counts and some metadata such as the sample identifier and 
pathology annotations. This saves some memory.

```{r, eval=TRUE, message=FALSE, warning=FALSE}
imgData(spe) <- NULL
assay(spe, "logcounts") <- NULL
reducedDims(spe) <- NULL
rowData(spe) <- NULL
colData(spe) <- DataFrame(
    sample_id = spe$sample_id,
    clust_annotation = factor(
        addNA(spe$layer_guess_reordered_short),
        exclude = NULL, labels = seq(8)
    ),
    in_tissue = spe$in_tissue,
    row.names = colnames(spe)
)
invisible(gc())
```

Next, subset `spe` to samples from the last subject (samples `151673`, 
`151674`, `151675`, `151676`). This stores each sample in its own 
*SpatialExperiment* object, all placed in a list.

```{r, eval=TRUE, message=FALSE, warning=FALSE}
sample_names <- as.character(151673:151676)
spe_list <- lapply(sample_names, function(x) spe[, spe$sample_id == x])
rm(spe)
invisible(gc())
```

# Data preprocessing

Using Seurat, we perform basic normalisation of the data, and select the top
2000 highly variable features from each sample. Other methods for normalisation 
and feature selection may also be used. We take the union of these features for 
downstream analysis.

```{r, eval=TRUE, message=FALSE, warning=FALSE}
#' Normalize data
seu_list <- lapply(spe_list, function(x) {
    x <- as.Seurat(x, data = NULL)
    NormalizeData(x, scale.factor = 5000, normalization.method = 'RC')
})

#' Compute HVGs
hvgs <- lapply(seu_list, function(x) {
    VariableFeatures(FindVariableFeatures(x, nfeatures = 2000))
})
hvgs <- Reduce(union, hvgs)

#' Add data to SpatialExperiment and subset to HVGs
aname <- "normcounts"
spe_list <- Map(function(spe, seu) {
    assay(spe, aname) <- GetAssayData(seu)
    spe[hvgs,]
    }, spe_list, seu_list)
rm(seu_list)
invisible(gc())
```

# Running BANKSY

To run BANKSY across multiple samples, we first compute the BANKSY neighborhood
feature matrices for each sample separately. We use `k_geom=6` corresponding to
the first-order neighbors in 10x Visium assays (`k_geom=18` corresponding to 
first and second-order neighbors may also be used).

```{r, eval=TRUE, message=FALSE, warning=FALSE}
compute_agf <- FALSE
k_geom <- 6
spe_list <- lapply(spe_list, computeBanksy, assay_name = aname, 
                   compute_agf = compute_agf, k_geom = k_geom)
```

We then merge the samples to perform joint dimensional reduction and 
clustering:

```{r, eval=TRUE, message=FALSE, warning=FALSE}
spe_joint <- do.call(cbind, spe_list)
rm(spe_list)
invisible(gc())
```

When running multi-sample BANKSY PCA, the `group` argument may be provided. 
This specifies the grouping variable for the cells or spots across the samples.
Features belonging to cells or spots corresponding to each level of the 
grouping variable will be z-scaled separately. In this case, `sample_id` in 
`colData(spe_joint)` gives the grouping based on the sample of origin.

```{r, eval=TRUE, message=FALSE, warning=FALSE}
lambda <- 0.2
use_agf <- FALSE
spe_joint <- runBanksyPCA(spe_joint, use_agf = use_agf, lambda = lambda, group = "sample_id", seed = 1000)
```

Run UMAP on the BANKSY embedding:

```{r, eval=TRUE, message=FALSE, warning=FALSE}
spe_joint <- runBanksyUMAP(spe_joint, use_agf = use_agf, lambda = lambda, seed = 1000)
```

Finally, obtain cluster labels for spots across all 4 samples. We use 
`connectClusters` for visual comparison of the manual annotations and BANKSY 
clusters.

```{r, eval=TRUE, message=FALSE, warning=FALSE}
res <- 0.7
spe_joint <- clusterBanksy(spe_joint, use_agf = use_agf, lambda = lambda, resolution = res, seed = 1000)
cnm <- sprintf("clust_M%s_lam%s_k50_res%s", as.numeric(use_agf), lambda, res)
spe_joint <- connectClusters(spe_joint)
```

Once joint clustering is performed, we split the samples into their own
`SpatialExperiment` objects:

```{r, eval=TRUE, message=FALSE, warning=FALSE}
spe_list <- lapply(sample_names, function(x) spe_joint[, spe_joint$sample_id == x])
rm(spe_joint)
invisible(gc())
```

As an optional step, we smooth the cluster labels of each sample separately. 
This can be done if smooth spatial domains are expected in the biological 
sample or tissue in question.

```{r, eval=TRUE, message=FALSE, warning=FALSE}
spe_list <- lapply(spe_list, smoothLabels, cluster_names = cnm, k = 6L, verbose = FALSE)
names(spe_list) <- paste0("sample_", sample_names)
```

The raw and smoothed cluster labels are stored in the `colData` slot of each 
`SingleCellExperiment` or `SpatialExperiment` object. 

```{r, eval=TRUE, echo=FALSE}
head(colData(spe_list$sample_151673))
```

# Parsing BANKSY output

We can compare BANKSY clusters to pathology annotations using several cluster
comparison measures such as the adjusted Rand index (ARI) or normalized mutual
information (NMI) with `compareClusters`. The function computes the selected 
comparison measure between all pairs of cluster labels:

```{r, eval=TRUE}
compareClusters(spe_list$sample_151673, func = 'ARI')
```

We evaluate the ARI and NMI for each sample:

```{r, eval=TRUE}
ari <- sapply(spe_list, function(x) as.numeric(tail(compareClusters(x, func = "ARI")[, 1], n = 1)))
ari
```

```{r, eval=TRUE}
nmi <- sapply(spe_list, function(x) as.numeric(tail(compareClusters(x, func = "NMI")[, 1], n = 1)))
nmi
```

Visualise pathology annotation and BANKSY cluster on spatial coordinates with 
the [*ggspavis*](https://bioconductor.org/packages/ggspavis) package:

```{r multi-sample-spatial, eval=TRUE, fig.height=4, out.width='90%'}
# Use scater:::.get_palette('tableau10medium')
pal <- c(
    "#729ECE", "#FF9E4A", "#67BF5C", "#ED665D", "#AD8BC9",
    "#A8786E", "#ED97CA", "#A2A2A2", "#CDCC5D", "#6DCCDA"
)

plot_bank <- lapply(spe_list, function(x) {
    df <- cbind.data.frame(
        clust=colData(x)[[sprintf("%s_smooth", cnm)]], spatialCoords(x))
    ggplot(df, aes(x=pxl_row_in_fullres, y=pxl_col_in_fullres, col=clust)) +
        geom_point(size = 0.5) + 
        scale_color_manual(values = pal) +
        theme_classic() + 
        theme(
            legend.position = "none",
            axis.text.x=element_blank(),
            axis.text.y=element_blank(),
            axis.ticks=element_blank(),
            axis.title.x=element_blank(),
            axis.title.y=element_blank()) +
        labs(title = "BANKSY clusters") +
        coord_equal()
})

plot_anno <- lapply(spe_list, function(x) {
    df <- cbind.data.frame(
        clust=colData(x)[['clust_annotation']], spatialCoords(x))
    ggplot(df, aes(x=pxl_row_in_fullres, y=pxl_col_in_fullres, col=clust)) +
        geom_point(size = 0.5) + 
        scale_color_manual(values = pal) +
        theme_classic() + 
        theme(
            legend.position = "none",
            axis.text.x=element_blank(),
            axis.text.y=element_blank(),
            axis.ticks=element_blank(),
            axis.title.x=element_blank(),
            axis.title.y=element_blank()) +
        labs(title = sprintf("Sample %s", x$sample_id[1])) +
        coord_equal()
})

plot_list <- c(plot_anno, plot_bank)

plot_grid(plotlist = plot_list, ncol = 4, byrow = TRUE)
```

Visualize joint UMAPs for each sample:

```{r multi-sample-umap, eval=TRUE, fig.height=5, out.width='90%'}
umap_bank <- lapply(spe_list, function(x) {
    plotReducedDim(x,
        "UMAP_M0_lam0.2",
        colour_by = sprintf("%s_smooth", cnm),
        point_size = 0.5
    ) +
        theme(legend.position = "none") +
        labs(title = "BANKSY clusters")
})

umap_anno <- lapply(spe_list, function(x) {
    plotReducedDim(x,
        "UMAP_M0_lam0.2",
        colour_by = "clust_annotation",
        point_size = 0.5
    ) +
        theme(legend.position = "none") +
        labs(title = sprintf("Sample %s", x$sample_id[1]))
})

umap_list <- c(umap_anno, umap_bank)

plot_grid(plotlist = umap_list, ncol = 4, byrow = TRUE)
```

# Session information

Vignette runtime:

```{r, eval=TRUE, echo=FALSE}
Sys.time() - start.time
```

<details>

```{r, sess}
sessionInfo()
```

</details>