Getting started with ‘SpotSweeper’

Michael Totty

Johns Hopkins Bloomberg School of Public Health, Baltimore, MD, USA

Boyi Guo

Johns Hopkins Bloomberg School of Public Health, Baltimore, MD, USA

Stephanie Hicks

Johns Hopkins Bloomberg School of Public Health, Baltimore, MD, USA

2024-02-14

Input data format

In the examples below, we assume the input data are provided as a SpatialExperiment (SPE) object. The outputs for spot-level outliers and artifacts are stored in the colData of the SPE object.

Spot-level local outlier detection

This is an example workflow showing how to detect and visualize local outliers in 10X Genomics Visium data.

library(SpotSweeper)


# load  Maynard et al DLPFC daatset
spe <- STexampleData::Visium_humanDLPFC()

# change from gene id to gene names
rownames(spe) <- rowData(spe)$gene_name

# show column data before SpotSweeper
colnames(colData(spe))
#> [1] "barcode_id"   "sample_id"    "in_tissue"    "array_row"    "array_col"   
#> [6] "ground_truth" "cell_count"

# drop out-of-tissue spots
spe <- spe[, spe$in_tissue == 1]
spe <- spe[, !is.na(spe$ground_truth)]

SpotSweeper can be run on an SPE object with the following code. This outputs the local_outliers in the colData of the SPE object. Selecting data_output=TRUE exports z-transformed QC metrics as well.


# Identifying the mitochondrial transcripts in our SpatialExperiment.
is.mito <- rownames(spe)[grepl("^MT-", rownames(spe))]

# Calculating QC metrics for each spot using scuttle
spe<- scuttle::addPerCellQCMetrics(spe, subsets=list(Mito=is.mito))
colnames(colData(spe))
#>  [1] "barcode_id"            "sample_id"             "in_tissue"            
#>  [4] "array_row"             "array_col"             "ground_truth"         
#>  [7] "cell_count"            "sum"                   "detected"             
#> [10] "subsets_Mito_sum"      "subsets_Mito_detected" "subsets_Mito_percent" 
#> [13] "total"

# Identifying local outliers using SpotSweeper
features <- c('sum' ,'detected', "subsets_Mito_percent")
spe<- localOutliers(spe, 
                    features=features,
                    n_neighbors=18, 
                    data_output=TRUE,
                    method="multivariate"
                    )

# show column data after SpotSweeper
colnames(colData(spe))
#>  [1] "barcode_id"                "sample_id"                
#>  [3] "in_tissue"                 "array_row"                
#>  [5] "array_col"                 "ground_truth"             
#>  [7] "cell_count"                "sum"                      
#>  [9] "detected"                  "subsets_Mito_sum"         
#> [11] "subsets_Mito_detected"     "subsets_Mito_percent"     
#> [13] "total"                     "sum_log2"                 
#> [15] "detected_log2"             "subsets_Mito_percent_log2"
#> [17] "coords"                    "local_outliers"           
#> [19] "sum_z"                     "detected_z"               
#> [21] "subsets_Mito_percent_z"    "LOF"

We can now visualize local_outliers vs one of the QC metrics, sum_log2, with help from the escheR package.

library(escheR)
library(ggpubr)

# plotting using escheR
p1 <- make_escheR(spe) |> 
  add_fill(var = "sum_log2", point_size=1.25) +
  scale_fill_gradient(low ="white",high =  "darkgreen")

p2 <- make_escheR(spe) |> 
  add_fill(var = "sum_log2", point_size=1.25) |>
  add_ground(var = "local_outliers", stroke = 1) +
  scale_color_manual(
    name = "", # turn off legend name for ground_truth
    values = c(
      "TRUE" = "red",
      "FALSE" = "transparent")
  ) +
  scale_fill_gradient(low ="white",high =  "darkgreen")

plot_list <- list(p1, p2)
ggarrange(
  plotlist = plot_list,
  ncol = 2, nrow = 1,
  common.legend = FALSE
)

Session information

utils::sessionInfo()
#> R Under development (unstable) (2024-01-16 r85808)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.19-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] ggpubr_0.6.0                escheR_1.3.2               
#>  [3] ggplot2_3.4.4               STexampleData_1.11.0       
#>  [5] SpatialExperiment_1.13.0    SingleCellExperiment_1.25.0
#>  [7] SummarizedExperiment_1.33.3 Biobase_2.63.0             
#>  [9] GenomicRanges_1.55.3        GenomeInfoDb_1.39.6        
#> [11] IRanges_2.37.1              S4Vectors_0.41.3           
#> [13] MatrixGenerics_1.15.0       matrixStats_1.2.0          
#> [15] ExperimentHub_2.11.1        AnnotationHub_3.11.1       
#> [17] BiocFileCache_2.11.1        dbplyr_2.4.0               
#> [19] BiocGenerics_0.49.1         SpotSweeper_0.99.0         
#> 
#> loaded via a namespace (and not attached):
#>  [1] DBI_1.2.1                 bitops_1.0-7             
#>  [3] rlang_1.1.3               magrittr_2.0.3           
#>  [5] compiler_4.4.0            RSQLite_2.3.5            
#>  [7] DelayedMatrixStats_1.25.1 png_0.1-8                
#>  [9] vctrs_0.6.5               pkgconfig_2.0.3          
#> [11] crayon_1.5.2              fastmap_1.1.1            
#> [13] backports_1.4.1           magick_2.8.2             
#> [15] XVector_0.43.1            labeling_0.4.3           
#> [17] scuttle_1.13.0            utf8_1.2.4               
#> [19] rmarkdown_2.25            purrr_1.0.2              
#> [21] bit_4.0.5                 xfun_0.42                
#> [23] zlibbioc_1.49.0           cachem_1.0.8             
#> [25] beachmat_2.19.1           jsonlite_1.8.8           
#> [27] blob_1.2.4                highr_0.10               
#> [29] DelayedArray_0.29.1       BiocParallel_1.37.0      
#> [31] broom_1.0.5               parallel_4.4.0           
#> [33] R6_2.5.1                  bslib_0.6.1              
#> [35] car_3.1-2                 jquerylib_0.1.4          
#> [37] Rcpp_1.0.12               knitr_1.45               
#> [39] Matrix_1.6-5              tidyselect_1.2.0         
#> [41] abind_1.4-5               yaml_2.3.8               
#> [43] codetools_0.2-19          curl_5.2.0               
#> [45] lattice_0.22-5            tibble_3.2.1             
#> [47] withr_3.0.0               KEGGREST_1.43.0          
#> [49] evaluate_0.23             Biostrings_2.71.2        
#> [51] pillar_1.9.0              BiocManager_1.30.22      
#> [53] filelock_1.0.3            carData_3.0-5            
#> [55] generics_0.1.3            dbscan_1.1-12            
#> [57] RCurl_1.98-1.14           BiocVersion_3.19.1       
#> [59] sparseMatrixStats_1.15.0  munsell_0.5.0            
#> [61] scales_1.3.0              glue_1.7.0               
#> [63] tools_4.4.0               BiocNeighbors_1.21.2     
#> [65] ggsignif_0.6.4            cowplot_1.1.3            
#> [67] grid_4.4.0                tidyr_1.3.1              
#> [69] AnnotationDbi_1.65.2      colorspace_2.1-0         
#> [71] GenomeInfoDbData_1.2.11   cli_3.6.2                
#> [73] rappdirs_0.3.3            fansi_1.0.6              
#> [75] viridisLite_0.4.2         S4Arrays_1.3.3           
#> [77] dplyr_1.1.4               gtable_0.3.4             
#> [79] rstatix_0.7.2             sass_0.4.8               
#> [81] digest_0.6.34             SparseArray_1.3.4        
#> [83] farver_2.1.1              rjson_0.2.21             
#> [85] memoise_2.0.1             htmltools_0.5.7          
#> [87] lifecycle_1.0.4           httr_1.4.7               
#> [89] mime_0.12                 bit64_4.0.5              
#> [91] MASS_7.3-60.2