---
title: "Case study: Environmental complexity"
author: "The miaSim package authors."
date: "`r Sys.Date()`"
package: miaSim
output: 
  BiocStyle::html_document:
    fig_height: 7
    fig_width: 10
    toc: yes
    toc_depth: 2
    number_sections: true
vignette: >
  %\VignetteIndexEntry{caseStudy3-EnvironmentalComplexity}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
  \usepackage[utf8]{inputenc}
---

```{r, echo=FALSE}
knitr::opts_chunk$set(
  cache = FALSE,
  fig.width = 9,
  message = FALSE,
  eval=FALSE,  
  warning = FALSE)
```

# Introduction

`miaSim` implements tools for microbiome data simulation based on different ecological modeling assumptions. These can be used to simulate species abundance matrices, including time series. For a detailed function documentation, see the [function reference page](https://microbiome.github.io/miaSim/reference/index.html)

Install the Bioconductor release version with

```{r install-bioc, eval=FALSE}
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
```

Load the library

```{r load, eval=TRUE}
library(miaSim)
```

# Case study 3: Environmental complexity impacts on different communities

(Reference: [Non-additive microbial community responses to environmental complexity](https://doi.org/10.1038/s41467-021-22426-3))

The aim of this case study is to recalculate the patterns shown in the existing publication in the above reference. More specifically, the left part of this Figure 2. [![Figure 2](https://media.springernature.com/full/springer-static/image/art%253A10.1038%252Fs41467-021-22426-3/MediaObjects/41467_2021_22426_Fig2_HTML.png?as=webp)](https://www.nature.com/articles/s41467-021-22426-3#Fig2)

In this case study, consumer-resource model was used. The number of species in 3 distinct communities varied so that communities are named after the number of species (i.e. com13, com3, and com4).

For each community, a floor of different numbers of carbon resources (nutrients) is set from 1, 2, 4, 8, 16, to 32. For demo purposes we include here a smaller number of cases.

In the original experiment, value of OD600 is selected as y-axis to represent the growth yield. In our simulations, however, the total number of organisms(number of all individuals) is used to reflect the growth yield.

## Load dependencies

```{r deps}
library(ggplot2)
library(miaSim)
library(vegan)
library(ggplot2)
library(reshape2)
```

## Set random seed and number of repeats

```{r seed}
set.seed(42)
# n_rep <- 50 # Full (and slow) example
n_rep <- 5 # Speed up demo example
```


## initial the output dataframes to store data

```{r initialize}
result_df <- data.frame(
    n_species = integer(),
    theta = numeric(),
    i = integer(),
    n_resources = integer(),
    value = numeric()
)
result_df2 <- data.frame(
  matrix(NA, nrow = 0, ncol = 13, dimnames = list(NULL, paste0("sp", seq_len(13))))
)
sorensen_df <- data.frame(
    n_species = integer(),
    theta = numeric(),
    rho_mean = numeric(),
    rho_sd = numeric()
)
```

## generating function

this function generates a data frame, where each row is arranged in an increasing dissimilarity to the first row.

```{r funcs}
gradient_df_generator <- function(n_row, n_col, density_row, max_gradient, error_interval){
  list_initial <- list()
  dissimilarity.gradient <- seq(from = 0, to = max_gradient, length.out = n_row)
  for (i in seq_len(n_row)){
    print(i)
    if (i == 1){
      row_temp <- rbeta(n_col, 1, n_col)
      col_to_remove <- sample(x = seq_len(n_col), size = n_col-n_col*density_row)
      row_temp[col_to_remove] <- 0
      list_initial[[i]] <- row_temp
    } else {
      while (length(list_initial) < i) {
        row_temp <- rbeta(n_col, 1, n_col)
        col_to_remove <- sample(x = seq_len(n_col), size = n_col-n_col*density_row)
        row_temp[col_to_remove] <- 0
        diff_temp <- abs(vegdist(rbind(list_initial[[1]], row_temp), method = "bray") - dissimilarity.gradient[i])
        if (diff_temp < error_interval) {
          list_initial[[i]] <- row_temp
        }
      }
    }
  }

  # Return
  as.data.frame(t(matrix(unlist(list_initial), ncol = n_row)))

}
```

## Load parameters

Load parameters used by Pacheco et al., and initialized the community data frame

Note that in this step, the value range of theta has been extended.

```{r initialize2}
# Full example
#n_species_types <- c(13, 3, 4)
#theta_types <- c(1, 0.75, 0.5, 0.25, 0.1, 0.05)
#n_resources_types <- c(1,2,4,8,16,32)

# Speeding up the example by reducing the number of combinations
n_species_types <- c(3, 4)
theta_types <- c(1, 0.5)
n_resources_types <- c(1,2)

community.initial.df <- as.list(
  lapply(n_species_types, 
         gradient_df_generator, 
         n_row = n_rep, 
         density_row = 1,
         max_gradient = 0.7,
         error_interval = 0.15)
)
```

## Loop over different combinations of (number of species X theta X number of repetations X number of resources)

```{r loop}
n_species_types <- c(3, 4)
for (n_species in n_species_types){
    for (theta in theta_types) {
        sorensen <- c()
        for (i in seq_len(n_rep)){
            for (n_resources in n_resources_types) {
                ### generate E ####
                Etest <- randomE(n_species = n_species,
                                 n_resources = n_resources,
                                 mean_consumption = theta*n_resources,
                                 exact = TRUE)

                ### calculate rho ####
                if (n_resources == max(n_resources_types)){
                    Etest_pos <- Etest
                    Etest_pos[Etest_pos<0] <- 0
                    for (j in seq_len(n_species - 1)){
                        for (k in 2:n_species){
                            sorensen <- c(sorensen,
                                          sum(apply(Etest_pos[c(j,k),], 2, min)))
                        }
                    }
                }

                if (n_resources > 1){
                    Priority <- t(apply(matrix(sample(n_species * n_resources), nrow = n_species), 1, order))
                    Priority <- (Etest > 0) * Priority
                } else {
                    Priority <- NULL
                }

                print(paste0("n_species=",n_species, " theta=",theta, " i=", i, " n_resources=", n_resources))
                x0temp <- as.numeric(community.initial.df[[match(n_species, n_species_types)]][i,])
                x0temp <- 10*x0temp/sum(x0temp)
                CRMtest <- simulateConsumerResource(n_species = n_species,
                                                    n_resources = n_resources,
                                                    x0 = x0temp, #rep(10, n_species),
                                                    resources = rep(100, n_resources),
                                                    E = Etest,
                                                    # trophic_priority = Priority,
                                                    stochastic = TRUE,
                                                    t_end = 1000,
                                                    t_step = 1,
                                                    t_store = 1000)
                CRMspecies <- assay(CRMtest, "counts")[, ncol(CRMtest)] # last time point; was getCommunity(CRMtest)
                CRMspeciesTotal <- sum(CRMspecies)
                result_df[nrow(result_df)+1,] <- c(n_species, theta, i, n_resources, CRMspeciesTotal)
                result_df2[nrow(result_df2)+1,] <- c(CRMspecies, rep(NA, 13-length(CRMspecies)))

            }
        }
        rho_mean <- mean(sorensen)
        rho_sd <- var(sorensen)
        sorensen_df[nrow(sorensen_df)+1, ] <- c(n_species, theta, rho_mean, rho_sd)
    }
}

```

```{r visualize}
n_species_types <- c(3, 4)
p <- ggplot(result_df, aes(x = n_resources, y = value, group = n_resources)) +
    geom_boxplot(outlier.shape = NA) +
    geom_jitter(alpha = 0.2, width = 0.2) +
    facet_grid(. ~ factor(n_species, levels = n_species_types)) +
    theme_bw() +
    scale_x_continuous(trans = "log2", breaks = n_resources_types) +
    labs(x = "number of resources",
         y = "growth yield (number of individuals)")
print(p)
```