---
title: "Example: finding E. coli outstanding genomic zones in response to nickel stress"
author: "Hua Zhong and Joe Song"
date: "Updated August 9, 2019; Created August 9, 2019"
output: rmarkdown::html_vignette
bibliography: vignettes.bib
vignette: >
  %\VignetteIndexEntry{GenomicOZone}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

We illustrate the GenomicOZone package with an example. The *Escherichia coli* transcriptome data set [@gault2016data] contains transcriptome profiles of *E. coli* K-12 under nickel stress, with three samples under exposure to nickel and three normal samples. 

To run this example, two other packages `GEOquery` [@davis2007geoquery] for data download and `readxl` for Excel file reading are required and should have been installed, in addition to `GenomicOZone`.
```{r setup, message=F, warning=F, results = "hide"}
require(GenomicOZone)
require(GEOquery)
require(readxl)
```

The following code chunk downloads a data transcriptome set, prepares genome annotation, and creates parameters before outstanding zone analysis. Specifically, an *Escherichia coli* transcriptome data set of series number GSE76167 is downloaded from GEO database [@edgar2002gene]. We extracted the RPKM values from six samples including three stressed samples and three wild-type samples. The genome annotation is prepared as a `GRanges` [@lawrence2013software] object.
```{r, message=F, warning=F, results = "hide"}
# Obtain data matrix from GSE76167 supplementary file
invisible(getGEOSuppFiles("GSE76167"))
data <- read_excel("./GSE76167/GSE76167_GeneFPKM_AllSamples.xlsx")


# Adjust the input data
data.info <- data[,1:5]
data <- data[,-c(1:5)]
data <- data[,substr(colnames(data), 1, 4) == "FPKM"]
data <- data.matrix(data[,c(1,5,6,3,4,2)])
colnames(data) <- c(paste(rep("WT",3), "_", c(1,2,3), sep=""), 
                    paste(rep("Ni",3), "_", c(1,2,3), sep=""))
rownames(data) <- data.info$tracking_id


# Obtain genes
data.genes <- data.info$gene_short_name
data.genes[data.genes == "-"] <- data.info$tracking_id[data.genes == "-"]


# Create colData
colData <- data.frame(Sample_name = colnames(data),
                      Condition = factor(rep(c("WT", "Ni"), each = 3), 
                                         levels = c("WT", "Ni")))


# Create design
design <- ~ Condition


# Create rowData.GRanges
pattern <- "(.[^\\:]*)\\:([0-9]+)\\-([0-9]+)"
matched <- regexec(pattern, as.character(data.info$locus))
values <- regmatches(as.character(data.info$locus), matched)
data.gene.coor <-  data.frame(chr = as.character(sapply(values, function(x){x[[2]]})),
                              start = as.numeric(sapply(values, function(x){x[[3]]})),
                              end = as.numeric(sapply(values, function(x){x[[4]]})))
rownames(data.gene.coor) <- as.character(data.info$tracking_id)
rowData.GRanges <- GRanges(seqnames = data.gene.coor$chr,
                           IRanges(start = data.gene.coor$start, 
                                   end = data.gene.coor$end),
                           Gene.name = data.genes)
names(rowData.GRanges) <- data.info$tracking_id

chr.size <- 4646332
names(chr.size) <- "NC_007779"
seqlevels(rowData.GRanges) <- names(chr.size)
seqlengths(rowData.GRanges) <- chr.size
```


With the formatted data, parameters, and annotation, we run the outstanding zone analysis as below:
```{r}
# Create an input object also checking for data format, consistency, and completeness
GOZ.ds <- GOZDataSet(data = data,
                     colData = colData,
                     design = design,
                     rowData.GRanges = rowData.GRanges)


# Run the outstanding zone analysis
GOZ.ds <- GenomicOZone(GOZ.ds)
```

The following four auxiliary functions extract gene annotation, zone annotation, outstanding zone annotation, and zone expression matrix, respectively:
```{r}
# Extract gene/zone GRanges object
Gene.GRanges <- extract_genes(GOZ.ds)
head(Gene.GRanges)

Zone.GRanges <- extract_zones(GOZ.ds)
head(Zone.GRanges)

# min.effect.size = 0.36 is chosen from the 
#     minumum of top 5% effect size values
OZone.GRanges <- extract_outstanding_zones(
                              GOZ.ds,
                              alpha = 0.05, 
                              min.effect.size = 0.36)
head(OZone.GRanges)


Zone.exp.mat <- extract_zone_expression(GOZ.ds)
head(Zone.exp.mat)
```

Three types of plot can be generated from the returned object by `GenomicOZone()`, including genome-wide overview, within-chromosome heatmap, and within-zone expression. The plots are generated using R package `ggplot2` [@Wickham2016ggplot2] and `ggbio` [@yin2012ggbio]. The value of `min.effect.size = 0.36` is chosen from the minumum of top 5% effect size values. The effect size for ANOVA is calculated by R package `sjstats` [@sjstats2019].
    
```{r, out.width = "100%", out.height = "100%", fig.align="center"}
# Genome-wide overview
plot_genome(GOZ.ds, plot.file = "E_coli_genome.pdf",
            plot.width = 15, plot.height = 4,
            alpha = 0.05, min.effect.size = 0.36)
knitr::include_graphics("E_coli_genome.pdf")
```

```{r, out.width = "100%", out.height = "100%", fig.align="center"}
# Within-chromosome heatmap
plot_chromosomes(GOZ.ds, plot.file = "E_coli_chromosome.pdf",
                 plot.width = 20, plot.height = 4,
                 alpha = 0.05, min.effect.size = 0.36)
knitr::include_graphics("E_coli_chromosome.pdf")
```

```{r, out.width = "50%", out.height = "100%", fig.align="center"}
# Within-zone expression
plot_zones(GOZ.ds, plot.file = "E_coli_zone.pdf", 
           plot.all.zones = FALSE,
           alpha = 0.05, min.effect.size = 0.36)
knitr::include_graphics("E_coli_zone.pdf")
```


## References