--- title: "_RNAdecay_ Vignette: Normalization, Modeling, and Visualization of RNA Decay Data" author: "Reed Sorenson | reed.sorenson@utah.edu | Department of Biology | University of Utah" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{RNAdecay} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- # I. DATA NORMALIZATION AND CORRECTION The functions and code of the `RNAdecay` package were written for the purpose of analyzing decreasing levels of RNA abundance as measured by RNA short read sequencing (RNA-seq) after inhibition of transcription. Tissue is treated to block transcription (e.g., using a chemical such as cordycepin or ActinomycinD) and collected at regular intervals (referred to as the RNA decay time course, e.g., T0, T30, T60 refer to 0, 30, and 60 min of blocked transcription). Total RNA is extracted from the tissue and measured by RNA-seq. Although we wrote this package to analyze RNA-seq data, it could also be adapted to analyze individual gene data from quantitative reverse transcription polymerase chain reaction (qRT-PCR) measures, however, normalization (as herein described) is best performed using a number of stable reference genes. Raw (Illumina) RNA-seq data are typically libraries made of up 10M-250M short (50 nt) sequences. These sequences are aligned to the genome and counted based on the location of their alignment in annotated genomic ranges (e.g., genes, exons, etc). Read counts for each gene are normalized to the size of their respective RNA-seq library for comparison to libraries generated from other biological material, therefore, read counts are expressed as reads per million (RPM). We begin RNA decay analysis in this package with RPM RNA abundance data. ## 1. T0 NORMALIZATION Prior to modeling RNA decay, we normalize the read counts to the initial (T0) RNA abundance, the transcript level just prior to blocking transcription. This is unique for each gene, and, in an experiment with multiple replicates (reps), we use the mean value for all T0 reps. All RPM values in the time course are divided by the mean T0 value. For example: ```{r} # you can see that in this fabricated data the RNAs have a half-life of ~30 min RPM_geneX <- data.frame(T0 = c(150, 135, 148), T30 = c(72, 76, 69), T60 = c(35, 35, 30), row.names = paste0("rep", 1:3)) RPM_geneX RPM_geneX_T0norm <- RPM_geneX/mean(RPM_geneX$T0) RPM_geneX_T0norm colMeans(RPM_geneX_T0norm) ``` _The following workflow depends on beginning with a data frame of RPM values with column names consisting of unique, non-nested variable tags separated by underscores ('\_'), with rownames as unique gene identifiers._ Here, we use with built-in example data (from [Sorenson et al., 2018](http://www.pnas.org/content/early/2018/01/30/1712312115.short)) which we will use throughout this workflow. It has 4 genotypes, 8 time points, and 4 biological replicates for 128 samples (columns) and 118 genes (rows). ```{r} library(RNAdecay) RPMs <- RNAdecay::RPMs # built-in example data RPMs[1:2, c(1:11, 128)] # NOTE: not showing all columns here ``` Biological replicate data columns are indexed using the `cols()` function. ```{r} cols(patterns = c("WT", "00"), df = RPMs) #gives the column indices that contain both tags in the column names colnames(RPMs)[cols(patterns = c("WT", "00"), RPMs, "WT", "00")] # NOTE: this is based on grep pattern matching so tags MUST be unique and non-nested (i.e. one entire label should not be part of another). ``` The mean and standard error (SE) RPM values are calculated by looping over label combinations and binding the values together in a new data frame. ```{r} # create a directory for results output wrdir <- paste0(tempdir(),"/DecayAnalysis/Example analysis results 1") if(!file.exists(wrdir)) dir.create(wrdir, recursive = TRUE) # specify sample tags from colnames of RPMs treatment <- c("WT", "sov", "vcs", "vs") # NOTE: I did not use 'vcs sov' for the name of the double mutant, or cols(df=RPMs, patterns = c("vcs", "00")) would have indexed all T0 samples of both "vcs" and "vcs" sov"; instead, I used 'vs'. reps <- c("r1", "r2", "r3", "r4") tdecay <- c("00", "07", "15", "30", "60", "120", "240", "480") #again NO nesting of one label in another hence "00" instead of "0". ``` Loop over the treatments and timepoints to subset the appropriate columns, and calculate mean and SE of replicates. ```{r} mean_RPM <- data.frame("geneID" = rownames(RPMs)) SE_RPM <- data.frame("geneID" = rownames(RPMs)) for(g in treatment){ for(t in tdecay){ mean_RPM <- cbind(mean_RPM, rowMeans(RPMs[, cols(df=RPMs, patterns = c(g, t))])) names(mean_RPM)[length(mean_RPM)] <- paste0(g, "_", t) SE_RPM <- cbind(SE_RPM, apply(X = RPMs[, cols(df=RPMs, patterns = c(g, t))], MARGIN = 1, FUN = stats::sd)/sqrt(length(reps))) names(SE_RPM)[length(SE_RPM)] <- paste0(g, "_", t) }} mean_RPM[1:2, ] SE_RPM[1:2, ] # write output to file write.table(x = mean_RPM, paste0(wrdir, "/RPM_mean.txt"), sep = "\t") write.table(x = SE_RPM, paste0(wrdir, "/RPM_SE.txt"), sep = "\t") ``` Optionally, an RPM-based filtering step could be applied at this point to remove lowly expressed genes, for example. ```{r} filt1 <- rep(TRUE, 118) # we will not filter in this example, so the filter value for each gene is set to TRUE. ``` Replicate mean T0 RPM values are then used to normalize gene level data. ```{r} mT0norm <- data.frame(row.names = rownames(RPMs)[filt1]) for(g in treatment){ mean_T0reps <- rowMeans(RPMs[filt1, cols(df=RPMs, patterns=c(g, "00"))]) for(r in reps){ df <- RPMs[filt1, colnames(RPMs)[cols(df=RPMs, patterns=c(g, r))]] df <- df[, 1:length(tdecay)]/mean_T0reps mT0norm <- cbind(mT0norm, df) }} write.table(x = mT0norm, file = paste0(wrdir, "/T0 normalized.txt"), sep = "\t") ``` The mean and standard error of the T0 normalized data are then calculated for replicate samples. ```{r} mean_mT0norm <- data.frame(row.names = rownames(mT0norm)) for(g in treatment){ for(t in tdecay){ mean_mT0norm <- cbind(mean_mT0norm, rowMeans(mT0norm[, cols(df=mT0norm, patterns=c(g, t))])) names(mean_mT0norm)[length(names(mean_mT0norm))] <- paste0(g, "_", t) }} SE_mT0norm <- data.frame(row.names = rownames(mT0norm)) for(g in treatment){ for(t in tdecay){ SE_mT0norm <- cbind(SE_mT0norm, apply(X = mT0norm[, cols(df=mT0norm, patterns=c(g, t))], MARGIN = 1, FUN = function(x) stats::sd(x)/sqrt(length(reps)))) names(SE_mT0norm)[length(names(SE_mT0norm))] <- paste0(g, "_", t) }} # write output to file write.table(x = mean_mT0norm, file = paste0(wrdir, "/T0 normalized_Mean.txt"), sep = "\t") write.table(x = SE_mT0norm, file = paste0(wrdir, "/T0 normalized_SE.txt"), sep = "\t") ``` ## 2. DECAY FACTOR CORRECTION After inhibition of RNA synthesis, RNA degradation continues causing the total pool of RNA in cells to decrease. As this occurs, very stable RNAs become a much larger proportion of the total RNA pool even though their concentration in the cell remains nearly constant. RNA-seq RPM data is scaled to a normalized library size (i.e. to the total RNA pool), and, because of this, abundance of stable RNAs appears to increase. The apparent increase in abundance of stable genes is proportional to the reduction in the total pool of RNA, therefore, we can use it to apply a correction to the data. We call this decay factor correction. The decay factor is estimated based on the RPM increase (relative to the T0 samples) of a set of stable and abundant reference genes. We selected 30 genes with high abundance on which to calculate the decay factors for each sample. These were then used to correct abundance in each of the respective samples. A unique decay factor is calculated for each treatment at each time point. First, make a vector of 'geneID's of abundant and stable reference genes present in the data set. ```{r} stablegenes <- c( "ATCG00490", "ATCG00680", "ATMG00280", "ATCG00580", "ATCG00140", "AT4G38970", "AT2G07671", "ATCG00570", "ATMG00730", "AT2G07727", "AT2G07687", "ATMG00160" ,"AT3G11630", "ATMG00060", "ATCG00600", "ATMG00220", "ATMG01170", "ATMG00410", "AT1G78900", "AT3G55440", "ATMG01320", "AT2G21170" ,"AT5G08670", "AT5G53300", "ATMG00070", "AT1G26630", "AT5G48300", "AT2G33040", "AT5G08690", "AT1G57720") ``` Mean T0 normalized values of stable genes are then pulled out of the `mean_mT0norm` data frame and used to calculate the decay factor for each set of replicate samples. ```{r} stabletable <- mean_mT0norm[stablegenes, ] normFactors <- colMeans(stabletable) write.table(x <- normFactors, paste0(wrdir, "/Normalziation Decay Factors.txt"), sep = "\t") normFactors_mean <- matrix(normFactors, nrow = length(tdecay)) normFactors_SE <- matrix(apply(X = stabletable, MARGIN = 2, function(x) stats::sd(x)/sqrt(length(stablegenes))), nrow = length(tdecay)) t.decay <- c(0, 7.5, 15, 30, 60, 120, 240, 480) rownames(normFactors_mean) <- t.decay rownames(normFactors_SE) <- t.decay colnames(normFactors_mean) <- treatment colnames(normFactors_SE) <- treatment list(normalizationFactors = normFactors_mean, SE = normFactors_SE) ``` Generate a matrix of the same dimensions as `mT0norm` with the appropriate decay factors in the same respective positions. ```{r} nF <- vector() ind <- sapply(names(normFactors), function(x) grep(x, colnames(mT0norm))) for(i in 1:ncol(ind)){ nF[ind[, i]] <- colnames(ind)[i] } normFactorsM <- t(matrix(rep(normFactors[nF], nrow(mT0norm)), ncol = nrow(mT0norm))) rm(nF, ind) ``` Apply the decay factor corrections. ```{r} mT0norm_2 <- data.frame(mT0norm/normFactorsM, row.names = rownames(mT0norm)) write.table(mT0norm_2, paste0(wrdir, "/T0 normalized and decay factor corrected.txt"), sep = "\t") ``` Rearrange the RNA decay data into a long form data frame for modeling and visualization. ```{r} mT0norm_2.1 <- reshape2::melt(as.matrix(mT0norm_2), varnames = c("geneID", "variable")) mT0norm_2.1 <- cbind(mT0norm_2.1, reshape2::colsplit(mT0norm_2.1$variable, "_", names = c("treatment", "t.decay", "rep"))) mT0norm_2.1 <- mT0norm_2.1[, colnames(mT0norm_2.1) != "variable"] mT0norm_2.1 <- mT0norm_2.1[, c(1, 3, 4, 5, 2)] colnames(mT0norm_2.1) <- c("geneID", "treatment", "t.decay", "rep", "value") mT0norm_2.1$rep <- gsub("r", "rep", mT0norm_2.1$rep) mT0norm_2.1$t.decay <- as.numeric(gsub("7", "7.5", as.numeric(mT0norm_2.1$t.decay))) mT0norm_2.1$treatment <- factor(mT0norm_2.1$treatment,levels = c("WT","sov","vcs","vs")) mT0norm_2.1$rep <- factor(mT0norm_2.1$rep, levels = paste0("rep",1:4)) write.table(x = mT0norm_2.1, file = paste0(wrdir, "/ExampleDecayData+stableGenes.txt"), sep = "\t") ``` Following the steps in this vignette, the resulting normalized data frame should be identical to the one supplied in this package called decay_data. To check: ```{r} table(RNAdecay::decay_data[,1:4] == mT0norm_2.1[,1:4]) # should be all TRUE no FALSE ``` The normalized data is now ready to be used for modeling decay rates. # II. MODELING DECAY RATES RNA degradation is typically modeled with a constant exponential decay model. This model assumes a constant decay rate throughout the decay time course. However, a number of issues might violate this assumption. For example, decay might depend on continuous transcription of rapidly turned over components of decay machinery; inhibition of this transcription would cause slowed decay due to lost supply of decay components. Alternatively, decay rate might be regulated (e.g., diurnally) leading to a slow change in decay rate over a long decay time course. Feedback due to slowed transcription could also lead to slowed RNA decay. We, therefore, apply both a constant exponential decay model and a time-dependent exponential decay model. However, besides these possibilies, other distinct mechanisms could lead to an apparent slowing of RNA decay. For example, because RNA-seq reads are pooled from multiple cell types and mapped to genes, distinct gene mRNA subpopulations might have different decay rates (i.e., different splice isoforms, or the same mRNA in different cell types of a multicellular organism) that are are averaged when these are pooled together. We believe that for these the time-dependent exponential decay model will also better capture this average than a constant exponential decay model. The dynamic nature of RNA decay can be a point of gene regulation. To identify treatments that might affect decay rate, we model all possibilities of treatment effect on both [initital] decay rates ($\alpha$) and decay of decay rates ($\beta$) using maximum likelihood modeling. This is done by running the modeling with constraints on treatment decay rates (i.e., constraining decay rates of treatments to be equal or allowing them to combinatorially vary in independent groups). This can be done with up to four treatments in this package. Modeling is performed one gene at a time with each set of constraints, and then each set of parameter estimates (from each set of contraints) are compared using the AICc statistic. In this package, we refer to these as equivalence constraint groupings. For the detailed mathematical framework of the decaying decay model see [Sorenson et. al. (2018)](http://www.pnas.org/content/early/2018/01/30/1712312115.short). Below we describe the steps of the algorithmic implementation and walk though an example data set. ## 3. LOAD NORMALIZED AND CORRECTED DATA AND CHECK THE FORMAT Load normalized and corrected data and check the format: the following script requires a data frame with five columns with the following column names exactly: "geneID", "treatment", "t.decay", "rep", "value": * geneID = gene identifiers (`factor`) * treatment = experimental treatment or genotype (`factor`) * t.decay = time after transcriptional inhibition (`numeric`) * rep = biological replicate number (`factor`) * value = RNA abundance normalized to library size, decay factor and mean T0 values (`numeric`) ```{r} library(RNAdecay) decay_data <- RNAdecay::decay_data # built-in example data # make new directory for results wrdir <- paste0(tempdir(),"/DecayAnalysis/Example analysis results 2") if(!file.exists(wrdir)) dir.create(wrdir, recursive = TRUE) ``` Rename factor levels and specify level order of `decay_data$treatment` for sorting and making figures. The example data set is already ordered, but when data is read from a text file (e.g., .csv) factor levels are ordered automatically and should be reordered according to user preference now to avoid later headaches. ```{r} levels(decay_data$treatment) # This can not be longer than 4 elements for modeling. # reorder factor levels if necessary (skipped here): # decay_data$treatment <- factor(decay_data$treatment, levels = levels(decay_data$treatment)[c(4, 1, 2, 3)]) # numbers here refer to the current order position in the factor levels() - the order of the numbers designates the new position levels(decay_data$treatment)[4] <- "vcs.sov" levels(decay_data$treatment) ``` Sort the `decay_data` data frame as follows (*NOTE: REQUIRED step for proper indexing below*). ```{r} decay_data <- decay_data[order(decay_data$t.decay), ] decay_data <- decay_data[order(decay_data$rep), ] decay_data <- decay_data[order(as.numeric(decay_data$treatment)), ] decay_data <- decay_data[order(decay_data$geneID), ] ids <- as.character(unique(decay_data$geneID)) # 118 in example set decay_data[1:10, ] ``` ## 4. GENERATE MATRICES OF $\alpha$ AND $\beta$ EQUIVALENCE CONSTRAINT GROUPS Create objects and set varables used in the modeling based on `decay_data`. ```{r} # (no changes needed here - just execute the code) nEquivGrp <- if (length(unique(decay_data$treatment)) == 2) {2} else if (length(unique(decay_data$treatment)) == 3) {5} else if (length(unique(decay_data$treatment)) == 4) {15} genoSet <- 1:(length(unique(decay_data$rep)) * length(unique(decay_data$t.decay))) nTreat <- length(unique(decay_data$treatment)) nSet <- length(genoSet)*nTreat groups <- groupings(decay_data) mods <- data.frame( "a" = as.vector(sapply(1:nEquivGrp, function(x) {rep(x, nEquivGrp + 1)})), "b" = rep(1:(nEquivGrp + 1), nEquivGrp), row.names = paste0("mod", 1:(nEquivGrp*(nEquivGrp+1))) ) ``` Create a colormap of model groups. ```{r} group_map(decaydata = decay_data, path = paste0(wrdir, "/Model grouping colormap.pdf"), nEquivGrp = nEquivGrp, groups = groups, mods = mods) ``` This file ("Model grouping colormap.pdf") should now be in your working directory. It is a color map reference for understanding model constraints. For example, model 1 has all different colored boxes representing unconstrained $\alpha$s and unconstrained betas, whereas, the second to last model has only two box colors - one for all $\alpha$s indicating that they all have constrained equivalence and the other indicating that all $\beta$s also have constrained equivalence. Constant decay models (i.e., $\beta$s = 0) are indicated with gray color in the $\beta$ columns. ## 5. MODEL OPTIMIZATION Determine the bounds for $\alpha$ and $\beta$ parameters. The bounds on $\alpha$ and $\beta$ are related to distinguishing constant decay, and decaying decay as described in the modeling supplement of [Sorenson et al., 2018](http://www.pnas.org/content/early/2018/01/30/1712312115.short). _Care should be taken in determining these bounds given each experimental design._ We have provided functions to aid in selection of the lower bounds of $\alpha$ and $\beta$ (`a_low` and `b_low`, respectively), and upper bound for $\alpha$ (`a_high`). These functions calculate limits based on the time points data were collected. For example, if abundance for an unstable mRNA is measured at the first time point to be 0.02% of the initial (T0) abundance, the information needed to estimate this decay rate was not collected. So, `a_high` caluclates the maximum estimatable decay rate based on the time at which the first decay data point was collected. Similarly, the lower bound for $\beta$ is required to distinguish the constant decay model (`const_decay`) from the decaying decay model (`decaying_decay`). The upper bound for $\beta$ is required to distinguish the no decay model (`const_decay(0,t)`) from decaying decay model. If $\beta$ is too small 1-exp(-$\beta$\*t) ~ -$\beta$\*t and c(t) ~ exp(-$\alpha$\*t); therefore, we can't distinguish between constant decay and decaying decay models. If $\beta$ is too big 1-exp(-$\beta$\*t) ~ 1 and c(t) ~ exp(-$\alpha$/$\beta$), a constant, we can't distinguish between no decay and decaying decay models. Refer to [Sorenson et al. (2018)](http://www.pnas.org/content/early/2018/01/30/1712312115.short) supplemental material for more detail about how we calculated the bounds for $\alpha$ and $\beta$. ```{r} a_bounds <- c(a_low(max(decay_data$t.decay)), a_high(min(unique(decay_data$t.decay)[unique(decay_data$t.decay)>0]))) b_bounds <- c(b_low(max(decay_data$t.decay)), 0.075) a_bounds;b_bounds ``` Modeling is accomplished using the `mod_optimization` function. The function takes as arguments, the gene identifier, $\alpha$ and $\beta$ bounds, the `decay_data` data frame, a vector of model names to run optimization for (e.g., as `c("mod1", "mod239", ... )`, the equivalence constraint matrix (defined as `groups` above), and a matrix to specify which contstraint groupings to use for $\alpha$ and $\beta$ parameters (defined as `mods` above), respectively, for each model, and a results folder (which will be made if it doesn't already exist) to which the results are written. A number of other arguments are available as well, see help file for details using `?mod_optimization`. Efficient model parameter optimization is accomplished using objective functions coded as binary dynamically linked shared libraries. These libraries are compiled from functions coded in the C++ language using functionality of the `TMB` package. The compiled files are dynamically linked library (\*.dll, for Windows) or shared object (\*.so, linux and Mac) files for each of the objective functions. If RNAdecay is installed from source, compiling C++ code will require a compiler be installed separatedly on your system (e.g., Rtools34 or later for Windows, https://cran.r-project.org/bin/windows/Rtools/; Xcode comand line tools for Mac, http://railsapps.github.io/xcode-command-line-tools.html; R development package for Linux, r-base-dev). If you don't already have one of these installed, install one and restart R. C++ source files are located in the installed 'RNAdecay/src' folder. The compiled (shared library) files should also be written to this same folder upon installation. Test the modeling on a couple of genes on a handful of models. ```{r} mod_optimization("AT2G18150", decay_data, group = groups, mod = mods, a_bounds, b_bounds, c("mod1", "mod2", "mod16", "mod239", "mod240"), file_only = FALSE, path = wrdir) mod_optimization("AT4G09680", decay_data, group = groups, mod = mods, a_bounds, b_bounds, c("mod1", "mod2", "mod16", "mod239", "mod240"), file_only = FALSE, path = wrdir) ``` Next, run all the models on each and every gene. This requires significant computational resources depending on data set size, number of models, and computing speed. The sample data set has 8 time points x 4 replicates each x 4 treatments (= 128 samples) for 118 genes and requires about 10 h processor time. 20000 genes at ~5 min each is ~70 d processor time, so be sure to test a few different models on a few handfuls of genes to feel comfortable running all the modeling. We recommend parallel processing (on multiple processor cores) using `parallel::mclapply` to cut overall time. If this function is unavailable on your machine (e.g., on _Windows_) you can use lapply, but it will take much longer. `mod_optimization()` writes results to file in tab-delimited form, and, optionally, can also return them as a data frame object. ```{r} ####### To run all genes in parallel use: # parallel::mclapply(ids, FUN = mod_optimization, # data = decay_data, group = groups, mod = mods, # alpha_bounds = a_bounds, beta_bounds = b_bounds, # models = paste0("mod", 1:240), # path = paste0(wrdir, "/modeling_results"), # mc.cores = getOption("mc.cores", 15L), # set the number of compute cores to use here (e.g., 9L = 9 cores, 11L = 11 cores) # mc.preschedule = TRUE, # mc.set.seed = TRUE, # mc.silent = FALSE, # mc.cleanup = TRUE, # mc.allow.recursive = TRUE) ``` The entire example data set takes ~ 10 h processor time with lapply so for this example we will randomly select one gene ID to run here. ```{r} test_ids <- sample(ids, 1) # NOTE: that everytime this line is run it generates a different random sampling, therefore the gene modeled below will be different each time this code is run. To test the exact gene shown in the vignette make a new character vector of the gene id reported below and pass it to the gene argument of mod_optimization using lapply instead of passing 'test_ids' as we do here. test_ids a <- proc.time()[3] models <- lapply(X = test_ids, # alternatively use `ids` here to complete the entire sample data set, but be prepared to wait ~ 10 h. These gene IDs will get passed one at time to the "gene" argument of mod_optimization() and return a list of the results data frame. FUN = mod_optimization, data = decay_data, group = groups, mod = mods, alpha_bounds = a_bounds, beta_bounds = b_bounds, models = rownames(mods), path = paste0(wrdir, "/modeling_results"), file_only = FALSE) names(models) <- test_ids b <- proc.time()[3] (b-a)/60/length(test_ids) # gives you average min per gene ``` For each gene, read in the results data frame written by `mod_optimization` as an element of a single list object. ```{r} models <- lapply( paste0( wrdir, "/modeling_results/", test_ids, "_results.txt"), read.delim, header = TRUE ) names(models) <- test_ids ``` ## 6. MODEL SELECTION The AICc statistic is used to compare model performance. Better models have lower AICc values. Select a single model for each gene based on the lowest AICc statistic. We will continue here with the `RNAdecay::models` data included in the package which uses all 118 genes from the sample data set `RNAdecay::decay_data`. ```{r} models <- RNAdecay::models # built-in example data, comment this line out to continue with your own modelling output results <- t(sapply(models, function(x) x[x[, "AICc"] == min(x[, "AICc"]), ])) results <- as.data.frame(results) results[, 1:2] <- sapply(as.data.frame(results[, 1:2]), function(x) as.character(unlist(x))) results[, -c(1,2)] <- sapply(results[, -c(1,2)], unlist) write.table(results, file = paste0(wrdir,"/best model results.txt"), sep = "\t") results <- read.delim(paste0(wrdir,"/best model results.txt")) ``` Generate some graphics to visualize results. ```{r} library(ggplot2) pdf(paste0(wrdir,"/distributions of stats.pdf")) p <- ggplot(results) print(p+geom_histogram(aes(x = sigma2), bins = 300)+ coord_cartesian(xlim = c(0,0.5))+ geom_vline(xintercept = 0.0625,color = "red2")+ plain_theme()) print(p+stat_bin(aes(x = sigma2), breaks = c(seq(0,0.25,0.25/50),1), geom = "bar")+coord_cartesian(xlim = c(0,0.25))) print(p+stat_ecdf(aes(sigma2), geom = "line")+ coord_cartesian(xlim = c(0,0.5))) print(p+stat_ecdf(aes(sqrt(sigma2)), geom = "line")+ coord_cartesian(xlim = c(0,sqrt(0.5)))) print(p+geom_histogram(aes(x = range.LL), bins = 60)) print(p+geom_histogram(aes(x = nUnique.LL), bins = 60)) dev.off() pdf(paste0(wrdir,"/lowest AICc model counts.pdf"), height = 8, width = 32) p <- ggplot(data = data.frame( model = as.integer(gsub("mod","",names(table(results$mod)))), counts = as.integer(table(results$mod))))+ geom_bar(aes(x = model, y = counts), stat = "identity")+ scale_x_continuous(limits = c(0,nrow(mods)), breaks = seq(0,nrow(mods),5))+ ggtitle("model distribution of absolute lowest AICs") print(p+plain_theme(25)) dev.off() ``` Because two models are considered to perform differently if their AICc value difference is >2, any models that have AICc values within two of the model with the lowest AICc are considered to have performed similarly. The number of models performing similarly to the best model are plotted as a histogram here. The number of different alpha groupings represented in these models is also plotted. ```{r} min_mods <- sapply(models, function(x) which (x[, "AICc"] < (2+min(x[, "AICc"])))) min_alpha_mods <- lapply(min_mods, function(x) unique(mods[x, "a"])) pdf(paste0(wrdir,"/number of models that performed similar to the one selected.pdf")) barplot(height = table(sapply(min_mods, length)), xlab = "No. models in the lowest AICc group (not more than 2 different from lowest)", ylab = "No. genes") barplot(height = table(sapply(min_alpha_mods, length)), xlab = "No. alpha groups in the lowest AICc group (not more than 2 different from lowest)", ylab = "No. genes") dev.off() ``` Build the modeling results data frame. Group genes with similar $\alpha$ groupings and then group genes with similar $\beta$ groupings. ```{r} results <- read.delim(paste0(wrdir,"/best model results.txt")) results$alpha_grp <- mods[as.character(results$mod), "a"] results$beta_grp <- mods[as.character(results$mod), "b"] results$mod <- as.numeric(gsub("mod", "", as.character(results$mod))) results$alphaPattern <- sapply(rownames(results), function(x) { paste0(gsub("alpha_", "", colnames(results)[3:(2+nTreat)][order(round(results[x, 3:(2+nTreat)], 4))]), collapse = "<=") }) results$alphaPattern <- paste0(results$alpha_grp, "_", results$alphaPattern) results$betaPattern <- sapply(rownames(results), function(x){ paste0(gsub("beta_", "", colnames(results)[(3+nTreat):(2+2*nTreat)][order(round(results[x, (3+nTreat):(2+2*nTreat)], 4))]), collapse = "<=") }) results$betaPattern <- paste0(results$beta_grp, "_", results$betaPattern) results <- results[order(rownames(results)), ] results <- results[order(results$beta_grp), ] results <- results[order(results$alphaPattern), ] results <- results[order(results$alpha_grp), ] results$alphaPattern <- factor(results$alphaPattern, levels = as.character(unique(results$alphaPattern))) results <- data.frame(results[, 3:(2*nTreat+3), 2], results[, c("AICc", "alpha_grp", "beta_grp", "alphaPattern", "betaPattern")]) results$nEqMods <- sapply(min_mods[rownames(results)], length) results$nEqAgp <- sapply(min_alpha_mods[rownames(results)], length) # Customize: add columns of relative alphas and betas as desired, e.g.: results$rA_sov.WT <- results$alpha_sov / results$alpha_WT results$rA_vcs.WT <- results$alpha_vcs / results$alpha_WT results$rA_vcssov.WT <- results$alpha_vcs.sov / results$alpha_WT write.table(results, paste0(wrdir,"/alphas+betas+mods+grps+patterns+relABs.txt"), sep = "\t") results <- read.delim(paste0(wrdir,"/alphas+betas+mods+grps+patterns+relABs.txt"), header = TRUE, colClasses = c(NA, rep("numeric", 2+2*nTreat), rep("integer", 2), rep("character", 2), rep("integer", 2), rep("numeric", 3))) # results$alpha_subgroup <- factor(results$alpha_subgroup, levels = unique(results$alpha_subgroup)) results$alphaPattern <- factor(results$alphaPattern, levels = unique(results$alphaPattern)) results$betaPattern <- factor(results$betaPattern, levels = unique(results$betaPattern)) results[1:3, ] ``` All files were written to the `wrdir`. The modeling results are now ready to be visualized. # III. DATA VISULIZATION ## 7. LOAD DECAY DATA AND MODELING RESULTS (AND, OPTIONALLY, GENE DESCRIPTIONS) Load normalized read data and reorder factor levels of "treatment" as you would like them to appear in the plot key and in the order you want them to plot. ```{r} library(RNAdecay) decay_data <- RNAdecay::decay_data # built-in package example data; comment this line out to use your own decay_data decay_data$treatment <- factor(decay_data$treatment, levels = c("WT", "sov", "vcs", "vs")) # you must type them identically in the new order levels(decay_data$treatment)[4] <- "vcs.sov" decay_data[1:4,] ``` Load modeling results. ```{r} results <- RNAdecay::results # built-in package example data; comment this line out to use your own results # For example: # results <- read.delim(paste0(tempdir(),"/DecayAnalysis/Example analysis results 2/alphas+betas+mods+grps+patterns+relABs.txt"), header = TRUE, colClasses = c(NA, rep("numeric", 9), rep("integer", 3), rep("character", 3), rep("numeric", 6))) # results$alpha_subgroup <- factor(results$alpha_subgroup, levels = unique(results$alpha_subgroup)) results[1:3, ] ``` Optionally, for printing gene descriptions directly on the plot, load a gene description file and then generate a character vector of descriptions with elements named with geneIDs (e.g., for _Arabidopsis thaliana_, download and unzip https://www.arabidopsis.org/download_files/Genes/TAIR10_genome_release/gene_description_20131231.txt.gz, and place it in your working directory). ```{r} # gene_desc <- read.delim("gene_description_20140101.txt"), quote = NULL, comment = '', header = FALSE) # gene_desc[, 1] <- substr(gene_desc[, 1], 1, 9) # gene_desc <- data.frame(gene_desc[!duplicated(gene_desc[, 1]), ], row.names = 1) # colnames(gene_desc) <- c("type", "short description", "long description", "computational description") # descriptions <- gene_desc[, "long description"] # names(descriptions) <- rownames(gene_desc) descriptions <- c( "Encodes a ubiquitin E3 ligase with RING and SPX domains that is involved in mediating immune responses and mediates degradation of PHT1s at plasma membranes. Targeted by MIR827. Ubiquitinates PHT1;3, PHT1;2, PHT1;1/AtPT1 and PHT1;4/AtPT2.", "", "Related to Cys2/His2-type zinc-finger proteins found in higher plants. Compensated for a subset of calcineurin deficiency in yeast. Salt tolerance produced by ZAT10 appeared to be partially dependent on ENA1/PMR2, a P-type ATPase required for Li+ and Na+ efflux in yeast. The protein is localized to the nucleus, acts as a transcriptional repressor and is responsive to chitin oligomers. Also involved in response to photooxidative stress.", "Encodes a stress enhanced protein that localizes to the thylakoid membrane and whose mRNA is upregulated in response to high light intensity. It may be involved in chlorophyll binding." ) names(descriptions) <- c("AT1G02860","AT5G54730", "AT1G27730", "AT4G34190") ``` ## 8. PRINT DECAY PLOTS TO PDF Plot a single gene using `decay_plot`. If you do not have a gene desription file, do not include "Desc" in the `what` argument (see `?decay_plot`). ```{r, fig.show = 'hold'} p <- decay_plot(geneID = "AT1G02860", xlim = c(0, 350), ylim = c(0, 1.25), xticks = 1:5*100, yticks = 0:5/4, alphaSZ = 12, what = c("meanSE", "reps", "models"), treatments = c("WT", "vcs.sov"), colors = c("darkblue", "red3"), DATA = decay_data, mod.results = results, gdesc = descriptions) # print(p+plain_theme(8, 1, leg.pos = c(0.7, 0.8))) #this will print the plot to your current graphics device (dev.cur() tells you what that is), if you do not have a graphics device open (e.g., "null device") initiate one (e.g., use quartz(),pdf(), or windows(); dev.off() closes the device and writes the file). ``` Plot one or multiple genes; write them to a pdf file. ```{r} # make new directory for results wrdir <- paste0(tempdir(),"/DecayAnalysis/Example analysis results 3") if(!file.exists(wrdir)) dir.create(wrdir, recursive = TRUE) pdf(paste0(wrdir, "/decay plot example.pdf"), width = 10, height = 8) p <- decay_plot(geneID = "AT1G02860", xlim = c(0, 500), ylim = c(0, 1.25), xticks = 1:5*100, yticks = 0:5/4, alphaSZ = 10, what = c("Desc","meanSE", "reps", "models","alphas&betas"), treatments = c("WT", "vcs.sov"), colors = c("darkblue", "red3"), DATA = decay_data, mod.results = results, gdesc = descriptions) print(p+plain_theme(32, 1, leg.pos = 'right')) dev.off() # plot multiple genes ids <- c("AT5G54730", "AT1G27730", "AT4G34190") # or e.g., # ids <- rownames(results[results$alpha_WT > 0.1, ]) pdf(paste0(wrdir, "/multiple RNA decay plots.pdf"), width = 10, height = 7) for(i in ids){ p <- decay_plot(i, what = c("meanSE", "models", "Desc"), mod.results = results, gdesc = descriptions, treatments = c("WT","sov","vcs","vcs.sov"), DATA = decay_data) print(p+plain_theme(13.8, 1, leg.pos = 'right')) cat(i, " plotted.\n") } dev.off() ``` Plot one or multiple genes using hl_plot; write them to a pdf file. ```{r} pdf(paste0(wrdir, "/halflife plot example.pdf"), width = 10, height = 8) p <- hl_plot(geneID = "AT1G02860", gene_symbol = "SYG1", df_decay_rates = RNAdecay::results, hl_dist_treatment = "WT", hl_treatment = c("WT","sov","vcs","vcs.sov"), arrow_lab_loc = "key" ) print(p) dev.off() # plot multiple genes ids <- c("AT5G54730", "AT1G27730", "AT4G34190") # or e.g., # ids <- rownames(results[results$alpha_WT > 0.1, ]) pdf(paste0(wrdir, "/multiple halflife plots.pdf"), width = 4, height = 4) for(i in ids) { p <- hl_plot(geneID = i, df_decay_rates = RNAdecay::results, hl_dist_treatment = "WT", hl_treatment = c("WT","sov","vcs","vcs.sov"), arrow_lab_loc = "plot" ) print(p+plain_theme(8, 1,leg.pos = 'right')) cat(i, " plotted.\n") } dev.off() ```