\documentclass[a4paper]{article} % \VignetteIndexEntry{GRENITS} \title{GRENITS: \\ Gene Regulatory Network Inference \\ Using Time Series} \author{Edward R. Morrissey\\ Systems Biology Doctoral Training Centre\\ University of Warwick } \date{\today} \begin{document} \setkeys{Gin}{width=1\textwidth} @ % Sweave("GRENITS/inst/doc/GRENITS_package.Rnw"); tools::texi2dvi("GRENITS_package.tex", pdf=TRUE) \maketitle \section*{Overview} GRENITS offers four network inference statistical models using Dynamic Bayesian Networks and Gibbs Variable Selection. A linear interaction model, two linear interaction models with added experimental noise (Gaussian and Student distributed) and a non-linear interaction model (\cite{Morrissey10} and \cite{Morrissey11}). The package is intended to be used by both users with a background in Bayesian Inference, as well as casual users. To this end, prior parameters and MCMC (Markov chain Monte Carlo) parameters have been set by default to values that in our experience are adequate for a large range of data sets. As well as this, some basic diagnostic and analysis plots are provided. The example contained in this document shows how to run the Linear Network model, as well as how to analyse the output and how to make an explicit network prediction. <>= library(GRENITS) @ \section*{Example: Network Inference For Simulated Data} \subsection*{Simulated Data} The data we will use for this example is data generated by an Ordinary Differential Equation (ODE) model of \textit{A. thaliana}'s circadian clock \cite{Locke}. The model was deterministically simulated using COPASI \cite{copasi}. The data was subsampled to produce hourly time spacing and the log was taken. <>= data(Athaliana_ODE) dim(Athaliana_ODE) @ The data matrix has genes in rows (5) and time points in columns (50) (fig: \ref{fig:LockeData}). \begin{figure}[!h] \centering <>= # Load A. thaliana circadian clock ODE generated data plot.ts( t(Athaliana_ODE), plot.type = "single", col = 1:5, xlim = c(0,65), main = "Circadian Clock Network \n ODE simulated data", xlab = "Time (h)", ylab = "Expression") legend("topright", rownames(Athaliana_ODE), lty = 1, col = 1:5) @MCMC \caption{\textit{A. thaliana} Circadian Clock. Simulated data from ODE model \label{fig:LockeData}} \end{figure} \subsection*{Inference} First we run the MCMC function <>= output.folder <- paste(tempdir(), "/Example_LinearNet", sep="") LinearNet(output.folder, Athaliana_ODE) @ This will run two MCMC chains with the default parameters. For non default parameter values, a parameter vector can be provided (See ?mcmc.defaultParams\_Linear). The output folder contains the raw output from the MCMC run (chain1 and chain2), gene names (geneNames.txt) and the values of the parameters used (runInfo.txt). Next we analyse the output. <>= # Analyse raw results, place analysis plots and files in output.folder analyse.output(output.folder) @ The results of the analysis are placed in the run folder. <>= dir(output.folder) @ The Convergence file (ConvergencePlots.pdf) contains plots associated to the adequate convergence of the MCMC. This is crucial for further analysis as, if convergence has not been reached, the results are not trustworthy. The most important parameters to check are the connection probabilities (fig \ref{fig:gamma-gamma}). A certain amount of "jitter" is acceptable. If any link has a probability difference larger than 0.2 a warning will be issued (both on screen and as a file) and it would be advisable to re-run the MCMC function with a larger number of iterations (See ?mcmc.defaultParams\_Linear). \begin{figure}[!h] \centering <>= chain1 <- read.chain(output.folder,1) chain2 <- read.chain(output.folder,2) gamma1 <- colMeans(chain1$gamma) gamma2 <- colMeans(chain2$gamma) plot(x = gamma1, y = gamma2, xlab = "chain1", ylab = "chain2", main = "Convergence plot for link probabilities", xlim = c(0,1), ylim = c(0,1), cex = 1.5, cex.lab = 1.6, cex.main = 1.6) lines(c(0,1), c(0,1), col = "red") @ \caption{Convergence plot \label{fig:gamma-gamma}} \end{figure} More formal convergence analysis can also be carried out by reading the chains (?read.chain) and using an MCMC convergence package such as "coda". Although in our experience with the parameters chosen, convergence is not too problematic. Normally monitoring the link probabilities (fig \ref{fig:gamma-gamma}) and if necessary running longer runs is sufficient to achieve convergence. To predict a network we first load the inferred network probabilities <>= prob.file <- paste(output.folder, "/NetworkProbability_Matrix.txt", sep = "") prob.mat <- read.table(prob.file) print(prob.mat) @ By applying a threshold (e.g 0.8) on the link probability (not a p-value!), we can make a network prediction <>= inferred.net <- 1*(prob.mat > 0.8) print(inferred.net) @ A plot of the inferred network can be seen in fig \ref{fig:graph} (right), whereas fig \ref{fig:graph} (left) shows how the link probabilities are distributed. The latter plot (found in the file "AnalysisPlots.pdf") is useful when deciding what threshold to use, especially if the links separate into two groups. \begin{figure}[!h] \centering <>= library(network) inferred.net <- network(inferred.net) par(mfrow = c(1,2), cex = 1.76, cex.lab = 1.3, cex.main = 1.4) # Plot cut off prob.vec <- sort(as.vector(as.matrix(prob.mat)), T) # Remove self interaction (last 5 elements) prob.vec <- prob.vec[4:0 - length(prob.vec)] plot(x = prob.vec, y = 1:length(prob.vec), xlim = c(0,1), main = "Connections included vs threshold", xlab = "Probability threshold", ylab = "Connections included") lines(c(0.8,0.8), c(0, 30), col = "red", lty = 2, lwd = 2) # Plot Network plot(inferred.net, label = network.vertex.names(inferred.net), main = "A. thaliana Inferred Network", mode = "circle", vertex.cex=7, arrowhead.cex = 2,vertex.col="green") @ \caption{Inferred network. Data used is Simulated data from ODE model \label{fig:graph}} \end{figure} For information on the sign of the interaction (activating/inhibiting) see NetworkProbability\_List.txt <>= prob.list.file <- paste(output.folder, "/NetworkProbability_List.txt", sep = "") prob.list <- read.table(prob.list.file, header = T) above.08 <- (prob.list[,3] > 0.8) print(prob.list[above.08,]) @ This file can be used for further analysis using for example cytoscape \cite{cytoscape}. \begin{thebibliography}{2011} \bibitem{Morrissey10} Morrissey, E.R., Juarez, M.A., Denby, K.J. and Burroughs, N.J. {\sl On reverse engineering of gene interaction networks using time course data with repeated measurements.} Bioinformatics 2010. \bibitem{Morrissey11} Morrissey, E.R., Juarez, M.A., Denby, K.J. and Burroughs, N.J. {\sl Inferring the time-invariant topology of a nonlinear sparse gene regulatory network using fully Bayesian spline autoregression} Biostatistics 2011. \bibitem{copasi} Hoops, S., Sahle, S., Gauges, R., Lee, C., Pahle, J., Simus, N., Singhal, M., Xu, L., Mendes, P. and Kummer, U. {\sl COPASI a COmplex PAthway SImulator.} Bioinformatics 2006. \bibitem{Locke} Locke, J.C.W., Kozma-Bognar, L., Gould, P.D., Feher, B., Kevei, E., Nagy, F., Turner, M.S., Hall, A. and Millar, A.J. {\sl Experimental validation of a predicted feedback loop in the multi-oscillator clock of Arabidopsis thaliana.} Molecular Systems Biology 2006. \bibitem{cytoscape} Shannon P, Markiel A, Ozier O, Baliga NS, Wang JT, Ramage D, Amin N, Schwikowski B and Ideker T. {\sl Cytoscape: a software environment for integrated models of biomolecular interaction networks.} Genome Research 2003. \end{thebibliography} \end{document}