Merge pull request #325 from laurenhsu1/master

Add monster wrapper function that takes as input 2 premade networks
netZoo · Aug 2, 2024 · e7439ca · e7439ca
2 parents 42b9f03 + 93cb19b
commit e7439ca
Show file tree

Hide file tree

Showing 5 changed files with 181 additions and 16 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -24,6 +24,7 @@ export(craneBipartite)
 export(craneUnipartite)
 export(createCondorObject)
 export(createPandaStyle)
+export(domonster)
 export(downloadGTEx)
 export(dragon)
 export(el2adj)

diff --git a/R/MONSTER.R b/R/MONSTER.R
@@ -118,7 +118,8 @@ monsterPrintMonsterAnalysis <- function(x, ...){
 #' design=c(rep(0,nGenes),rep(1, nGenes))
 #' monsterRes <- monster(expr, design, motif=NA, nullPerms=10, numMaxCores=1, mode='regNet')
 #' }
-
+#' # alternatively, if a gene regulatory network has already been estimated,
+#' # see the domonster function for quick start
 monster <- function(expr, 
                     design, 
                     motif, 
@@ -1136,3 +1137,73 @@ NULL
 #' @references Glass K, Huttenhower C, Quackenbush J, Yuan GC. Passing Messages Between Biological Networks to Refine Predicted Interactions. PLoS One. 2013 May 31;8(5):e64832.
 #' 
 NULL
+
+
+#' MONSTER quick-start with pre-made regulatory networks
+#'
+#' This function is a wrapper to simplify usage of the \code{monster} function in
+#' the case where the pair of regulatory networks to be contrasted have already
+#' been estimated, either as \code{panda} objects, or represented as an
+#' adjacency matrix with regulators in rows and genes in columns.
+#'
+#' @param exp_graph matrix or PANDA object (generated by \code{netzooR::panda}); if matrix, should be the adjacency matrix for the network with regulators in rows and genes in columns, both named. This should be the network for the experimental (case) group.
+#' @param control_graph matrix or PANDA object (generated by \code{netzooR::panda}); if matrix, should be the adjacency matrix for the network with regulators in rows and genes in columns, both named. This should be the network for the control (reference) group.
+#' @param nullPerms numeric; defaults to 1000. Number of null permutations to perform. See \code{monster} for more details.
+#' @param numMaxCores numeric; defaults to 3. Maximum number of cores to use; will be the minimum of this number and the actual available cores. See \code{monster} for more details.
+#' @param ... other arguments for \code{monster} may be passed.
+#'
+#' @return \code{monster} object
+#' @export
+#'
+#' @examples
+#' 
+#' \donttest{
+#'
+#' # Generating PANDA networks for demonstration:
+#' # For the purposes of this example, first partition the pandaToyData samples, then perform panda:
+#' pandaResult_exp <- panda(pandaToyData$motif, pandaToyData$expression[,1:25], pandaToyData$ppi)
+#' pandaResult_control <- panda(pandaToyData$motif, pandaToyData$expression[,26:50], pandaToyData$ppi)
+#'
+#' # function takes both panda objects and matrices, or a mixture
+#' monster_res1 <- domonster(pandaResult_exp, pandaResult_control)
+#' monster_res2 <- domonster(pandaResult_exp@regNet, pandaResult_control@regNet)
+#' monster_res3 <- domonster(pandaResult_exp@regNet, pandaResult_control)
+#' }
+domonster <- function(exp_graph, control_graph, nullPerms = 1000, numMaxCores = 3, ...){
+  if('panda' %in% class(exp_graph)){
+    exp_graph <- exp_graph@regNet
+  }
+  if('panda' %in% class(control_graph)){
+    control_graph <- control_graph@regNet
+  }
+
+  if(!identical(colnames(control_graph), colnames(exp_graph))){
+    genes_keep <- intersect(colnames(control_graph), colnames(exp_graph))
+
+    exp_graph <- exp_graph[,genes_keep]
+    control_graph <- control_graph[,genes_keep]
+
+    cat('\nNote: column names are not identical in the input; taking the intersection of them:\n')
+    cat(paste0(length(genes_keep), ' genes included \n'))
+  }
+  if(!identical(rownames(control_graph), rownames(exp_graph))){
+    reg_keep <- intersect(rownames(control_graph), rownames(exp_graph))
+
+    exp_graph <- exp_graph[reg_keep,]
+    control_graph <- control_graph[reg_keep,]
+
+    cat('\nNote: row names are not identical in the input; taking the intersection of them:\n')
+    cat(paste0(length(reg_keep), ' regulators included \n'))
+  }
+
+  combdf <- as.data.frame(cbind(control_graph, exp_graph))
+  combdes <- c(rep(0, ncol(control_graph)), rep(1, ncol(exp_graph)))
+  res <- monster(expr = combdf,
+                 design = combdes,
+                 motif = NA,
+                 mode = 'regNet',
+                 nullPerms = nullPerms,
+                 numMaxCores = numMaxCores,
+                 ...)
+  return(res)
+}
diff --git a/man/domonster.Rd b/man/domonster.Rd
diff --git a/man/monster.Rd b/man/monster.Rd
diff --git a/vignettes/MONSTER.Rmd b/vignettes/MONSTER.Rmd
@@ -29,7 +29,7 @@ MONSTER takes in sequence motif data linking transcription factors (TFs) to gene
 
 Install and load netZooR package
 ```{r,eval = FALSE}
-# install.packages("devtools") 
+# install.packages("devtools")
 library(devtools)
 # install netZooR pkg with vignettes, otherwise remove the "build_vignettes = TRUE" argument.
 devtools::install_github("netZoo/netZooR", build_vignettes = TRUE)
@@ -39,31 +39,34 @@ devtools::install_github("netZoo/netZooR", build_vignettes = TRUE)
 library(netZooR)
 ```
 
-## Input files
+
+## Full MONSTER tutorial in build network mode
+
+### Input files
 
 In this demo, we use the included Yeast dataset, containing three separate Yeast datasets, and a sequence motif object.  The `yeast` dataset includes three separate experimental designs each involving microarray assays of 2555 genes of yeast (Saccharomyces cerevisiae).  `yeast$exp.ko` is a set of 106 gene expression samples of following a number of gene knockouts.  `yeast$exp.cc` is a set of 50 gene expression samples taken sets of two across 25 timepoints spanning the cellular cycle. `yeast$exp.sr` is a set of 173 gene expression samples collected under conditions of stress such as heat shock.  Each expression dataset has been normalized and scaled.
 
-In this tutorial, we will run `monster` to identify suspected TF drivers of the change from the early cell cycle to late cell cycle. 
+In this tutorial, we will run `monster` to identify suspected TF drivers of the change from the early cell cycle to late cell cycle.
 
 First, we load the data included in the `netZooR` package
 
 ```{r, message=F}
 data(yeast)
 ```
 
-Next, we create our design vector indicating to which group each sample belongs.  This vector must contain only 0's and 1's (NAs allowed). 
+Next, we create our design vector indicating to which group each sample belongs.  This vector must contain only 0's and 1's (NAs allowed).
 
 In this example we are running the analysis on the first 10 timepoints compared to the last 10 timepoints, ignoring the middle 5 for the purposes of simplicity in this tutorial.  Each timepoint contains two samples.
 
 ```{r, message=F}
 design <- c(rep(0,20),rep(NA,10),rep(1,20))
 ```
 
-The main method in MONSTER is the `monster` function.  This function has three required arguments, 
+The main method in MONSTER is the `monster` function.  This function has three required arguments,
 
-* A gene expression matrix, `yeast$exp.cc` 
+* A gene expression matrix, `yeast$exp.cc`
 * A motif mapping data.frame, `yeast$motif`
-* A design integer vector, `design` 
+* A design integer vector, `design`
 
 The gene expression argument may be a `matrix`, a `data.frame` or an `ExpressionSet`.  In this example, `yeast$exp.cc` is a `data.frame` consisting of 2555 rows and 50 columns.
 
@@ -74,7 +77,7 @@ yeast$exp.cc[1:5,1:5]
 
 The motif mapping data.frame tells the MONSTER algorithm which genes contain likely transcription factor binding sites in the vicinity of their promoter. This serves as the regulatory prior and informs the initial network inference method by supplying a partial list of TF targeting.
 
-This data.frame contains 3 columns, where each row is a specific edge in the prior network.  
+This data.frame contains 3 columns, where each row is a specific edge in the prior network.
 
 * Column 1 specifies the transcription factor for the edge.
 * Column 2 specifies the targeted gene for the edge
@@ -92,7 +95,7 @@ MONSTER tests the statistical significance of its results by permuting the sampl
 
 Monster is optimized to run on multiple cores, if available.  We can specify the maximum number of cores which are available to run the algorithm.  If `numMaxCores` unspecified, MONSTER will check available resources and run on all but four of the available cores.
 
-## Running MONSTER
+### Running MONSTER
 
 ```{r, message=F}
 yeast$exp.cc[is.na(yeast$exp.cc)] <- mean(as.matrix(yeast$exp.cc),na.rm=TRUE)
@@ -124,7 +127,7 @@ We can view the $dTFI$ with the generic `plot` function
 monsterPlotMonsterAnalysis(monsterRes)
 ```
 
-In this plot, we have the $dTFI$ for each transcription factor along the x-axis.  The observed values are shown in red and the null values are shown in blue.  
+In this plot, we have the $dTFI$ for each transcription factor along the x-axis.  The observed values are shown in red and the null values are shown in blue.
 
 Due to the complex nature of the structure of gene regulatory networks, it is not uncommon to see transcription factors which exhibit a high degree of transitional change, but which is not statistically significant due to the high variability of that particular TF (e.g. YDR463W).  Conversely, some TFs show weak changes, but those changes are large compared to the changes observed in null transitions (e.g. YKL062W).  Ideally, we are most interested in TFs which demonstrate large changes in targetting pattern which is found to be strongly significant (e.g. YJL056C).
 
@@ -138,7 +141,7 @@ Our top hit here is YDL056W, which reassuringly is established in the literature
 
 ***
 
-The dTFI plot focuses primarily on individual transcription factors which have systematically changed their targetting patterns between groups. To dive further into the mechanisms, we may be specifically interested in which TFs are acquiring the targetting signatures of which other TFs. We can visualize the transition matrix in a number of ways using MONSTER.  
+The dTFI plot focuses primarily on individual transcription factors which have systematically changed their targetting patterns between groups. To dive further into the mechanisms, we may be specifically interested in which TFs are acquiring the targetting signatures of which other TFs. We can visualize the transition matrix in a number of ways using MONSTER.
 
 First, using the package `gplots`, we can simply plot the transition matrix. The `heatmap.2` function will show the $m\times m$ transition matrix in the form of a square heatmap with $m$ being the number of transcription factors.  Intuitively, this is the operator, $\textbf{T}$, on which we transform gene regulatory network $\textbf{A}$ (The first 10 timepoints in the Yeast Cell Cycle) to network $\textbf{B}$ (The last 10 timepoints in the Yeast Cell Cycle) via the equation
 $$\textbf{B}=\textbf{AT} + \textbf{E}$$
@@ -148,9 +151,9 @@ where $\textbf{E}$ is the $p\times m$ error matrix which we are minimizing.
 ```{r, fig.width=10, fig.height=8, out.width = 750, out.height = 600, message=FALSE}
 library(gplots)
 heatmap.2(slot(monsterRes, 'tm'), col = "bluered",
-density.info="none",  
-trace="none", 
-dendrogram='none',     
+density.info="none",
+trace="none",
+dendrogram='none',
 Rowv=FALSE,
 Colv=FALSE)
 ```
@@ -185,3 +188,47 @@ Furthermore, we are often interested in correlated targetting pattern sharing.
 ```
 
 The above plot can also be achieved using the included `MONSTER` function, `monster.transitionPCAPlot(monsterRes)`.
+
+## MONSTER quick-start with pre-made regulatory networks
+
+For analyses where MONSTER is to be performed after a regulatory network has already been estimated (e.g., with PANDA), the `domonster` function can be used to easily prepare the input networks for running `monster`.
+
+`domonster` takes as input 2 networks (`exp_graph` and `control_graph`), either as PANDA objects or adjacency matrices (regulators in rows; genes in columns).
+
+- `exp_graph` is the graph generated from experimental or case samples
+- `control_graph` is the graph generated from control or reference samples
+
+The transition matrix $\hat{\Tau}$ that the MONSTER algorithm estimates will be that for:
+
+$$G_{\mathrm{control}} \hat{\tau} = G_{\mathrm{exp}}$$
+
+where $G_{\mathrm{control}}$ and $G_{\mathrm{exp}}$ are the control and experimental graphs, respectively.
+
+To demonstrate the code set-up with a toy example, we start by creating toy datasets using `pandaToyData`. For the purposes of demonstration, we partition the samples into experimental and control:
+
+```{r, message = F}
+# Generating PANDA networks from partitioned toy data
+
+pandaResult_exp <- panda(pandaToyData$motif, pandaToyData$expression[,1:25], pandaToyData$ppi)
+pandaResult_control <- panda(pandaToyData$motif, pandaToyData$expression[,26:50], pandaToyData$ppi)
+```
+
+Then, these outputs generated by PANDA can be used as input either in PANDA format directly, or with the network. Any of the below code options will work and be equivalent to each other. In addition to PANDA networks, any regulatory network graph in adjacency matrix format (regulators in rows and genes in columns) can be used.
+
+```{r, message = F}
+monster_res1 <- domonster(exp_graph = pandaResult_exp,
+                          control_graph = pandaResult_control,
+                          nullPerms = 10)
+
+monster_res2 <- domonster(exp_graph = pandaResult_exp@regNet,
+                          control_graph = pandaResult_control@regNet,
+                          nullPerms = 10)
+
+monster_res3 <- domonster(exp_graph = pandaResult_exp@regNet,
+                          control_graph = pandaResult_control,
+                          nullPerms = 10)
+```
+
+For this code example, we compute only 10 null permutations. However, the function defaults to 1000, as would likely be used in an actual analysis to ensure sufficient null observations to estimate null distributions and compute empirical p-values.
+
+The results can then be further analyzed and visualized as described in the vignette above.