From 65cff73bb2bafc8f55088199f636acbf81aeab42 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Fri, 17 May 2024 14:19:11 -0400 Subject: [PATCH 01/17] Added BLOBFISH --- DESCRIPTION | 5 +- NAMESPACE | 2 + R/BLOBFISH.R | 508 +++++++++++++++++++++++++++ man/BuildSubnetwork.Rd | 39 ++ man/FindSignificantEdgesForHop.Rd | 38 ++ man/GenerateNullPANDADistribution.Rd | 35 ++ man/PlotNetwork.Rd | 53 +++ man/RunBLOBFISH.Rd | 43 +++ man/SignificantBreadthFirstSearch.Rd | 47 +++ tests/testthat/test-blobfish.R | 318 +++++++++++++++++ 10 files changed, 1086 insertions(+), 2 deletions(-) create mode 100644 R/BLOBFISH.R create mode 100644 man/BuildSubnetwork.Rd create mode 100644 man/FindSignificantEdgesForHop.Rd create mode 100644 man/GenerateNullPANDADistribution.Rd create mode 100644 man/PlotNetwork.Rd create mode 100644 man/RunBLOBFISH.Rd create mode 100644 man/SignificantBreadthFirstSearch.Rd create mode 100644 tests/testthat/test-blobfish.R diff --git a/DESCRIPTION b/DESCRIPTION index 636e0c37..8bfc4dd2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,6 +50,7 @@ Imports: GO.db, org.Hs.eg.db, Matrix, + matrixTests, gplots, nnet, data.table, @@ -75,7 +76,7 @@ Imports: GeneNet, loo, rARPACK, - corpcor + corpcor, License: GPL-3 Encoding: UTF-8 LazyData: false @@ -87,6 +88,6 @@ Suggests: dorothea VignetteEngine: knitr VignetteBuilder: knitr -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 BugReports: https://github.com/netZoo/netZooR/issues URL: https://github.com/netZoo/netZooR, https://netzoo.github.io/ diff --git a/NAMESPACE b/NAMESPACE index 2a9bd8c0..37882969 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,7 @@ # Generated by roxygen2: do not edit by hand +export(GenerateNullPANDADistribution) +export(RunBLOBFISH) export(adj2el) export(adj2regulon) export(alpaca) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R new file mode 100644 index 00000000..ce423a17 --- /dev/null +++ b/R/BLOBFISH.R @@ -0,0 +1,508 @@ +#' Given a set of genes of interest, full bipartite networks with scores (one network for each sample), a significance +#' cutoff for statistical testing, and a hop constraint, BLOBFISH finds a subnetwork of +#' significant edges connecting the genes. +#' @param geneSet A character vector of genes comprising the targets of interest. +#' @param networks A list of bipartite (PANDA-like) networks, where each network is a data frame with the following format: +#' tf,gene,score +#' @param alpha The significance cutoff for the statistical test. +#' @param hopConstraint The maximum number of hops to be considered between gene pairs. +#' Must be an even number. +#' @param nullDistribution The null distribution, specified as a vector of values. +#' @param verbose Whether or not to print detailed information about the run. +#' @param topX Select the X lowest significant p-values for each gene. NULL by default. +#' @returns A bipartite subnetwork in the same format as the original networks. +#' @export +RunBLOBFISH <- function(geneSet, networks, alpha, hopConstraint, nullDistribution, + verbose = FALSE, topX = NULL){ + + # Check for invalid inputs. + if(!is.character(geneSet) || !is.list(networks) || !is.numeric(alpha) || !is.numeric(hopConstraint)){ + stop(paste("Wrong input type! geneSet must be a character vector. networks must be a list.", + "alpha and hopConstraint must be scalar numeric values.")) + }else if(!is.data.frame(networks[[1]])){ + stop("Each network must be a data frame.") + }else if(ncol(networks[[1]]) != 3 || colnames(networks[[1]])[1] != "tf" || + colnames(networks[[1]])[2] != "gene" || colnames(networks[[1]])[3] != "score"){ + stop(paste("Each network must have transcription factors in the first column,", + "target genes in the second column, and scores in the third column.")) + }else if(alpha > 1 || alpha <=0){ + stop("alpha must be between 0 and 1, not including 0.") + }else if(hopConstraint < 2 || hopConstraint %% 2 != 0){ + stop("hopConstraint must be an even number of at least 2.") + }else if(!is.numeric(nullDistribution)){ + stop("nullDistribution must be numeric.") + } + + # Build the subnetwork. + subnetwork <- BuildSubnetwork(geneSet = geneSet, + networks = networks, + alpha = alpha, + hopConstraint = hopConstraint, + nullDistribution = nullDistribution, + verbose = verbose, topX = topX) + + # Return. + return(subnetwork) +} + +#' Generate a null distribution of edge scores for PANDA-like networks; that is, +#' the set of edges where (1) the TF does not have a binding motif in the gene region, +#' (2) the TF does not form a complex with any other TF that has a binding motif in +#' the gene region, and (3) the genes regulated by the TF are not coexpressed with the +#' gene in question. We obtain this by inputting an empty prior and an identity coexpression +#' matrix. +#' @param ngenes Number of genes to simulate +#' @param nTranscriptionFactors Number of transcription factors to simulate +#' @param sampSize Number of samples to simulate +#' @param numberOfPandas Number of null PANDA networks to generate +#' @export +GenerateNullPANDADistribution <- function(ngenes = 20000, nTranscriptionFactors = 600, sampSize = 20, + numberOfPandas = 10){ + + # Set the seed. + set.seed(1) + + # Generate a motif file where only TFs 1-3 are connected to genes. + motiffname <- paste0("tmpMotif.tsv") + motifs <- data.frame(tf = rep(paste0("tf", 1:nTranscriptionFactors), ngenes), + gene = unlist(lapply(1:ngenes, function(j){ + return(rep(paste0("gene", j), nTranscriptionFactors)) + })), score = rep(0, ngenes * nTranscriptionFactors)) + motifs[intersect(which(motifs$tf == "tf1"), which(motifs$gene == "gene1")), "score"] <- 1 + motifs[intersect(which(motifs$tf == "tf2"), which(motifs$gene == "gene2")), "score"] <- 1 + motifs[intersect(which(motifs$tf == "tf3"), which(motifs$gene == "gene3")), "score"] <- 1 + write.table(motifs, motiffname, quote = FALSE, sep = "\t", + row.names = FALSE, col.names = FALSE) + + # Generate a PPI where TFs 1 is connected to TFs 4-5. + ppifname <- paste0("tmpPPI.tsv") + ppi <- data.frame(tf = c(paste0("tf", 1:nTranscriptionFactors), "tf1", "tf1"), + gene = c(paste0("tf", 1:nTranscriptionFactors), "tf4", "tf5"), + score = rep(1, nTranscriptionFactors + 2)) + write.table(ppi, ppifname, quote = FALSE, sep = "\t", + row.names = FALSE, col.names = FALSE) + + # Generate the null PANDA edges. + nullPandas <- lapply(1:numberOfPandas, function(i){ + + # Generate a random matrix of expression values, where any correlation that exists + # is spurious. + randomExpression <- matrix(data = stats::rnorm(ngenes * sampSize), nrow = ngenes) + rownames(randomExpression) <- paste0("gene", 1:ngenes) + + # Save in a temporary file. + fname <- paste0("tmp_", i, ".tsv") + write.table(randomExpression, fname, row.names = TRUE, col.names = FALSE, sep = "\t", + quote = FALSE) + + # Run PANDA. + nullPanda <- pandaPy(expr_file = fname, motif_file=motiffname, ppi_file=ppifname, save_tmp=FALSE)$panda + rownames(nullPanda) <- paste(nullPanda$TF, nullPanda$Gene, sep = "__") + + # Remove file. + unlink(fname) + + # Return null results. + tfListToExclude <- c("tf1", "tf2", "tf3", "tf4", "tf5") + geneListToExclude <- c("gene1", "gene2", "gene3", "gene1", "gene1") + rowsToExclude <- paste(tfListToExclude, geneListToExclude, sep = "__") + return(nullPanda[setdiff(rownames(nullPanda), rowsToExclude),"Score"]) + }) + + # Remove files. + unlink(motiffname) + unlink(ppifname) + + # Return the values. + return(unlist(nullPandas)) +} + +#' Find the subnetwork of significant edges connecting the genes. +#' @param geneSet A character vector of genes comprising the targets of interest. +#' @param networks A list of bipartite (PANDA-like) networks, where each network is a data frame with the following format: +#' tf,gene,score +#' @param alpha The significance cutoff for the statistical test. +#' @param hopConstraint The maximum number of hops to be considered between gene pairs. +#' Must be an even number. +#' @param nullDistribution The null distribution, specified as a vector of values. +#' @param verbose Whether or not to print detailed information about the run. +#' @param topX Select the X lowest significant p-values for each gene. NULL by default. +#' @returns A bipartite subnetwork in the same format as the original networks. +BuildSubnetwork <- function(geneSet, networks, alpha, hopConstraint, nullDistribution, + verbose = FALSE, topX = NULL){ + + # Name edges for each network. + networksNamed <- lapply(networks, function(network){ + rownames(network) <- paste(network$tf, network$gene, sep = "__") + return(network) + }) + + # Paste together the networks. + combinedNetwork <- networksNamed[[1]] + for(i in 2:length(networksNamed)){ + combinedNetwork[,2+i] <- networksNamed[[i]]$score + } + + # For each gene, find the significant edges from each hop. + significantSubnetworks <- FindSignificantEdgesForHop(geneSet = geneSet, + combinedNetwork = combinedNetwork, + alpha = alpha, + hopConstraint = hopConstraint / 2, + nullDistribution = nullDistribution, + verbose = verbose, topX = topX) + + # Find matches for each hop. Note that we do not need to consider cases where the + # number of hops are uneven. For instance, any two nodes that can be connected by + # a node 3 hops away from node 1 and 1 hop away from node 2 are also connected by + # a node 2 hops away from both nodes 1 and 2. The same goes for even numbers. + # Even-odd pairs should not be considered. This can be proven. + subnetwork <- FindConnectionsForAllHopCounts(subnetworks = significantSubnetworks, + verbose = verbose) + return(subnetwork) +} + +#' Find the subnetwork of significant edges n / 2 hops away from each gene. +#' @param geneSet A character vector of genes comprising the targets of interest. +#' @param combinedNetwork A concatenation of n PANDA-like networks with the following format: +#' tf,gene,score_net1, score_net2, ... , score_netn +#' @param alpha The significance cutoff for the statistical test. +#' @param hopConstraint The maximum number of hops to be considered for a gene. +#' @param nullDistribution The null distribution, specified as a vector of values. +#' @param verbose Whether or not to print detailed information about the run. +#' @param topX Select the X lowest significant p-values for each gene. NULL by default. +#' @returns A bipartite subnetwork in the same format as the original networks. +FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, alpha, hopConstraint, nullDistribution, + verbose = FALSE, topX = NULL){ + # Build the significant subnetwork for each gene, up to the hop constraint. + uniqueGeneSet <- sort(unique(geneSet)) + geneSubnetworks <- lapply(uniqueGeneSet, function(gene){ + + # Get all significant edges for a 1-hop subnetwork. + if(verbose == TRUE){ + message(paste("Evaluating hop 1 for gene", gene)) + } + subnetwork1Hop <- SignificantBreadthFirstSearch(networks = combinedNetwork, + alpha = alpha, + startingNodes = gene, + nodesToExclude = c(), + startFromTF = FALSE, doFDRAdjustment = FALSE, + nullDistribution, verbose = verbose, + topX = topX) + + # Set the starting and excluded set for the next hop. + startingNodes <- unique(subnetwork1Hop$tf) + topXNew <- NULL + if(!is.null(topX)){ + topXNew <- topX * length(startingNodes) + } + excludedSubset <- gene + + # Add to the list of all subnetworks. + allSubnetworksForGene <- list(subnetwork1Hop) + + # Loop until we reach the maximum number of hops or there are no new edges + # to traverse. + hop <- 2 + while(hop <= hopConstraint && length(startingNodes) > 0){ + + # If we are on an even hop, start from transcription factors. + # If we are on an odd hop, start from genes. + if(hop %% 2 == 0){ + + # Find all significant edges in the next hop. + if(verbose == TRUE){ + message(paste("Evaluating hop", hop, "for gene", gene)) + } + subnetworkHops <- SignificantBreadthFirstSearch(networks = combinedNetwork, + alpha = alpha, + startingNodes = startingNodes, + nodesToExclude = excludedSubset, + startFromTF = TRUE, doFDRAdjustment = FALSE, + nullDistribution = nullDistribution, + verbose = verbose, + topX = topXNew) + + # Set the starting and excluded set for the next hop. + excludedSubset <- c(excludedSubset, startingNodes) + startingNodes <- setdiff(unique(subnetworkHops$gene), excludedSubset) + if(!is.null(topX)){ + topXNew <- topX * length(startingNodes) + } + }else{ + + # Find all significant edges in the next hop. + if(verbose == TRUE){ + message(paste("Evaluating hop", hop, "for gene", gene)) + } + subnetworkHops <- SignificantBreadthFirstSearch(networks = combinedNetwork, + alpha = alpha, + startingNodes = startingNodes, + nodesToExclude = excludedSubset, + startFromTF = FALSE, doFDRAdjustment = FALSE, + nullDistribution = nullDistribution, + verbose = verbose, + topX = topXNew) + + # Set the starting and excluded set for the next hop. + excludedSubset <- c(excludedSubset, startingNodes) + startingNodes <- setdiff(unique(subnetworkHops$tf), excludedSubset) + if(!is.null(topX)){ + topXNew <- topX * length(startingNodes) + } + } + + # Add to the list. + allSubnetworksForGene[[length(allSubnetworksForGene) + 1]] <- subnetworkHops + + # Increment hops. + hop <- hop + 1 + } + return(allSubnetworksForGene) + }) + + # Add the names of the genes. + names(geneSubnetworks) <- uniqueGeneSet + return(geneSubnetworks) +} + +#' Find all significant edges adjacent to the starting nodes, excluding the nodes +#' specified. +#' @param networks A concatenation of n PANDA-like networks with the following format: +#' tf,gene,score_net1, score_net2, ... , score_netn +#' Edges must be specified as "tf__gene". +#' @param alpha The significance cutoff for the statistical test. +#' @param startingNodes The list of nodes from which to start. +#' @param nodesToExclude The list of nodes to exclude from the search. +#' @param startFromTF Whether to start from transcription factors (TRUE) or genes (FALSE). +#' @param doFDRAdjustment Whether or not to adjust the p-values using FDR. +#' @param nullDistribution The null distribution, specified as a vector of values. +#' @param verbose Whether or not to print detailed information about the run. +#' @param topX Select the X lowest significant p-values for each gene. NULL by default. +#' @returns A bipartite subnetwork in the same format as the original networks. +SignificantBreadthFirstSearch <- function(networks, alpha, startingNodes, + nodesToExclude, startFromTF, doFDRAdjustment = FALSE, + nullDistribution, verbose = FALSE, topX = NULL){ + # Check that provided nodes overlap with the networks. + if((length(setdiff(startingNodes, networks$tf)) > 0 && startFromTF == TRUE) || + (length(setdiff(startingNodes, networks$gene)) > 0 && startFromTF == FALSE)){ + stop("ERROR: Starting nodes do not overlap with network nodes") + } + if(length(setdiff(nodesToExclude, c(networks$tf, networks$gene))) > 0){ + stop("ERROR: List of nodes to exclude does not overlap with network nodes") + } + if(length(intersect(startingNodes, nodesToExclude)) > 0){ + stop("ERROR: Starting nodes cannot overlap with nodes to exclude") + } + + # Identify genes and transcription factors to test, based on which of these we are + # starting from. + tfsToTest <- c() + genesToTest <- c() + if(startFromTF == TRUE){ + tfsToTest <- startingNodes + genesToTest <- setdiff(unique(networks$gene), nodesToExclude) + }else{ + genesToTest <- startingNodes + tfsToTest <- setdiff(unique(networks$tf), nodesToExclude) + } + + # Construct all edges to test based on the combination of these. + geneLongList <- rep(genesToTest, length(tfsToTest)) + tfLongList <- unlist(lapply(tfsToTest, function(tf){ + return(rep(tf, length(genesToTest))) + })) + allEdges <- paste(tfLongList, geneLongList, sep = "__") + + # For each edge, measure its significance. + subnetwork <- networks[c(), 3:ncol(networks)] + if(length(allEdges) > 0){ + ourEdgeVals <- networks[allEdges, 3:ncol(networks)] + #nullDistribution <- sample(nullDistribution, size = floor(length(nullDistribution) / length(allEdges)) * length(allEdges)) + nullEdgeVals <- t(matrix(rep(nullDistribution, length(allEdges)), ncol = length(allEdges))) + pValues <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, y = nullEdgeVals, alternative = "greater")$pvalue + + # Adjust the p-values. + if(doFDRAdjustment == TRUE){ + pValues <- stats::p.adjust(pValues, method = "fdr") + } + whichSig <- which(pValues < alpha) + + # If topX is specified, filter again. + if(!is.null(topX) && length(whichSig) > topX){ + whichTopX <- order(pValues[whichSig])[1:topX] + whichSig <- whichSig[whichTopX] + } + + # Return the edges meeting alpha. + significantEdges <- allEdges[whichSig] + subnetwork <- networks[significantEdges, c(1:2)] + if(verbose == TRUE){ + message(paste("Retained", length(significantEdges), "out of", length(allEdges), "edges")) + } + } + + # Return the subnetwork. + return(subnetwork) +} + +#' For all hop counts up to the maximum, find subnetworks connecting each pair of +#' genes by exactly that number of hops. For instance, find each +#' @param subnetworks A list of bipartite (PANDA-like) subnetworks for each gene, +#' containing only the significant edges meeting the hop count criteria and +#' where each network is a data frame with the following format: +#' tf,gene +#' @param verbose Whether or not to print detailed information about the run. +#' @returns A bipartite subnetwork in the same format as the original networks. +FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ + + # Find a subnetwork for each hop count. + hopCountSubnetworks <- lapply(1:length(subnetworks[[1]]), function(hops){ + + # For each pair of genes, find the subnetworks for this number of hops. + geneSpecificHopCountSubnetwork <- lapply(1:(length(names(subnetworks))-1), function(i){ + genePairSpecificHopCountSubnetwork <- lapply((i+1):length(names(subnetworks)), function(j){ + + # Get the subnetworks for the number of hops of interest. + gene1 <- names(subnetworks)[i] + gene2 <- names(subnetworks)[j] + connectingSubnetwork <- data.frame(tf = NA, gene = NA)[0,] + + # If there were no edges at this hop count for one or both genes, + # do not evaluate. + if(length(subnetworks[[gene1]]) >= hops && length(subnetworks[[gene2]]) >= hops){ + subnetwork1 <- subnetworks[[gene1]][[hops]] + subnetwork2 <- subnetworks[[gene2]][[hops]] + + # Initialize overlapping subnetwork. + genesToRecurse1 <- c() + genesToRecurse2 <- c() + tfsToRecurse1 <- c() + tfsToRecurse2 <- c() + + # If the number of hops is even, add edges from genes that overlap + # If the number of hops is odd, add edges from transcription factors that overlap. + if(hops %% 2 == 0){ + overlappingGenes <- intersect(subnetwork1$gene, subnetwork2$gene) + if(verbose == TRUE){ + message(paste("Hop", hops, "-", length(overlappingGenes), "overlapped between", gene1, "and", gene2)) + } + whichSubnet1Gene <- which(subnetwork1$gene %in% overlappingGenes) + whichSubnet2Gene <- which(subnetwork2$gene %in% overlappingGenes) + tfsToRecurse1 <- unique(subnetwork1[whichSubnet1Gene, "tf"]) + tfsToRecurse2 <- unique(subnetwork2[whichSubnet2Gene, "tf"]) + connectingSubnetwork <- rbind(connectingSubnetwork, subnetwork1[whichSubnet1Gene,], + subnetwork2[whichSubnet2Gene,]) + }else{ + overlappingTF <- intersect(subnetwork1$tf, subnetwork2$tf) + if(verbose == TRUE){ + message(paste("Hop", hops, "-", length(overlappingTF), "overlapped between", gene1, "and", gene2)) + } + whichSubnet1TF <- which(subnetwork1$tf %in% overlappingTF) + whichSubnet2TF <- which(subnetwork2$tf %in% overlappingTF) + genesToRecurse1 <- unique(subnetwork1[whichSubnet1TF, "gene"]) + genesToRecurse2 <- unique(subnetwork2[whichSubnet2TF, "gene"]) + connectingSubnetwork <- rbind(connectingSubnetwork, subnetwork1[whichSubnet1TF,], + subnetwork2[whichSubnet2TF,]) + } + + # Recurse back over the number of hops. + if(hops-1 >= 1){ + for(hop in (hops-1):1){ + subnetwork1 <- subnetworks[[gene1]][[hop]] + subnetwork2 <- subnetworks[[gene2]][[hop]] + + # If the current number of hops is even, add edges from TFs connected to genes of interest. + # If the current number of hops is odd, add edges from genes connected to TFs of interest. + if(hop %% 2 == 0){ + whichTFConnectedToGene1 <- which(subnetwork1$gene %in% genesToRecurse1) + whichTFConnectedToGene2 <- which(subnetwork2$gene %in% genesToRecurse2) + tfsToRecurse1 <- unique(subnetwork1[whichTFConnectedToGene1, "tf"]) + tfsToRecurse2 <- unique(subnetwork2[whichTFConnectedToGene2, "tf"]) + connectingSubnetwork <- rbind(connectingSubnetwork, subnetwork1[whichTFConnectedToGene1,], + subnetwork2[whichTFConnectedToGene2,]) + }else{ + whichGeneConnectedToTF1 <- which(subnetwork1$tf %in% tfsToRecurse1) + whichGeneConnectedToTF2 <- which(subnetwork2$tf %in% tfsToRecurse2) + genesToRecurse1 <- unique(subnetwork1[whichGeneConnectedToTF1, "gene"]) + genesToRecurse2 <- unique(subnetwork2[whichGeneConnectedToTF2, "gene"]) + connectingSubnetwork <- rbind(connectingSubnetwork, subnetwork1[whichGeneConnectedToTF1,], + subnetwork2[whichGeneConnectedToTF2,]) + } + } + } + } + + # Return the subnetwork, which should now contain all of the edges connecting the + # gene pair at the prespecified number of hops. + return(connectingSubnetwork) + }) + # Bind together the subnetwork for each gene pair. + return(do.call(rbind, genePairSpecificHopCountSubnetwork)) + }) + + # Bind together the subnetworks for each gene. + return(do.call(rbind, geneSpecificHopCountSubnetwork)) + }) + + # Bind together the subnetworks for each hop count. + compositeSubnetwork <- do.call(rbind, hopCountSubnetworks) + compositeSubnetworkEdges <- paste(compositeSubnetwork$tf, compositeSubnetwork$gene, sep = "__") + uniqueEdges <- sort(unique(compositeSubnetworkEdges)) + compositeSubnetworkDedup <- do.call(rbind, lapply(uniqueEdges, function(edge){ + whichFirstEdge <- which(compositeSubnetworkEdges == edge)[1] + return(compositeSubnetwork[whichFirstEdge,]) + })) + rownames(compositeSubnetworkDedup) <- uniqueEdges + return(compositeSubnetworkDedup) +} + +#' Plot the networks, using different colors for transcription factors, genes of interest, +#' and additional genes. +#' @param network A data frame with the following format: +#' tf,gene +#' @param genesOfInterest Which genes of interest to highlight +#' @param geneOfInterestColor Color for the genes of interest +#' @param tfColor Color for the transcription factors +#' @param otherGenesColor Color for the other genes +#' @param nodeSize Size of node +#' @param edgeWidth Width of edges +#' @param vertexLabels Which vertex labels to include. By default, none are included. +#' @param layoutBipartite Whether or not to layout as a bipartite graph. +#' @param vertexLabelSize The size of label to use for the vertex, as a fraction of the default. +#' @param vertexLabelOffset Number of pixels in the offset when plotting labels. +#' Default is TRUE. +#' @returns A bipartite plot of the network +PlotNetwork <- function(network, genesOfInterest, geneOfInterestColor = "red", + tfColor = "blue", otherGenesColor = "gray", nodeSize = 1, + edgeWidth = 0.5, vertexLabels = NA, vertexLabelSize = 0.7, + vertexLabelOffset = 0.5, layoutBipartite = TRUE){ + + # Set the node attributes. + uniqueNodes <- unique(c(network$tf, network$gene)) + nodeAttrs <- data.frame(node = uniqueNodes, + color = rep(otherGenesColor, length(uniqueNodes)), + size = rep(nodeSize, length(uniqueNodes)), + frame.width = rep(0, length(uniqueNodes)), + label.color = "black", label.cex = vertexLabelSize, + label.dist = vertexLabelOffset) + rownames(nodeAttrs) <- uniqueNodes + nodeAttrs[which(uniqueNodes %in% network$tf), "color"] <- tfColor + nodeAttrs[which(uniqueNodes %in% genesOfInterest), "color"] <- geneOfInterestColor + + # Create a graph object. + graph <- igraph::graph_from_data_frame(network, vertices = nodeAttrs, directed = FALSE) + V(graph)$type <- V(graph)$name %in% network$tf + + # Plot. + labels <- V(graph)$name + whichEmpty <- which(labels %in% setdiff(labels, vertexLabels)) + labels[whichEmpty] <- rep(NA, length(whichEmpty)) + + if(layoutBipartite == TRUE){ + LO <- layout_as_bipartite(graph) + LO <- LO[,c(2,1)] + igraph::plot.igraph(graph, layout = LO, vertex.label = labels, edge.width = edgeWidth) + }else{ + igraph::plot.igraph(graph, vertex.label = labels, edge.width = edgeWidth) + } + } diff --git a/man/BuildSubnetwork.Rd b/man/BuildSubnetwork.Rd new file mode 100644 index 00000000..ff686596 --- /dev/null +++ b/man/BuildSubnetwork.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/BLOBFISH.R +\name{BuildSubnetwork} +\alias{BuildSubnetwork} +\title{Find the subnetwork of significant edges connecting the genes.} +\usage{ +BuildSubnetwork( + geneSet, + networks, + alpha, + hopConstraint, + nullDistribution, + verbose = FALSE, + topX = NULL +) +} +\arguments{ +\item{geneSet}{A character vector of genes comprising the targets of interest.} + +\item{networks}{A list of bipartite (PANDA-like) networks, where each network is a data frame with the following format: +tf,gene,score} + +\item{alpha}{The significance cutoff for the statistical test.} + +\item{hopConstraint}{The maximum number of hops to be considered between gene pairs. +Must be an even number.} + +\item{nullDistribution}{The null distribution, specified as a vector of values.} + +\item{verbose}{Whether or not to print detailed information about the run.} + +\item{topX}{Select the X lowest significant p-values for each gene. NULL by default.} +} +\value{ +A bipartite subnetwork in the same format as the original networks. +} +\description{ +Find the subnetwork of significant edges connecting the genes. +} diff --git a/man/FindSignificantEdgesForHop.Rd b/man/FindSignificantEdgesForHop.Rd new file mode 100644 index 00000000..f737b1e9 --- /dev/null +++ b/man/FindSignificantEdgesForHop.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/BLOBFISH.R +\name{FindSignificantEdgesForHop} +\alias{FindSignificantEdgesForHop} +\title{Find the subnetwork of significant edges n / 2 hops away from each gene.} +\usage{ +FindSignificantEdgesForHop( + geneSet, + combinedNetwork, + alpha, + hopConstraint, + nullDistribution, + verbose = FALSE, + topX = NULL +) +} +\arguments{ +\item{geneSet}{A character vector of genes comprising the targets of interest.} + +\item{combinedNetwork}{A concatenation of n PANDA-like networks with the following format: +tf,gene,score_net1, score_net2, ... , score_netn} + +\item{alpha}{The significance cutoff for the statistical test.} + +\item{hopConstraint}{The maximum number of hops to be considered for a gene.} + +\item{nullDistribution}{The null distribution, specified as a vector of values.} + +\item{verbose}{Whether or not to print detailed information about the run.} + +\item{topX}{Select the X lowest significant p-values for each gene. NULL by default.} +} +\value{ +A bipartite subnetwork in the same format as the original networks. +} +\description{ +Find the subnetwork of significant edges n / 2 hops away from each gene. +} diff --git a/man/GenerateNullPANDADistribution.Rd b/man/GenerateNullPANDADistribution.Rd new file mode 100644 index 00000000..707fef48 --- /dev/null +++ b/man/GenerateNullPANDADistribution.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/BLOBFISH.R +\name{GenerateNullPANDADistribution} +\alias{GenerateNullPANDADistribution} +\title{Generate a null distribution of edge scores for PANDA-like networks; that is, +the set of edges where (1) the TF does not have a binding motif in the gene region, +(2) the TF does not form a complex with any other TF that has a binding motif in +the gene region, and (3) the genes regulated by the TF are not coexpressed with the +gene in question. We obtain this by inputting an empty prior and an identity coexpression +matrix.} +\usage{ +GenerateNullPANDADistribution( + ngenes = 20000, + nTranscriptionFactors = 600, + sampSize = 20, + numberOfPandas = 10 +) +} +\arguments{ +\item{ngenes}{Number of genes to simulate} + +\item{nTranscriptionFactors}{Number of transcription factors to simulate} + +\item{sampSize}{Number of samples to simulate} + +\item{numberOfPandas}{Number of null PANDA networks to generate} +} +\description{ +Generate a null distribution of edge scores for PANDA-like networks; that is, +the set of edges where (1) the TF does not have a binding motif in the gene region, +(2) the TF does not form a complex with any other TF that has a binding motif in +the gene region, and (3) the genes regulated by the TF are not coexpressed with the +gene in question. We obtain this by inputting an empty prior and an identity coexpression +matrix. +} diff --git a/man/PlotNetwork.Rd b/man/PlotNetwork.Rd new file mode 100644 index 00000000..95a66421 --- /dev/null +++ b/man/PlotNetwork.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/BLOBFISH.R +\name{PlotNetwork} +\alias{PlotNetwork} +\title{Plot the networks, using different colors for transcription factors, genes of interest, +and additional genes.} +\usage{ +PlotNetwork( + network, + genesOfInterest, + geneOfInterestColor = "red", + tfColor = "blue", + otherGenesColor = "gray", + nodeSize = 1, + edgeWidth = 0.5, + vertexLabels = NA, + vertexLabelSize = 0.7, + vertexLabelOffset = 0.5, + layoutBipartite = TRUE +) +} +\arguments{ +\item{network}{A data frame with the following format: +tf,gene} + +\item{genesOfInterest}{Which genes of interest to highlight} + +\item{geneOfInterestColor}{Color for the genes of interest} + +\item{tfColor}{Color for the transcription factors} + +\item{otherGenesColor}{Color for the other genes} + +\item{nodeSize}{Size of node} + +\item{edgeWidth}{Width of edges} + +\item{vertexLabels}{Which vertex labels to include. By default, none are included.} + +\item{vertexLabelSize}{The size of label to use for the vertex, as a fraction of the default.} + +\item{vertexLabelOffset}{Number of pixels in the offset when plotting labels. +Default is TRUE.} + +\item{layoutBipartite}{Whether or not to layout as a bipartite graph.} +} +\value{ +A bipartite plot of the network +} +\description{ +Plot the networks, using different colors for transcription factors, genes of interest, +and additional genes. +} diff --git a/man/RunBLOBFISH.Rd b/man/RunBLOBFISH.Rd new file mode 100644 index 00000000..b3929034 --- /dev/null +++ b/man/RunBLOBFISH.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/BLOBFISH.R +\name{RunBLOBFISH} +\alias{RunBLOBFISH} +\title{Given a set of genes of interest, full bipartite networks with scores (one network for each sample), a significance +cutoff for statistical testing, and a hop constraint, BLOBFISH finds a subnetwork of +significant edges connecting the genes.} +\usage{ +RunBLOBFISH( + geneSet, + networks, + alpha, + hopConstraint, + nullDistribution, + verbose = FALSE, + topX = NULL +) +} +\arguments{ +\item{geneSet}{A character vector of genes comprising the targets of interest.} + +\item{networks}{A list of bipartite (PANDA-like) networks, where each network is a data frame with the following format: +tf,gene,score} + +\item{alpha}{The significance cutoff for the statistical test.} + +\item{hopConstraint}{The maximum number of hops to be considered between gene pairs. +Must be an even number.} + +\item{nullDistribution}{The null distribution, specified as a vector of values.} + +\item{verbose}{Whether or not to print detailed information about the run.} + +\item{topX}{Select the X lowest significant p-values for each gene. NULL by default.} +} +\value{ +A bipartite subnetwork in the same format as the original networks. +} +\description{ +Given a set of genes of interest, full bipartite networks with scores (one network for each sample), a significance +cutoff for statistical testing, and a hop constraint, BLOBFISH finds a subnetwork of +significant edges connecting the genes. +} diff --git a/man/SignificantBreadthFirstSearch.Rd b/man/SignificantBreadthFirstSearch.Rd new file mode 100644 index 00000000..c2d4a0c2 --- /dev/null +++ b/man/SignificantBreadthFirstSearch.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/BLOBFISH.R +\name{SignificantBreadthFirstSearch} +\alias{SignificantBreadthFirstSearch} +\title{Find all significant edges adjacent to the starting nodes, excluding the nodes +specified.} +\usage{ +SignificantBreadthFirstSearch( + networks, + alpha, + startingNodes, + nodesToExclude, + startFromTF, + doFDRAdjustment = FALSE, + nullDistribution, + verbose = FALSE, + topX = NULL +) +} +\arguments{ +\item{networks}{A concatenation of n PANDA-like networks with the following format: +tf,gene,score_net1, score_net2, ... , score_netn +Edges must be specified as "tf__gene".} + +\item{alpha}{The significance cutoff for the statistical test.} + +\item{startingNodes}{The list of nodes from which to start.} + +\item{nodesToExclude}{The list of nodes to exclude from the search.} + +\item{startFromTF}{Whether to start from transcription factors (TRUE) or genes (FALSE).} + +\item{doFDRAdjustment}{Whether or not to adjust the p-values using FDR.} + +\item{nullDistribution}{The null distribution, specified as a vector of values.} + +\item{verbose}{Whether or not to print detailed information about the run.} + +\item{topX}{Select the X lowest significant p-values for each gene. NULL by default.} +} +\value{ +A bipartite subnetwork in the same format as the original networks. +} +\description{ +Find all significant edges adjacent to the starting nodes, excluding the nodes +specified. +} diff --git a/tests/testthat/test-blobfish.R b/tests/testthat/test-blobfish.R new file mode 100644 index 00000000..7fa0c28d --- /dev/null +++ b/tests/testthat/test-blobfish.R @@ -0,0 +1,318 @@ +# unit-tests for BLOBFISH +context("test BLOBFISH functions") + +test_that("[BLOBFISH] SignificantBreadthFirstSearch() function yields expected results", { + + # Construct a starting network, which will be modified. + # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D + # to be connected after 1 hop via TF3, and genes A and C to be connected after 2 + # hops via TF4. + startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)), + gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)), + score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3)) + addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 1000) + addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + fullNetwork1 <- rbind(startingNetwork, addedNoise1) + rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__") + + # Create a set of networks close to the original. + fullNetworks <- fullNetwork1 + fullNetworks[,4] <- fullNetwork1$score - 0.00005 + fullNetworks[,5] <- fullNetwork1$score + 0.00005 + + # Use all combined scores as the null distribution. + null <- stats::rnorm(n = nrow(fullNetworks) * 3) / 100 + + # Test that errors are thrown when appropriate. + expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, + alpha = 0.05, startingNodes = c("blob", "fish"), + nodesToExclude = c(), startFromTF = TRUE, + nullDistribution = null), + "ERROR: Starting nodes do not overlap with network nodes") + expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, + alpha = 0.05, startingNodes = c("geneA", "geneB"), + nodesToExclude = c(), startFromTF = TRUE, + nullDistribution = null), + "ERROR: Starting nodes do not overlap with network nodes") + expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, + alpha = 0.05, startingNodes = c("tf1", "tf2"), + nodesToExclude = c(), startFromTF = FALSE, + nullDistribution = null), + "ERROR: Starting nodes do not overlap with network nodes") + expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, + alpha = 0.05, startingNodes = c("geneA", "geneB", "geneC"), + nodesToExclude = c("blob", "fish"), startFromTF = FALSE, + nullDistribution = null), + "ERROR: List of nodes to exclude does not overlap with network nodes") + expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, + alpha = 0.05, startingNodes = c("geneA", "geneB", "geneC"), + nodesToExclude = c("geneA", "geneB"), startFromTF = FALSE, + nullDistribution = null), + "ERROR: Starting nodes cannot overlap with nodes to exclude") + + # Ensure that, when starting from genes A, B, and C, we obtain the correct values. + expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf4__geneC"), + rownames(SignificantBreadthFirstSearch(networks = fullNetworks, + alpha = 0.05, startingNodes = c("geneA", "geneB", "geneC"), + nodesToExclude = c(), startFromTF = FALSE, doFDRAdjustment = FALSE, + nullDistribution = null)))) == 0) + + # Ensure that, when starting from transcription factors TF2, TF3, and TF4 and removing genes A, B, and C, we obtain the correct values. + expect_true(length(setdiff(c("tf3__geneD", "tf4__geneD"), + rownames(SignificantBreadthFirstSearch(networks = fullNetworks, + alpha = 0.4, startingNodes = c("tf2", "tf3", "tf4"), + nodesToExclude = c("geneA", "geneB", "geneC"), startFromTF = TRUE, doFDRAdjustment = TRUE, + nullDistribution = null)))) == 0) + + # Ensure that we obtain nothing when we include nothing significant. + expect_true(length(intersect(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf4__geneC"), rownames(SignificantBreadthFirstSearch(networks = fullNetworks, + alpha = 0.05, startingNodes = c("1", "2", "3"), + nodesToExclude = c("geneA", "geneB", "geneC"), startFromTF = TRUE, doFDRAdjustment = FALSE, + nullDistribution = null)))) == 0) + + # Ensure that we obtain nothing when everything is excluded. + expect_equal(length(rownames(SignificantBreadthFirstSearch(networks = fullNetworks, + alpha = 0.05, startingNodes = c("tf1", "tf2", "tf3"), + nodesToExclude = unique(fullNetworks$gene), + startFromTF = TRUE, doFDRAdjustment = FALSE, + nullDistribution = null))), 0) +}) +test_that("[BLOBFISH] FindConnectionsForAllHopCounts() function yields expected results", { + + # Set up the subnetwork from the previous example. + geneANetHop1 <- data.frame(tf = c("tf2", "tf3"), gene = c("geneA", "geneA")) + geneBNetHop1 <- data.frame(tf = "tf2", gene = "geneB") + geneCNetHop1 <- data.frame(tf = "tf4", gene = "geneC") + geneDNetHop1 <- data.frame(tf = c("tf3", "tf4"), gene = c("geneD", "geneD")) + geneANetHop2 <- rbind(geneBNetHop1, geneDNetHop1[1,]) + geneBNetHop2 <- geneANetHop1[1,] + geneCNetHop2 <- geneDNetHop1[2,] + geneDNetHop2 <- rbind(geneANetHop1[2,], geneCNetHop1) + geneANetHop3 <- geneDNetHop1[2,] + geneBNetHop3 <- geneANetHop1[2,] + geneCNetHop3 <- geneDNetHop1[1,] + geneDNetHop3 <- geneANetHop1[1,] + subnetworks <- list(geneA = list(geneANetHop1, geneANetHop2, geneANetHop3), + geneB = list(geneBNetHop1, geneBNetHop2, geneBNetHop3), + geneC = list(geneCNetHop1, geneCNetHop2, geneCNetHop3), + geneD = list(geneDNetHop1, geneDNetHop2, geneDNetHop3)) + + # Obtain the overlaps for 1, 2, and 3 hops. + expect_equal(rownames(FindConnectionsForAllHopCounts(subnetworks)), + c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD")) + + # Obtain the overlaps for only the first two hops, for only A and C. + subnetworks <- list(geneA = list(geneANetHop1, geneANetHop2), + geneC = list(geneCNetHop1, geneCNetHop2)) + expect_equal(rownames(FindConnectionsForAllHopCounts(subnetworks)), + c("tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD")) +}) +test_that("[BLOBFISH] FindSignificantEdgesForHop() function yields expected results",{ + + # Construct a starting network, which will be modified. + # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D + # to be connected after 1 hop via TF3, and genes A and C to be connected after 2 + # hops via TF4. + startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)), + gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)), + score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3)) + addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + fullNetwork1 <- rbind(startingNetwork, addedNoise1) + rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__") + + # Create a set of networks close to the original. + fullNetworks <- fullNetwork1 + fullNetworks[,4] <- fullNetwork1$score - 0.00005 + fullNetworks[,5] <- fullNetwork1$score + 0.00005 + + # Null distribution is a narrower version of the normal distribution. + null <- rnorm(n = nrow(fullNetworks) * 3) / 100 + + # Set up the subnetwork from the previous example. + geneANetHop1 <- data.frame(tf = c("tf2", "tf3"), gene = c("geneA", "geneA")) + rownames(geneANetHop1) <- paste(geneANetHop1$tf, geneANetHop1$gene, sep = "__") + geneBNetHop1 <- data.frame(tf = "tf2", gene = "geneB") + rownames(geneBNetHop1) <- paste(geneBNetHop1$tf, geneBNetHop1$gene, sep = "__") + geneCNetHop1 <- data.frame(tf = "tf4", gene = "geneC") + rownames(geneCNetHop1) <- paste(geneCNetHop1$tf, geneCNetHop1$gene, sep = "__") + geneDNetHop1 <- data.frame(tf = c("tf3", "tf4"), gene = c("geneD", "geneD")) + rownames(geneDNetHop1) <- paste(geneDNetHop1$tf, geneDNetHop1$gene, sep = "__") + geneANetHop2 <- rbind(geneBNetHop1, geneDNetHop1[1,]) + rownames(geneANetHop2) <- paste(geneANetHop2$tf, geneANetHop2$gene, sep = "__") + geneBNetHop2 <- geneANetHop1[1,] + rownames(geneBNetHop2) <- paste(geneBNetHop2$tf, geneBNetHop2$gene, sep = "__") + geneCNetHop2 <- geneDNetHop1[2,] + rownames(geneCNetHop2) <- paste(geneCNetHop2$tf, geneCNetHop2$gene, sep = "__") + geneDNetHop2 <- rbind(geneANetHop1[2,], geneCNetHop1) + rownames(geneDNetHop2) <- paste(geneDNetHop2$tf, geneDNetHop2$gene, sep = "__") + geneANetHop3 <- geneDNetHop1[2,] + rownames(geneANetHop3) <- paste(geneANetHop3$tf, geneANetHop3$gene, sep = "__") + geneBNetHop3 <- geneANetHop1[2,] + rownames(geneBNetHop3) <- paste(geneBNetHop3$tf, geneBNetHop3$gene, sep = "__") + geneCNetHop3 <- geneDNetHop1[1,] + rownames(geneCNetHop3) <- paste(geneCNetHop3$tf, geneCNetHop3$gene, sep = "__") + geneDNetHop3 <- geneANetHop1[1,] + rownames(geneDNetHop3) <- paste(geneDNetHop3$tf, geneDNetHop3$gene, sep = "__") + subnetworksFull <- list(geneA = list(geneANetHop1, geneANetHop2, geneANetHop3), + geneB = list(geneBNetHop1, geneBNetHop2, geneBNetHop3), + geneC = list(geneCNetHop1, geneCNetHop2, geneCNetHop3), + geneD = list(geneDNetHop1, geneDNetHop2, geneDNetHop3)) + + # Test method with all genes as input. + sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC", "geneD"), + combinedNetwork = fullNetworks, + alpha = 0.05, hopConstraint = 3, nullDistribution = null) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[1]]), rownames(sigEdges$geneB[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[2]]), rownames(sigEdges$geneB[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[3]]), rownames(sigEdges$geneB[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneD[[1]]), rownames(sigEdges$geneD[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneD[[2]]), rownames(sigEdges$geneD[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneD[[3]]), rownames(sigEdges$geneD[[3]]))) == 0) + + # Test method with only genes A, B, C. + sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC"), + combinedNetwork = fullNetworks, + alpha = 0.05, hopConstraint = 3, nullDistribution = null) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[1]]), rownames(sigEdges$geneB[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[2]]), rownames(sigEdges$geneB[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[3]]), rownames(sigEdges$geneB[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0) + + # Test method with only genes A and C. + sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneC"), + combinedNetwork = fullNetworks, + alpha = 0.4, hopConstraint = 3, nullDistribution = null) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0) +}) +test_that("[BLOBFISH] BuildSubnetwork() function yields expected results",{ + + # Construct a starting network, which will be modified. + # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D + # to be connected after 1 hop via TF3, and genes A and C to be connected after 2 + # hops via TF4. + startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)), + gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)), + score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3)) + addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 1000) + fullNetwork1 <- rbind(startingNetwork, addedNoise1) + rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__") + + # Create a set of networks close to the original. + fullNetwork2 <- fullNetwork1 + fullNetwork3 <- fullNetwork1 + fullNetwork2$score <- c(fullNetwork1$score - 0.00005) + fullNetwork3$score <- c(fullNetwork1$score + 0.00005) + + # Paste together the networks. + combinedNetwork <- fullNetwork1 + combinedNetwork[,4] <- fullNetwork2$score + combinedNetwork[,5] <- fullNetwork3$score + + # Null distribution is a narrower version of the normal distribution. + null <- rnorm(n = nrow(combinedNetwork) * 3) / 100 + + # Verify that FindConnectionsForAllHopCounts() and FindSignificantEdgesForHop() are + # working in tandem. + # Obtain the overlaps for 1, 2, and 3 hops. + expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"), + rownames(BuildSubnetwork(geneSet = c("geneA", "geneB", "geneC", "geneD"), + networks = list(fullNetwork1, fullNetwork2, fullNetwork3), + alpha = 0.05, hopConstraint = 4, nullDistribution = null)))) == 0) + + # Obtain the overlaps for only the first two hops, for only A and C. + expect_true(length(setdiff(c("tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"), + rownames(BuildSubnetwork(geneSet = c("geneA", "geneC"), + networks = list(fullNetwork1, fullNetwork2, fullNetwork3), + alpha = 0.05, hopConstraint = 4, nullDistribution = null)))) == 0) +}) +test_that("[BLOBFISH] RunBLOBFISH() function yields expected results",{ + + # Check errors. + expect_error(RunBLOBFISH(geneSet = 4, networks = list(), alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), + paste("Wrong input type! geneSet must be a character vector. networks must be a list.", + "alpha and hopConstraint must be scalar numeric values.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = 67, alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), + paste("Wrong input type! geneSet must be a character vector. networks must be a list.", + "alpha and hopConstraint must be scalar numeric values.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(), alpha = "hi", hopConstraint = 5, nullDistribution = c(0,0,0)), + paste("Wrong input type! geneSet must be a character vector. networks must be a list.", + "alpha and hopConstraint must be scalar numeric values.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(), alpha = 0.5, hopConstraint = "hi", nullDistribution = c(0,0,0)), + paste("Wrong input type! geneSet must be a character vector. networks must be a list.", + "alpha and hopConstraint must be scalar numeric values.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(1,2,3), alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), + "Each network must be a data frame.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA, blah = NA)), + alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), + paste("Each network must have transcription factors in the first column,", + "target genes in the second column, and scores in the third column.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = -1, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 0, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 1.2, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 1, hopConstraint = -4, nullDistribution = c(0,0,0)), "hopConstraint must be an even number of at least 2.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 1, hopConstraint = 7, nullDistribution = c(0,0,0)), "hopConstraint must be an even number of at least 2.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 1, hopConstraint = 4, nullDistribution = "hi"), "nullDistribution must be numeric.") +}) \ No newline at end of file From d8988cfd3f40bbf06009668750c0c00935ab179d Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Fri, 17 May 2024 17:01:57 -0400 Subject: [PATCH 02/17] Removed singleton genes in BLOBFISH output --- R/BLOBFISH.R | 17 ++++++++++++++++- man/FindConnectionsForAllHopCounts.Rd | 24 ++++++++++++++++++++++++ tests/testthat/test-blobfish.R | 2 +- 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 man/FindConnectionsForAllHopCounts.Rd diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index ce423a17..58bc03c4 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -437,7 +437,8 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ return(connectingSubnetwork) }) # Bind together the subnetwork for each gene pair. - return(do.call(rbind, genePairSpecificHopCountSubnetwork)) + connectingSubnetworkAll <- do.call(rbind, genePairSpecificHopCountSubnetwork) + return(connectingSubnetworkAll) }) # Bind together the subnetworks for each gene. @@ -453,6 +454,20 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ return(compositeSubnetwork[whichFirstEdge,]) })) rownames(compositeSubnetworkDedup) <- uniqueEdges + + # Remove all genes connected to a single transcription factor. These genes were + # added because they are regulated by a transcription factor that co-regulates + # two seed genes. Similarly, remove all transcription factors connected to a + # single gene. + geneCounts <- table(compositeSubnetworkDedup$gene) + tfCounts <- table(compositeSubnetworkDedup$tf) + genesToRemove <- names(geneCounts)[which(geneCounts == 1)] + genesToRemove <- setdiff(genesToRemove, names(subnetworks)) + tfsToRemove <- names(tfCounts)[which(tfCounts == 1)] + compositeSubnetworkDedup <- compositeSubnetworkDedup[which(compositeSubnetworkDedup$gene %in% setdiff(names(geneCounts), + genesToRemove)),] + compositeSubnetworkDedup <- compositeSubnetworkDedup[which(compositeSubnetworkDedup$tf %in% setdiff(names(tfCounts), + tfsToRemove)),] return(compositeSubnetworkDedup) } diff --git a/man/FindConnectionsForAllHopCounts.Rd b/man/FindConnectionsForAllHopCounts.Rd new file mode 100644 index 00000000..c8c5977b --- /dev/null +++ b/man/FindConnectionsForAllHopCounts.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/BLOBFISH.R +\name{FindConnectionsForAllHopCounts} +\alias{FindConnectionsForAllHopCounts} +\title{For all hop counts up to the maximum, find subnetworks connecting each pair of +genes by exactly that number of hops. For instance, find each} +\usage{ +FindConnectionsForAllHopCounts(subnetworks, verbose = FALSE) +} +\arguments{ +\item{subnetworks}{A list of bipartite (PANDA-like) subnetworks for each gene, +containing only the significant edges meeting the hop count criteria and +where each network is a data frame with the following format: +tf,gene} + +\item{verbose}{Whether or not to print detailed information about the run.} +} +\value{ +A bipartite subnetwork in the same format as the original networks. +} +\description{ +For all hop counts up to the maximum, find subnetworks connecting each pair of +genes by exactly that number of hops. For instance, find each +} diff --git a/tests/testthat/test-blobfish.R b/tests/testthat/test-blobfish.R index 7fa0c28d..27438f31 100644 --- a/tests/testthat/test-blobfish.R +++ b/tests/testthat/test-blobfish.R @@ -73,7 +73,7 @@ test_that("[BLOBFISH] SignificantBreadthFirstSearch() function yields expected r # Ensure that, when starting from transcription factors TF2, TF3, and TF4 and removing genes A, B, and C, we obtain the correct values. expect_true(length(setdiff(c("tf3__geneD", "tf4__geneD"), rownames(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.4, startingNodes = c("tf2", "tf3", "tf4"), + alpha = 0.05, startingNodes = c("tf2", "tf3", "tf4"), nodesToExclude = c("geneA", "geneB", "geneC"), startFromTF = TRUE, doFDRAdjustment = TRUE, nullDistribution = null)))) == 0) From 1e39d86955a209ff134d0c8f2ca4f102a1fc6ab9 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Tue, 21 May 2024 13:58:54 -0400 Subject: [PATCH 03/17] Added export statement to method for plotting networks --- NAMESPACE | 1 + R/BLOBFISH.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 37882969..fe1ec11e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(GenerateNullPANDADistribution) +export(PlotNetwork) export(RunBLOBFISH) export(adj2el) export(adj2regulon) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index 58bc03c4..5f7fe5c0 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -487,11 +487,11 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ #' @param vertexLabelOffset Number of pixels in the offset when plotting labels. #' Default is TRUE. #' @returns A bipartite plot of the network +#' @export PlotNetwork <- function(network, genesOfInterest, geneOfInterestColor = "red", tfColor = "blue", otherGenesColor = "gray", nodeSize = 1, edgeWidth = 0.5, vertexLabels = NA, vertexLabelSize = 0.7, vertexLabelOffset = 0.5, layoutBipartite = TRUE){ - # Set the node attributes. uniqueNodes <- unique(c(network$tf, network$gene)) nodeAttrs <- data.frame(node = uniqueNodes, From 01211bd5d847fb68c0006f4ff3552599f20feed2 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Thu, 23 May 2024 10:19:35 -0400 Subject: [PATCH 04/17] Added the ability to highlight the 2-hop subnetwork --- R/BLOBFISH.R | 16 +++++++++++++--- man/PlotNetwork.Rd | 11 ++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index 5f7fe5c0..9c6a3d70 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -481,6 +481,9 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ #' @param otherGenesColor Color for the other genes #' @param nodeSize Size of node #' @param edgeWidth Width of edges +#' @param edgeColor Color of edges +#' @param edgeWidthHop1 Width of edges 1 hop away from genes of interest. +#' @param edgeColorHop1 Color of edges 1 hop away from genes of interest. #' @param vertexLabels Which vertex labels to include. By default, none are included. #' @param layoutBipartite Whether or not to layout as a bipartite graph. #' @param vertexLabelSize The size of label to use for the vertex, as a fraction of the default. @@ -491,7 +494,8 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ PlotNetwork <- function(network, genesOfInterest, geneOfInterestColor = "red", tfColor = "blue", otherGenesColor = "gray", nodeSize = 1, edgeWidth = 0.5, vertexLabels = NA, vertexLabelSize = 0.7, - vertexLabelOffset = 0.5, layoutBipartite = TRUE){ + vertexLabelOffset = 0.5, layoutBipartite = TRUE, edgeColor = "gray", + edgeWidthHop1 = 0.5, edgeColorHop1 = "gray"){ # Set the node attributes. uniqueNodes <- unique(c(network$tf, network$gene)) nodeAttrs <- data.frame(node = uniqueNodes, @@ -504,6 +508,12 @@ PlotNetwork <- function(network, genesOfInterest, geneOfInterestColor = "red", nodeAttrs[which(uniqueNodes %in% network$tf), "color"] <- tfColor nodeAttrs[which(uniqueNodes %in% genesOfInterest), "color"] <- geneOfInterestColor + # Add edge attributes. + network$color <- edgeColor + network$color[which(network$gene %in% genesOfInterest)] <- edgeColorHop1 + network$width <- edgeWidth + network$width[which(network$gene %in% genesOfInterest)] <- edgeWidthHop1 + # Create a graph object. graph <- igraph::graph_from_data_frame(network, vertices = nodeAttrs, directed = FALSE) V(graph)$type <- V(graph)$name %in% network$tf @@ -516,8 +526,8 @@ PlotNetwork <- function(network, genesOfInterest, geneOfInterestColor = "red", if(layoutBipartite == TRUE){ LO <- layout_as_bipartite(graph) LO <- LO[,c(2,1)] - igraph::plot.igraph(graph, layout = LO, vertex.label = labels, edge.width = edgeWidth) + igraph::plot.igraph(graph, layout = LO, vertex.label = labels) }else{ - igraph::plot.igraph(graph, vertex.label = labels, edge.width = edgeWidth) + igraph::plot.igraph(graph, vertex.label = labels) } } diff --git a/man/PlotNetwork.Rd b/man/PlotNetwork.Rd index 95a66421..d36319c1 100644 --- a/man/PlotNetwork.Rd +++ b/man/PlotNetwork.Rd @@ -16,7 +16,10 @@ PlotNetwork( vertexLabels = NA, vertexLabelSize = 0.7, vertexLabelOffset = 0.5, - layoutBipartite = TRUE + layoutBipartite = TRUE, + edgeColor = "gray", + edgeWidthHop1 = 0.5, + edgeColorHop1 = "gray" ) } \arguments{ @@ -43,6 +46,12 @@ tf,gene} Default is TRUE.} \item{layoutBipartite}{Whether or not to layout as a bipartite graph.} + +\item{edgeColor}{Color of edges} + +\item{edgeWidthHop1}{Width of edges 1 hop away from genes of interest.} + +\item{edgeColorHop1}{Color of edges 1 hop away from genes of interest.} } \value{ A bipartite plot of the network From dd362c263d5a62814fe31ca2520e21253332f9a4 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Tue, 28 May 2024 13:58:52 -0400 Subject: [PATCH 05/17] Modified null distribution to include entire prior --- R/BLOBFISH.R | 84 +++++++++++++++++----------- man/GenerateNullPANDADistribution.Rd | 15 +++-- 2 files changed, 62 insertions(+), 37 deletions(-) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index 9c6a3d70..7ded2a48 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -51,44 +51,34 @@ RunBLOBFISH <- function(geneSet, networks, alpha, hopConstraint, nullDistributio #' the gene region, and (3) the genes regulated by the TF are not coexpressed with the #' gene in question. We obtain this by inputting an empty prior and an identity coexpression #' matrix. -#' @param ngenes Number of genes to simulate -#' @param nTranscriptionFactors Number of transcription factors to simulate +#' @param ppiFile The location of the protein-protein interaction network between transcription factors. +#' This should be a TSV file where the first two columns are the transcription +#' factors and the third is whether there is a PPI between them. +#' @param motif The location of the motif prior from genes to transcription factors. This should +#' be a TSV file where the first column is the transcription factors, the +#' second is the genes, and the third is whether the transcription factor's +#' binding motif is in the gene promoter region. #' @param sampSize Number of samples to simulate #' @param numberOfPandas Number of null PANDA networks to generate #' @export -GenerateNullPANDADistribution <- function(ngenes = 20000, nTranscriptionFactors = 600, sampSize = 20, +GenerateNullPANDADistribution <- function(ppiFile, motifFile, sampSize = 20, numberOfPandas = 10){ # Set the seed. set.seed(1) - # Generate a motif file where only TFs 1-3 are connected to genes. - motiffname <- paste0("tmpMotif.tsv") - motifs <- data.frame(tf = rep(paste0("tf", 1:nTranscriptionFactors), ngenes), - gene = unlist(lapply(1:ngenes, function(j){ - return(rep(paste0("gene", j), nTranscriptionFactors)) - })), score = rep(0, ngenes * nTranscriptionFactors)) - motifs[intersect(which(motifs$tf == "tf1"), which(motifs$gene == "gene1")), "score"] <- 1 - motifs[intersect(which(motifs$tf == "tf2"), which(motifs$gene == "gene2")), "score"] <- 1 - motifs[intersect(which(motifs$tf == "tf3"), which(motifs$gene == "gene3")), "score"] <- 1 - write.table(motifs, motiffname, quote = FALSE, sep = "\t", - row.names = FALSE, col.names = FALSE) - - # Generate a PPI where TFs 1 is connected to TFs 4-5. - ppifname <- paste0("tmpPPI.tsv") - ppi <- data.frame(tf = c(paste0("tf", 1:nTranscriptionFactors), "tf1", "tf1"), - gene = c(paste0("tf", 1:nTranscriptionFactors), "tf4", "tf5"), - score = rep(1, nTranscriptionFactors + 2)) - write.table(ppi, ppifname, quote = FALSE, sep = "\t", - row.names = FALSE, col.names = FALSE) + # Read the motif. + motif <- read.table(motifFile, sep = "\t") + ppi <- read.table(ppiFile, sep = "\t") # Generate the null PANDA edges. nullPandas <- lapply(1:numberOfPandas, function(i){ # Generate a random matrix of expression values, where any correlation that exists # is spurious. + ngenes <- length(unique(motif[,2])) randomExpression <- matrix(data = stats::rnorm(ngenes * sampSize), nrow = ngenes) - rownames(randomExpression) <- paste0("gene", 1:ngenes) + rownames(randomExpression) <- unique(motif[,2]) # Save in a temporary file. fname <- paste0("tmp_", i, ".tsv") @@ -96,25 +86,37 @@ GenerateNullPANDADistribution <- function(ngenes = 20000, nTranscriptionFactors quote = FALSE) # Run PANDA. - nullPanda <- pandaPy(expr_file = fname, motif_file=motiffname, ppi_file=ppifname, save_tmp=FALSE)$panda + nullPanda <- pandaPy(expr_file = fname, motif_file=motifFile, ppi_file=ppiFile, save_tmp=FALSE, + save_memory=TRUE)$panda rownames(nullPanda) <- paste(nullPanda$TF, nullPanda$Gene, sep = "__") + hist(nullPanda[,3]) # Remove file. unlink(fname) # Return null results. - tfListToExclude <- c("tf1", "tf2", "tf3", "tf4", "tf5") - geneListToExclude <- c("gene1", "gene2", "gene3", "gene1", "gene1") - rowsToExclude <- paste(tfListToExclude, geneListToExclude, sep = "__") + rownames(motif) <- paste(motif[,1], motif[,2], sep = "__") + rowsToExclude <- unlist(lapply(rownames(motif), function(edge){ + + # Find other transcription factors that interact with this one. + tf <- strsplit(edge, "__")[[1]][1] + interactingTF <- unique(c(ppi[which(ppi[,2] == tf),1], ppi[which(ppi[,1] == tf),2])) + + # Create new edges including other transcription factors. + newEdges <- paste(interactingTF, edge, sep = "__") + str(newEdges) + + # Return the old and new edges. + return(c(edge, newEdges)) + })) + + # Return the scores for all tf-gene relationships not in the original motif. return(nullPanda[setdiff(rownames(nullPanda), rowsToExclude),"Score"]) }) - - # Remove files. - unlink(motiffname) - unlink(ppifname) # Return the values. - return(unlist(nullPandas)) + nullPandasAll <- unlist(nullPandas) + return(sample(nullPandasAll, n = length(nullPandasAll))) } #' Find the subnetwork of significant edges connecting the genes. @@ -143,6 +145,24 @@ BuildSubnetwork <- function(geneSet, networks, alpha, hopConstraint, nullDistrib combinedNetwork[,2+i] <- networksNamed[[i]]$score } + # # Find all significant edges in the network. + # ourEdgeVals <- combinedNetwork[, 3:ncol(combinedNetwork)] + # nullEdgeVals <- t(matrix(rep(nullDistribution, nrow(combinedNetwork)), ncol = nrow(combinedNetwork))) + # pValues <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, y = nullEdgeVals, alternative = "greater")$pvalue + # + # # Adjust the p-values. + # if(doFDRAdjustment == TRUE){ + # pValues <- stats::p.adjust(pValues, method = "fdr") + # } + # whichSig <- which(pValues < alpha) + # + # # Subset the network. + # significantEdges <- rownames(combinedNetwork)[whichSig] + # subnetwork <- combinedNetwork[significantEdges, c(1:2)] + # if(verbose == TRUE){ + # message(paste("Retained", length(significantEdges), "out of", length(rownames(combinedNetwork)), "edges")) + # } + # For each gene, find the significant edges from each hop. significantSubnetworks <- FindSignificantEdgesForHop(geneSet = geneSet, combinedNetwork = combinedNetwork, diff --git a/man/GenerateNullPANDADistribution.Rd b/man/GenerateNullPANDADistribution.Rd index 707fef48..edab243d 100644 --- a/man/GenerateNullPANDADistribution.Rd +++ b/man/GenerateNullPANDADistribution.Rd @@ -10,20 +10,25 @@ gene in question. We obtain this by inputting an empty prior and an identity coe matrix.} \usage{ GenerateNullPANDADistribution( - ngenes = 20000, - nTranscriptionFactors = 600, + ppiFile, + motifFile, sampSize = 20, numberOfPandas = 10 ) } \arguments{ -\item{ngenes}{Number of genes to simulate} - -\item{nTranscriptionFactors}{Number of transcription factors to simulate} +\item{ppiFile}{The location of the protein-protein interaction network between transcription factors. +This should be a TSV file where the first two columns are the transcription +factors and the third is whether there is a PPI between them.} \item{sampSize}{Number of samples to simulate} \item{numberOfPandas}{Number of null PANDA networks to generate} + +\item{motif}{The location of the motif prior from genes to transcription factors. This should +be a TSV file where the first column is the transcription factors, the +second is the genes, and the third is whether the transcription factor's +binding motif is in the gene promoter region.} } \description{ Generate a null distribution of edge scores for PANDA-like networks; that is, From f1b48ad80eb957b8c965c87937a72eae15456f9b Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Tue, 28 May 2024 14:03:44 -0400 Subject: [PATCH 06/17] Moved the significance testing to the beginning and added FDR adjustment --- R/BLOBFISH.R | 179 ++++++++++++++++----------- man/BuildSubnetwork.Rd | 17 ++- man/FindSignificantEdgesForHop.Rd | 7 +- man/RunBLOBFISH.Rd | 17 ++- man/SignificantBreadthFirstSearch.Rd | 14 +-- 5 files changed, 150 insertions(+), 84 deletions(-) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index 7ded2a48..13a38c2f 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -10,10 +10,18 @@ #' @param nullDistribution The null distribution, specified as a vector of values. #' @param verbose Whether or not to print detailed information about the run. #' @param topX Select the X lowest significant p-values for each gene. NULL by default. +#' @param doFDRAdjustment Whether or not to perform FDR adjustment. +#' @param pValueChunks The number of chunks to split when calculating the p-value. This +#' parameter allows the edges to be split into chunks to prevent memory errors. +#' @param pValueFile The file where the p-values should be saved. If NULL, they are not +#' saved and need to be recalculated. +#' @param loadPValues Whether p-values should be loaded from pValueFile or re-generated. +#' Default is FALSE. #' @returns A bipartite subnetwork in the same format as the original networks. #' @export RunBLOBFISH <- function(geneSet, networks, alpha, hopConstraint, nullDistribution, - verbose = FALSE, topX = NULL){ + verbose = FALSE, topX = NULL, doFDRAdjustment = TRUE, + pValueChunks = 100, loadPValues = FALSE, pValueFile = "pvalues.RDS"){ # Check for invalid inputs. if(!is.character(geneSet) || !is.list(networks) || !is.numeric(alpha) || !is.numeric(hopConstraint)){ @@ -39,7 +47,11 @@ RunBLOBFISH <- function(geneSet, networks, alpha, hopConstraint, nullDistributio alpha = alpha, hopConstraint = hopConstraint, nullDistribution = nullDistribution, - verbose = verbose, topX = topX) + verbose = verbose, topX = topX, + doFDRAdjustment = doFDRAdjustment, + pValueChunks = pValueChunks, + loadPValues = loadPValues, + pValueFile = pValueFile) # Return. return(subnetwork) @@ -129,46 +141,87 @@ GenerateNullPANDADistribution <- function(ppiFile, motifFile, sampSize = 20, #' @param nullDistribution The null distribution, specified as a vector of values. #' @param verbose Whether or not to print detailed information about the run. #' @param topX Select the X lowest significant p-values for each gene. NULL by default. +#' @param doFDRAdjustment Whether or not to perform FDR adjustment. +#' @param pValueChunks The number of chunks to split when calculating the p-value. This +#' parameter allows the edges to be split into chunks to prevent memory errors. +#' @param pValueFile The file where the p-values should be saved. If NULL, they are not +#' saved and need to be recalculated. +#' @param loadPValues Whether p-values should be loaded from pValueFile or re-generated. +#' Default is FALSE. #' @returns A bipartite subnetwork in the same format as the original networks. BuildSubnetwork <- function(geneSet, networks, alpha, hopConstraint, nullDistribution, - verbose = FALSE, topX = NULL){ + verbose = FALSE, topX = NULL, doFDRAdjustment = TRUE, + pValueChunks = 100, loadPValues = FALSE, pValueFile = "pvalues.RDS"){ # Name edges for each network. networksNamed <- lapply(networks, function(network){ rownames(network) <- paste(network$tf, network$gene, sep = "__") return(network) }) - + # Paste together the networks. combinedNetwork <- networksNamed[[1]] for(i in 2:length(networksNamed)){ combinedNetwork[,2+i] <- networksNamed[[i]]$score } - # # Find all significant edges in the network. - # ourEdgeVals <- combinedNetwork[, 3:ncol(combinedNetwork)] - # nullEdgeVals <- t(matrix(rep(nullDistribution, nrow(combinedNetwork)), ncol = nrow(combinedNetwork))) - # pValues <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, y = nullEdgeVals, alternative = "greater")$pvalue - # - # # Adjust the p-values. - # if(doFDRAdjustment == TRUE){ - # pValues <- stats::p.adjust(pValues, method = "fdr") - # } - # whichSig <- which(pValues < alpha) - # - # # Subset the network. - # significantEdges <- rownames(combinedNetwork)[whichSig] - # subnetwork <- combinedNetwork[significantEdges, c(1:2)] - # if(verbose == TRUE){ - # message(paste("Retained", length(significantEdges), "out of", length(rownames(combinedNetwork)), "edges")) - # } + # Find all significant edges in the network. + pValues <- rep(NA, nrow(combinedNetwork)) + + # If we are calculating p-values for the first time, calculate and save them. + if(loadPValues == FALSE){ + startIndex <- 1 + endIndex <- min(startIndex + ceiling(nrow(combinedNetwork) / pValueChunks), + nrow(combinedNetwork)) + for(i in 1:pValueChunks){ + + # Calculate p-values for this chunk. + ourEdgeVals <- combinedNetwork[startIndex:endIndex, 3:ncol(combinedNetwork)] + nullEdgeVals <- t(matrix(rep(nullDistribution, + nrow(ourEdgeVals)), ncol = nrow(ourEdgeVals))) + pValues[startIndex:endIndex] <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, + y = nullEdgeVals, + alternative = "greater")$pvalue + + # Print status. + if(verbose == TRUE){ + message(paste("Completed p-values for chunk", i, "out of", pValueChunks)) + } + + # Update indices. + startIndex <- endIndex + 1 + endIndex <- min(startIndex + ceiling(nrow(combinedNetwork) / pValueChunks), + nrow(combinedNetwork)) + } + + # Adjust the p-values. + if(doFDRAdjustment == TRUE){ + pValues <- stats::p.adjust(pValues, method = "fdr") + } + names(pValues) <- rownames(combinedNetwork) + + # Save the p-values. + if(!is.null(pValueFile)){ + saveRDS(pValues, pValueFile) + } + }else{ + # Read the saved p-values. + pValues <- readRDS(pValueFile) + } + + # Subset the network. + whichSig <- which(pValues < alpha) + significantEdges <- rownames(combinedNetwork)[whichSig] + subnetwork <- combinedNetwork[significantEdges, c(1:2)] + pValues <- pValues[significantEdges] + if(verbose == TRUE){ + message(paste("Retained", length(significantEdges), "out of", length(rownames(combinedNetwork)), "edges")) + } # For each gene, find the significant edges from each hop. - significantSubnetworks <- FindSignificantEdgesForHop(geneSet = geneSet, - combinedNetwork = combinedNetwork, - alpha = alpha, + significantSubnetworks <- FindSignificantEdgesForHop(geneSet = geneSet, pValues = pValues, + combinedNetwork = subnetwork, hopConstraint = hopConstraint / 2, - nullDistribution = nullDistribution, verbose = verbose, topX = topX) # Find matches for each hop. Note that we do not need to consider cases where the @@ -185,13 +238,12 @@ BuildSubnetwork <- function(geneSet, networks, alpha, hopConstraint, nullDistrib #' @param geneSet A character vector of genes comprising the targets of interest. #' @param combinedNetwork A concatenation of n PANDA-like networks with the following format: #' tf,gene,score_net1, score_net2, ... , score_netn -#' @param alpha The significance cutoff for the statistical test. +#' @param pValues The p-values for all edges. #' @param hopConstraint The maximum number of hops to be considered for a gene. -#' @param nullDistribution The null distribution, specified as a vector of values. #' @param verbose Whether or not to print detailed information about the run. #' @param topX Select the X lowest significant p-values for each gene. NULL by default. #' @returns A bipartite subnetwork in the same format as the original networks. -FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, alpha, hopConstraint, nullDistribution, +FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, hopConstraint, pValues, verbose = FALSE, topX = NULL){ # Build the significant subnetwork for each gene, up to the hop constraint. uniqueGeneSet <- sort(unique(geneSet)) @@ -202,13 +254,13 @@ FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, alpha, hopConst message(paste("Evaluating hop 1 for gene", gene)) } subnetwork1Hop <- SignificantBreadthFirstSearch(networks = combinedNetwork, - alpha = alpha, + pValues = pValues, startingNodes = gene, nodesToExclude = c(), - startFromTF = FALSE, doFDRAdjustment = FALSE, - nullDistribution, verbose = verbose, + startFromTF = FALSE, + verbose = verbose, topX = topX) - + # Set the starting and excluded set for the next hop. startingNodes <- unique(subnetwork1Hop$tf) topXNew <- NULL @@ -234,14 +286,13 @@ FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, alpha, hopConst message(paste("Evaluating hop", hop, "for gene", gene)) } subnetworkHops <- SignificantBreadthFirstSearch(networks = combinedNetwork, - alpha = alpha, + pValues = pValues, startingNodes = startingNodes, nodesToExclude = excludedSubset, - startFromTF = TRUE, doFDRAdjustment = FALSE, - nullDistribution = nullDistribution, + startFromTF = TRUE, verbose = verbose, topX = topXNew) - + # Set the starting and excluded set for the next hop. excludedSubset <- c(excludedSubset, startingNodes) startingNodes <- setdiff(unique(subnetworkHops$gene), excludedSubset) @@ -255,14 +306,13 @@ FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, alpha, hopConst message(paste("Evaluating hop", hop, "for gene", gene)) } subnetworkHops <- SignificantBreadthFirstSearch(networks = combinedNetwork, - alpha = alpha, + pValues = pValues, startingNodes = startingNodes, nodesToExclude = excludedSubset, - startFromTF = FALSE, doFDRAdjustment = FALSE, - nullDistribution = nullDistribution, + startFromTF = FALSE, verbose = verbose, topX = topXNew) - + # Set the starting and excluded set for the next hop. excludedSubset <- c(excludedSubset, startingNodes) startingNodes <- setdiff(unique(subnetworkHops$tf), excludedSubset) @@ -273,7 +323,7 @@ FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, alpha, hopConst # Add to the list. allSubnetworksForGene[[length(allSubnetworksForGene) + 1]] <- subnetworkHops - + # Increment hops. hop <- hop + 1 } @@ -290,7 +340,7 @@ FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, alpha, hopConst #' @param networks A concatenation of n PANDA-like networks with the following format: #' tf,gene,score_net1, score_net2, ... , score_netn #' Edges must be specified as "tf__gene". -#' @param alpha The significance cutoff for the statistical test. +#' @param pValues The p-values from the original network. #' @param startingNodes The list of nodes from which to start. #' @param nodesToExclude The list of nodes to exclude from the search. #' @param startFromTF Whether to start from transcription factors (TRUE) or genes (FALSE). @@ -299,9 +349,9 @@ FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, alpha, hopConst #' @param verbose Whether or not to print detailed information about the run. #' @param topX Select the X lowest significant p-values for each gene. NULL by default. #' @returns A bipartite subnetwork in the same format as the original networks. -SignificantBreadthFirstSearch <- function(networks, alpha, startingNodes, - nodesToExclude, startFromTF, doFDRAdjustment = FALSE, - nullDistribution, verbose = FALSE, topX = NULL){ +SignificantBreadthFirstSearch <- function(networks, pValues, startingNodes, + nodesToExclude, startFromTF, + verbose = FALSE, topX = NULL){ # Check that provided nodes overlap with the networks. if((length(setdiff(startingNodes, networks$tf)) > 0 && startFromTF == TRUE) || (length(setdiff(startingNodes, networks$gene)) > 0 && startFromTF == FALSE)){ @@ -331,33 +381,24 @@ SignificantBreadthFirstSearch <- function(networks, alpha, startingNodes, tfLongList <- unlist(lapply(tfsToTest, function(tf){ return(rep(tf, length(genesToTest))) })) - allEdges <- paste(tfLongList, geneLongList, sep = "__") - + allPossibleEdges <- paste(tfLongList, geneLongList, sep = "__") + allEdges <- intersect(allPossibleEdges, rownames(networks)) + # For each edge, measure its significance. - subnetwork <- networks[c(), 3:ncol(networks)] + subnetwork <- networks if(length(allEdges) > 0){ - ourEdgeVals <- networks[allEdges, 3:ncol(networks)] - #nullDistribution <- sample(nullDistribution, size = floor(length(nullDistribution) / length(allEdges)) * length(allEdges)) - nullEdgeVals <- t(matrix(rep(nullDistribution, length(allEdges)), ncol = length(allEdges))) - pValues <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, y = nullEdgeVals, alternative = "greater")$pvalue - - # Adjust the p-values. - if(doFDRAdjustment == TRUE){ - pValues <- stats::p.adjust(pValues, method = "fdr") - } - whichSig <- which(pValues < alpha) - + # If topX is specified, filter again. - if(!is.null(topX) && length(whichSig) > topX){ - whichTopX <- order(pValues[whichSig])[1:topX] - whichSig <- whichSig[whichTopX] + significantEdges <- allEdges + if(!is.null(topX) && length(ourEdgeVals) > topX){ + whichTopX <- order(pValues[ourEdgeVals])[1:topX] + significantEdges <- allEdges[whichTopX] } # Return the edges meeting alpha. - significantEdges <- allEdges[whichSig] subnetwork <- networks[significantEdges, c(1:2)] if(verbose == TRUE){ - message(paste("Retained", length(significantEdges), "out of", length(allEdges), "edges")) + message(paste("Retained", length(significantEdges), "edges")) } } @@ -386,7 +427,7 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ gene1 <- names(subnetworks)[i] gene2 <- names(subnetworks)[j] connectingSubnetwork <- data.frame(tf = NA, gene = NA)[0,] - + # If there were no edges at this hop count for one or both genes, # do not evaluate. if(length(subnetworks[[gene1]]) >= hops && length(subnetworks[[gene2]]) >= hops){ @@ -451,7 +492,7 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ } } } - + # Return the subnetwork, which should now contain all of the edges connecting the # gene pair at the prespecified number of hops. return(connectingSubnetwork) @@ -460,11 +501,11 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ connectingSubnetworkAll <- do.call(rbind, genePairSpecificHopCountSubnetwork) return(connectingSubnetworkAll) }) - + # Bind together the subnetworks for each gene. return(do.call(rbind, geneSpecificHopCountSubnetwork)) }) - + # Bind together the subnetworks for each hop count. compositeSubnetwork <- do.call(rbind, hopCountSubnetworks) compositeSubnetworkEdges <- paste(compositeSubnetwork$tf, compositeSubnetwork$gene, sep = "__") @@ -485,7 +526,7 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ genesToRemove <- setdiff(genesToRemove, names(subnetworks)) tfsToRemove <- names(tfCounts)[which(tfCounts == 1)] compositeSubnetworkDedup <- compositeSubnetworkDedup[which(compositeSubnetworkDedup$gene %in% setdiff(names(geneCounts), - genesToRemove)),] + genesToRemove)),] compositeSubnetworkDedup <- compositeSubnetworkDedup[which(compositeSubnetworkDedup$tf %in% setdiff(names(tfCounts), tfsToRemove)),] return(compositeSubnetworkDedup) diff --git a/man/BuildSubnetwork.Rd b/man/BuildSubnetwork.Rd index ff686596..4cb57a09 100644 --- a/man/BuildSubnetwork.Rd +++ b/man/BuildSubnetwork.Rd @@ -11,7 +11,11 @@ BuildSubnetwork( hopConstraint, nullDistribution, verbose = FALSE, - topX = NULL + topX = NULL, + doFDRAdjustment = TRUE, + pValueChunks = 100, + loadPValues = FALSE, + pValueFile = "pvalues.RDS" ) } \arguments{ @@ -30,6 +34,17 @@ Must be an even number.} \item{verbose}{Whether or not to print detailed information about the run.} \item{topX}{Select the X lowest significant p-values for each gene. NULL by default.} + +\item{doFDRAdjustment}{Whether or not to perform FDR adjustment.} + +\item{pValueChunks}{The number of chunks to split when calculating the p-value. This +parameter allows the edges to be split into chunks to prevent memory errors.} + +\item{loadPValues}{Whether p-values should be loaded from pValueFile or re-generated. +Default is FALSE.} + +\item{pValueFile}{The file where the p-values should be saved. If NULL, they are not +saved and need to be recalculated.} } \value{ A bipartite subnetwork in the same format as the original networks. diff --git a/man/FindSignificantEdgesForHop.Rd b/man/FindSignificantEdgesForHop.Rd index f737b1e9..12cc0a1a 100644 --- a/man/FindSignificantEdgesForHop.Rd +++ b/man/FindSignificantEdgesForHop.Rd @@ -7,9 +7,8 @@ FindSignificantEdgesForHop( geneSet, combinedNetwork, - alpha, hopConstraint, - nullDistribution, + pValues, verbose = FALSE, topX = NULL ) @@ -20,11 +19,9 @@ FindSignificantEdgesForHop( \item{combinedNetwork}{A concatenation of n PANDA-like networks with the following format: tf,gene,score_net1, score_net2, ... , score_netn} -\item{alpha}{The significance cutoff for the statistical test.} - \item{hopConstraint}{The maximum number of hops to be considered for a gene.} -\item{nullDistribution}{The null distribution, specified as a vector of values.} +\item{pValues}{The p-values for all edges.} \item{verbose}{Whether or not to print detailed information about the run.} diff --git a/man/RunBLOBFISH.Rd b/man/RunBLOBFISH.Rd index b3929034..c00eea32 100644 --- a/man/RunBLOBFISH.Rd +++ b/man/RunBLOBFISH.Rd @@ -13,7 +13,11 @@ RunBLOBFISH( hopConstraint, nullDistribution, verbose = FALSE, - topX = NULL + topX = NULL, + doFDRAdjustment = TRUE, + pValueChunks = 100, + loadPValues = FALSE, + pValueFile = "pvalues.RDS" ) } \arguments{ @@ -32,6 +36,17 @@ Must be an even number.} \item{verbose}{Whether or not to print detailed information about the run.} \item{topX}{Select the X lowest significant p-values for each gene. NULL by default.} + +\item{doFDRAdjustment}{Whether or not to perform FDR adjustment.} + +\item{pValueChunks}{The number of chunks to split when calculating the p-value. This +parameter allows the edges to be split into chunks to prevent memory errors.} + +\item{loadPValues}{Whether p-values should be loaded from pValueFile or re-generated. +Default is FALSE.} + +\item{pValueFile}{The file where the p-values should be saved. If NULL, they are not +saved and need to be recalculated.} } \value{ A bipartite subnetwork in the same format as the original networks. diff --git a/man/SignificantBreadthFirstSearch.Rd b/man/SignificantBreadthFirstSearch.Rd index c2d4a0c2..2e6e531a 100644 --- a/man/SignificantBreadthFirstSearch.Rd +++ b/man/SignificantBreadthFirstSearch.Rd @@ -7,12 +7,10 @@ specified.} \usage{ SignificantBreadthFirstSearch( networks, - alpha, + pValues, startingNodes, nodesToExclude, startFromTF, - doFDRAdjustment = FALSE, - nullDistribution, verbose = FALSE, topX = NULL ) @@ -22,7 +20,7 @@ SignificantBreadthFirstSearch( tf,gene,score_net1, score_net2, ... , score_netn Edges must be specified as "tf__gene".} -\item{alpha}{The significance cutoff for the statistical test.} +\item{pValues}{The p-values from the original network.} \item{startingNodes}{The list of nodes from which to start.} @@ -30,13 +28,13 @@ Edges must be specified as "tf__gene".} \item{startFromTF}{Whether to start from transcription factors (TRUE) or genes (FALSE).} -\item{doFDRAdjustment}{Whether or not to adjust the p-values using FDR.} - -\item{nullDistribution}{The null distribution, specified as a vector of values.} - \item{verbose}{Whether or not to print detailed information about the run.} \item{topX}{Select the X lowest significant p-values for each gene. NULL by default.} + +\item{doFDRAdjustment}{Whether or not to adjust the p-values using FDR.} + +\item{nullDistribution}{The null distribution, specified as a vector of values.} } \value{ A bipartite subnetwork in the same format as the original networks. From 0f03e64abcc691fcf54e836efcff540615e20c38 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Tue, 28 May 2024 14:14:31 -0400 Subject: [PATCH 07/17] Moved calculation of p-values to a separate function --- NAMESPACE | 1 + R/BLOBFISH.R | 92 ++++++++++++++++++++++++++--------------- man/CalculatePValues.Rd | 37 +++++++++++++++++ 3 files changed, 96 insertions(+), 34 deletions(-) create mode 100644 man/CalculatePValues.Rd diff --git a/NAMESPACE b/NAMESPACE index fe1ec11e..91d1aee3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(CalculatePValues) export(GenerateNullPANDADistribution) export(PlotNetwork) export(RunBLOBFISH) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index 13a38c2f..9913fe0b 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -170,40 +170,11 @@ BuildSubnetwork <- function(geneSet, networks, alpha, hopConstraint, nullDistrib # If we are calculating p-values for the first time, calculate and save them. if(loadPValues == FALSE){ - startIndex <- 1 - endIndex <- min(startIndex + ceiling(nrow(combinedNetwork) / pValueChunks), - nrow(combinedNetwork)) - for(i in 1:pValueChunks){ - - # Calculate p-values for this chunk. - ourEdgeVals <- combinedNetwork[startIndex:endIndex, 3:ncol(combinedNetwork)] - nullEdgeVals <- t(matrix(rep(nullDistribution, - nrow(ourEdgeVals)), ncol = nrow(ourEdgeVals))) - pValues[startIndex:endIndex] <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, - y = nullEdgeVals, - alternative = "greater")$pvalue - - # Print status. - if(verbose == TRUE){ - message(paste("Completed p-values for chunk", i, "out of", pValueChunks)) - } - - # Update indices. - startIndex <- endIndex + 1 - endIndex <- min(startIndex + ceiling(nrow(combinedNetwork) / pValueChunks), - nrow(combinedNetwork)) - } - - # Adjust the p-values. - if(doFDRAdjustment == TRUE){ - pValues <- stats::p.adjust(pValues, method = "fdr") - } - names(pValues) <- rownames(combinedNetwork) - - # Save the p-values. - if(!is.null(pValueFile)){ - saveRDS(pValues, pValueFile) - } + pValues <- CalculatePValues(network = combinedNetwork, + pValueChunks = pValueChunks, + nullDistribution = nullDistribution, + doFDRAdjustment = doFDRAdjustment, + pValueFile = pValueFile) }else{ # Read the saved p-values. pValues <- readRDS(pValueFile) @@ -233,6 +204,59 @@ BuildSubnetwork <- function(geneSet, networks, alpha, hopConstraint, nullDistrib verbose = verbose) return(subnetwork) } + +#' Calculate p-values for all edges in the network using a Wilcoxon two-sample test +#' for each edge. +#' @param network A combination of PANDA-like networks, with the following format +#' (e.g., 3 networks), provided as a data frame: +#' tf,gene,score1,score2,score3 +#' @param nullDistribution The null distribution, specified as a vector of values. +#' @param pValueChunks The number of chunks to split when calculating the p-value. This +#' parameter allows the edges to be split into chunks to prevent memory errors. +#' @param doFDRAdjustment Whether or not to perform FDR adjustment. +#' @param pValueFile The file where the p-values should be saved. If NULL, they are not +#' saved and need to be recalculated. +#' @returns A vector of p-values, one for each edge. +#' @export +CalculatePValues <- function(network, nullDistribution, pValueChunks = 100, + doFDRAdjustment = TRUE, pValueFile = "pvalues.RDS"){ + + # Set the initial start and end indices. + startIndex <- 1 + endIndex <- min(startIndex + ceiling(nrow(combinedNetwork) / pValueChunks), + nrow(combinedNetwork)) + for(i in 1:pValueChunks){ + + # Calculate p-values for this chunk. + ourEdgeVals <- combinedNetwork[startIndex:endIndex, 3:ncol(combinedNetwork)] + nullEdgeVals <- t(matrix(rep(nullDistribution, + nrow(ourEdgeVals)), ncol = nrow(ourEdgeVals))) + pValues[startIndex:endIndex] <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, + y = nullEdgeVals, + alternative = "greater")$pvalue + + # Print status. + if(verbose == TRUE){ + message(paste("Completed p-values for chunk", i, "out of", pValueChunks)) + } + + # Update indices. + startIndex <- endIndex + 1 + endIndex <- min(startIndex + ceiling(nrow(combinedNetwork) / pValueChunks), + nrow(combinedNetwork)) + } + + # Adjust the p-values. + if(doFDRAdjustment == TRUE){ + pValues <- stats::p.adjust(pValues, method = "fdr") + } + names(pValues) <- rownames(combinedNetwork) + + # Save the p-values. + if(!is.null(pValueFile)){ + saveRDS(pValues, pValueFile) + } +} #' Find the subnetwork of significant edges n / 2 hops away from each gene. #' @param geneSet A character vector of genes comprising the targets of interest. diff --git a/man/CalculatePValues.Rd b/man/CalculatePValues.Rd new file mode 100644 index 00000000..0f91f425 --- /dev/null +++ b/man/CalculatePValues.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/BLOBFISH.R +\name{CalculatePValues} +\alias{CalculatePValues} +\title{Calculate p-values for all edges in the network using a Wilcoxon two-sample test +for each edge.} +\usage{ +CalculatePValues( + network, + nullDistribution, + pValueChunks = 100, + doFDRAdjustment = TRUE, + pValueFile = "pvalues.RDS" +) +} +\arguments{ +\item{network}{A combination of PANDA-like networks, with the following format +(e.g., 3 networks), provided as a data frame: +tf,gene,score1,score2,score3} + +\item{nullDistribution}{The null distribution, specified as a vector of values.} + +\item{pValueChunks}{The number of chunks to split when calculating the p-value. This +parameter allows the edges to be split into chunks to prevent memory errors.} + +\item{doFDRAdjustment}{Whether or not to perform FDR adjustment.} + +\item{pValueFile}{The file where the p-values should be saved. If NULL, they are not +saved and need to be recalculated.} +} +\value{ +A vector of p-values, one for each edge. +} +\description{ +Calculate p-values for all edges in the network using a Wilcoxon two-sample test +for each edge. +} From a1a3981eeca47d11367824dbbd2022276da13411 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Tue, 28 May 2024 14:58:20 -0400 Subject: [PATCH 08/17] Updated tests for BLOBFISH --- R/BLOBFISH.R | 22 +++++--- man/CalculatePValues.Rd | 5 +- tests/testthat/test-blobfish.R | 94 ++++++++++++++++------------------ 3 files changed, 64 insertions(+), 57 deletions(-) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index 9913fe0b..30863caa 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -216,19 +216,24 @@ BuildSubnetwork <- function(geneSet, networks, alpha, hopConstraint, nullDistrib #' @param doFDRAdjustment Whether or not to perform FDR adjustment. #' @param pValueFile The file where the p-values should be saved. If NULL, they are not #' saved and need to be recalculated. +#' @param verbose Whether or not to print detailed information about the run. #' @returns A vector of p-values, one for each edge. #' @export CalculatePValues <- function(network, nullDistribution, pValueChunks = 100, - doFDRAdjustment = TRUE, pValueFile = "pvalues.RDS"){ + doFDRAdjustment = TRUE, pValueFile = "pvalues.RDS", + verbose = FALSE){ + + # Initialize p-values. + pValues <- rep(NA, nrow(network)) # Set the initial start and end indices. startIndex <- 1 - endIndex <- min(startIndex + ceiling(nrow(combinedNetwork) / pValueChunks), - nrow(combinedNetwork)) + endIndex <- min(startIndex + ceiling(nrow(network) / pValueChunks), + nrow(network)) for(i in 1:pValueChunks){ # Calculate p-values for this chunk. - ourEdgeVals <- combinedNetwork[startIndex:endIndex, 3:ncol(combinedNetwork)] + ourEdgeVals <- network[startIndex:endIndex, 3:ncol(network)] nullEdgeVals <- t(matrix(rep(nullDistribution, nrow(ourEdgeVals)), ncol = nrow(ourEdgeVals))) pValues[startIndex:endIndex] <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, @@ -242,20 +247,23 @@ CalculatePValues <- function(network, nullDistribution, pValueChunks = 100, # Update indices. startIndex <- endIndex + 1 - endIndex <- min(startIndex + ceiling(nrow(combinedNetwork) / pValueChunks), - nrow(combinedNetwork)) + endIndex <- min(startIndex + ceiling(nrow(network) / pValueChunks), + nrow(network)) } # Adjust the p-values. if(doFDRAdjustment == TRUE){ pValues <- stats::p.adjust(pValues, method = "fdr") } - names(pValues) <- rownames(combinedNetwork) + names(pValues) <- rownames(network) # Save the p-values. if(!is.null(pValueFile)){ saveRDS(pValues, pValueFile) } + + # Return the p-values. + return(pValues) } #' Find the subnetwork of significant edges n / 2 hops away from each gene. diff --git a/man/CalculatePValues.Rd b/man/CalculatePValues.Rd index 0f91f425..d10b5959 100644 --- a/man/CalculatePValues.Rd +++ b/man/CalculatePValues.Rd @@ -10,7 +10,8 @@ CalculatePValues( nullDistribution, pValueChunks = 100, doFDRAdjustment = TRUE, - pValueFile = "pvalues.RDS" + pValueFile = "pvalues.RDS", + verbose = FALSE ) } \arguments{ @@ -27,6 +28,8 @@ parameter allows the edges to be split into chunks to prevent memory errors.} \item{pValueFile}{The file where the p-values should be saved. If NULL, they are not saved and need to be recalculated.} + +\item{verbose}{Whether or not to print detailed information about the run.} } \value{ A vector of p-values, one for each edge. diff --git a/tests/testthat/test-blobfish.R b/tests/testthat/test-blobfish.R index 27438f31..12013b40 100644 --- a/tests/testthat/test-blobfish.R +++ b/tests/testthat/test-blobfish.R @@ -36,59 +36,48 @@ test_that("[BLOBFISH] SignificantBreadthFirstSearch() function yields expected r # Use all combined scores as the null distribution. null <- stats::rnorm(n = nrow(fullNetworks) * 3) / 100 + # Calculate the p-values. + pvalues <- CalculatePValues(network = fullNetworks, pValueChunks = 2, + nullDistribution = null, verbose = TRUE) + whichSig <- which(pvalues < 0.1) + significantEdges <- rownames(fullNetworks)[whichSig] + subnetwork <- fullNetworks[significantEdges, c(1:2)] + pvalues = pvalues[significantEdges] + # Test that errors are thrown when appropriate. - expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.05, startingNodes = c("blob", "fish"), - nodesToExclude = c(), startFromTF = TRUE, - nullDistribution = null), + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("blob", "fish"), + nodesToExclude = c(), startFromTF = TRUE), "ERROR: Starting nodes do not overlap with network nodes") - expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.05, startingNodes = c("geneA", "geneB"), - nodesToExclude = c(), startFromTF = TRUE, - nullDistribution = null), + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("geneA", "geneB"), + nodesToExclude = c(), startFromTF = TRUE), "ERROR: Starting nodes do not overlap with network nodes") - expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.05, startingNodes = c("tf1", "tf2"), - nodesToExclude = c(), startFromTF = FALSE, - nullDistribution = null), + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("tf1", "tf2"), + nodesToExclude = c(), startFromTF = FALSE), "ERROR: Starting nodes do not overlap with network nodes") - expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.05, startingNodes = c("geneA", "geneB", "geneC"), - nodesToExclude = c("blob", "fish"), startFromTF = FALSE, - nullDistribution = null), + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("geneA", "geneB", "geneC"), + nodesToExclude = c("blob", "fish"), startFromTF = FALSE), "ERROR: List of nodes to exclude does not overlap with network nodes") - expect_error(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.05, startingNodes = c("geneA", "geneB", "geneC"), - nodesToExclude = c("geneA", "geneB"), startFromTF = FALSE, - nullDistribution = null), + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("geneA", "geneB", "geneC"), + nodesToExclude = c("geneA", "geneB"), startFromTF = FALSE), "ERROR: Starting nodes cannot overlap with nodes to exclude") # Ensure that, when starting from genes A, B, and C, we obtain the correct values. expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf4__geneC"), - rownames(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.05, startingNodes = c("geneA", "geneB", "geneC"), - nodesToExclude = c(), startFromTF = FALSE, doFDRAdjustment = FALSE, - nullDistribution = null)))) == 0) + rownames(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("geneA", "geneB", "geneC"), + nodesToExclude = c(), startFromTF = FALSE)))) == 0) # Ensure that, when starting from transcription factors TF2, TF3, and TF4 and removing genes A, B, and C, we obtain the correct values. expect_true(length(setdiff(c("tf3__geneD", "tf4__geneD"), - rownames(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.05, startingNodes = c("tf2", "tf3", "tf4"), - nodesToExclude = c("geneA", "geneB", "geneC"), startFromTF = TRUE, doFDRAdjustment = TRUE, - nullDistribution = null)))) == 0) - - # Ensure that we obtain nothing when we include nothing significant. - expect_true(length(intersect(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf4__geneC"), rownames(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.05, startingNodes = c("1", "2", "3"), - nodesToExclude = c("geneA", "geneB", "geneC"), startFromTF = TRUE, doFDRAdjustment = FALSE, - nullDistribution = null)))) == 0) - - # Ensure that we obtain nothing when everything is excluded. - expect_equal(length(rownames(SignificantBreadthFirstSearch(networks = fullNetworks, - alpha = 0.05, startingNodes = c("tf1", "tf2", "tf3"), - nodesToExclude = unique(fullNetworks$gene), - startFromTF = TRUE, doFDRAdjustment = FALSE, - nullDistribution = null))), 0) + rownames(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("tf2", "tf3", "tf4"), + nodesToExclude = c("geneA", "geneB", "geneC"), + startFromTF = TRUE)))) == 0) }) test_that("[BLOBFISH] FindConnectionsForAllHopCounts() function yields expected results", { @@ -155,6 +144,14 @@ test_that("[BLOBFISH] FindSignificantEdgesForHop() function yields expected resu # Null distribution is a narrower version of the normal distribution. null <- rnorm(n = nrow(fullNetworks) * 3) / 100 + # Calculate the p-values. + pvalues <- CalculatePValues(network = fullNetworks, pValueChunks = 2, + nullDistribution = null, verbose = TRUE) + whichSig <- which(pvalues < 0.1) + significantEdges <- rownames(fullNetworks)[whichSig] + subnetwork <- fullNetworks[significantEdges, c(1:2)] + pvalues = pvalues[significantEdges] + # Set up the subnetwork from the previous example. geneANetHop1 <- data.frame(tf = c("tf2", "tf3"), gene = c("geneA", "geneA")) rownames(geneANetHop1) <- paste(geneANetHop1$tf, geneANetHop1$gene, sep = "__") @@ -187,8 +184,8 @@ test_that("[BLOBFISH] FindSignificantEdgesForHop() function yields expected resu # Test method with all genes as input. sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC", "geneD"), - combinedNetwork = fullNetworks, - alpha = 0.05, hopConstraint = 3, nullDistribution = null) + combinedNetwork = subnetwork, pValues = pvalues, + hopConstraint = 3) expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) @@ -204,8 +201,7 @@ test_that("[BLOBFISH] FindSignificantEdgesForHop() function yields expected resu # Test method with only genes A, B, C. sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC"), - combinedNetwork = fullNetworks, - alpha = 0.05, hopConstraint = 3, nullDistribution = null) + combinedNetwork = subnetwork, hopConstraint = 3) expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) @@ -218,8 +214,8 @@ test_that("[BLOBFISH] FindSignificantEdgesForHop() function yields expected resu # Test method with only genes A and C. sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneC"), - combinedNetwork = fullNetworks, - alpha = 0.4, hopConstraint = 3, nullDistribution = null) + combinedNetwork = subnetwork, + hopConstraint = 3) expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) @@ -274,13 +270,13 @@ test_that("[BLOBFISH] BuildSubnetwork() function yields expected results",{ expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"), rownames(BuildSubnetwork(geneSet = c("geneA", "geneB", "geneC", "geneD"), networks = list(fullNetwork1, fullNetwork2, fullNetwork3), - alpha = 0.05, hopConstraint = 4, nullDistribution = null)))) == 0) + alpha = 0.1, hopConstraint = 4, nullDistribution = null)))) == 0) # Obtain the overlaps for only the first two hops, for only A and C. expect_true(length(setdiff(c("tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"), rownames(BuildSubnetwork(geneSet = c("geneA", "geneC"), networks = list(fullNetwork1, fullNetwork2, fullNetwork3), - alpha = 0.05, hopConstraint = 4, nullDistribution = null)))) == 0) + alpha = 0.1, hopConstraint = 4, nullDistribution = null)))) == 0) }) test_that("[BLOBFISH] RunBLOBFISH() function yields expected results",{ From 10ddc53138024e723e67a55a890fae572a1a6469 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Fri, 31 May 2024 13:57:18 -0400 Subject: [PATCH 09/17] Added color-coding functionality to the plots --- R/BLOBFISH.R | 74 ++++++++++++++++++++++++++++------------------ man/PlotNetwork.Rd | 19 ++++-------- 2 files changed, 51 insertions(+), 42 deletions(-) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index 30863caa..e5525b67 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -24,13 +24,17 @@ RunBLOBFISH <- function(geneSet, networks, alpha, hopConstraint, nullDistributio pValueChunks = 100, loadPValues = FALSE, pValueFile = "pvalues.RDS"){ # Check for invalid inputs. - if(!is.character(geneSet) || !is.list(networks) || !is.numeric(alpha) || !is.numeric(hopConstraint)){ + if(!is.character(geneSet) || (!is.list(networks) && !is.data.frame(networks)) || !is.numeric(alpha) || !is.numeric(hopConstraint)){ stop(paste("Wrong input type! geneSet must be a character vector. networks must be a list.", "alpha and hopConstraint must be scalar numeric values.")) - }else if(!is.data.frame(networks[[1]])){ + }else if(!is.data.frame(networks) && !is.data.frame(networks[[1]])){ stop("Each network must be a data frame.") - }else if(ncol(networks[[1]]) != 3 || colnames(networks[[1]])[1] != "tf" || - colnames(networks[[1]])[2] != "gene" || colnames(networks[[1]])[3] != "score"){ + }else if(!is.data.frame(networks) && (ncol(networks[[1]]) != 3 || colnames(networks[[1]])[1] != "tf" || + colnames(networks[[1]])[2] != "gene" || colnames(networks[[1]])[3] != "score")){ + stop(paste("Each network must have transcription factors in the first column,", + "target genes in the second column, and scores in the third column.")) + }else if(is.data.frame(networks) && (ncol(networks) < 3 || colnames(networks)[1] != "tf" || + colnames(networks)[2] != "gene")){ stop(paste("Each network must have transcription factors in the first column,", "target genes in the second column, and scores in the third column.")) }else if(alpha > 1 || alpha <=0){ @@ -154,15 +158,17 @@ BuildSubnetwork <- function(geneSet, networks, alpha, hopConstraint, nullDistrib pValueChunks = 100, loadPValues = FALSE, pValueFile = "pvalues.RDS"){ # Name edges for each network. - networksNamed <- lapply(networks, function(network){ - rownames(network) <- paste(network$tf, network$gene, sep = "__") - return(network) - }) - - # Paste together the networks. - combinedNetwork <- networksNamed[[1]] - for(i in 2:length(networksNamed)){ - combinedNetwork[,2+i] <- networksNamed[[i]]$score + combinedNetwork <- networks + if(!is.data.frame(combinedNetwork)){ + networksNamed <- lapply(networks, function(network){ + rownames(network) <- paste(network$tf, network$gene, sep = "__") + return(network) + }) + # Paste together the networks. + combinedNetwork <- networksNamed[[1]] + for(i in 2:length(networksNamed)){ + combinedNetwork[,2+i] <- networksNamed[[i]]$score + } } # Find all significant edges in the network. @@ -185,7 +191,10 @@ BuildSubnetwork <- function(geneSet, networks, alpha, hopConstraint, nullDistrib significantEdges <- rownames(combinedNetwork)[whichSig] subnetwork <- combinedNetwork[significantEdges, c(1:2)] pValues <- pValues[significantEdges] + genesWithNoSigEdges <- setdiff(geneSet, subnetwork$gene) + geneSet <- intersect(geneSet, subnetwork$gene) if(verbose == TRUE){ + message(paste("The following genes had no significant edges:", paste(genesWithNoSigEdges, collapse = ","))) message(paste("Retained", length(significantEdges), "out of", length(rownames(combinedNetwork)), "edges")) } @@ -569,14 +578,13 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ #' @param network A data frame with the following format: #' tf,gene #' @param genesOfInterest Which genes of interest to highlight -#' @param geneOfInterestColor Color for the genes of interest #' @param tfColor Color for the transcription factors -#' @param otherGenesColor Color for the other genes +#' @param geneColorMapping Color mapping from a set of genes to a color. The +#' nodes and edges connected to them will be this color. If NULL, all genes and +#' their edges will be gray. The format is a data frame, where the first column ("gene") +#' is the name of the gene and the second ("color") is the color. #' @param nodeSize Size of node #' @param edgeWidth Width of edges -#' @param edgeColor Color of edges -#' @param edgeWidthHop1 Width of edges 1 hop away from genes of interest. -#' @param edgeColorHop1 Color of edges 1 hop away from genes of interest. #' @param vertexLabels Which vertex labels to include. By default, none are included. #' @param layoutBipartite Whether or not to layout as a bipartite graph. #' @param vertexLabelSize The size of label to use for the vertex, as a fraction of the default. @@ -584,29 +592,39 @@ FindConnectionsForAllHopCounts <- function(subnetworks, verbose = FALSE){ #' Default is TRUE. #' @returns A bipartite plot of the network #' @export -PlotNetwork <- function(network, genesOfInterest, geneOfInterestColor = "red", - tfColor = "blue", otherGenesColor = "gray", nodeSize = 1, +PlotNetwork <- function(network, genesOfInterest, + tfColor = "blue", nodeSize = 1, edgeWidth = 0.5, vertexLabels = NA, vertexLabelSize = 0.7, - vertexLabelOffset = 0.5, layoutBipartite = TRUE, edgeColor = "gray", - edgeWidthHop1 = 0.5, edgeColorHop1 = "gray"){ + vertexLabelOffset = 0.5, layoutBipartite = TRUE, geneColorMapping = NULL){ # Set the node attributes. uniqueNodes <- unique(c(network$tf, network$gene)) nodeAttrs <- data.frame(node = uniqueNodes, - color = rep(otherGenesColor, length(uniqueNodes)), + color = rep("gray", length(uniqueNodes)), size = rep(nodeSize, length(uniqueNodes)), frame.width = rep(0, length(uniqueNodes)), label.color = "black", label.cex = vertexLabelSize, label.dist = vertexLabelOffset) rownames(nodeAttrs) <- uniqueNodes + + # Add TF colors. nodeAttrs[which(uniqueNodes %in% network$tf), "color"] <- tfColor - nodeAttrs[which(uniqueNodes %in% genesOfInterest), "color"] <- geneOfInterestColor + + # Add gene colors. + rownames(geneColorMapping) <- geneColorMapping$gene + geneColorMapping <- geneColorMapping[intersect(rownames(geneColorMapping), uniqueNodes),] + if(!is.null(geneColorMapping)){ + nodeAttrs[rownames(geneColorMapping), "color"] <- geneColorMapping$color + } + str(nodeAttrs) # Add edge attributes. - network$color <- edgeColor - network$color[which(network$gene %in% genesOfInterest)] <- edgeColorHop1 + if(!is.null(geneColorMapping)){ + for(gene in rownames(geneColorMapping)){ + network[which(network$gene == gene), "color"] <- geneColorMapping[gene, "color"] + } + } network$width <- edgeWidth - network$width[which(network$gene %in% genesOfInterest)] <- edgeWidthHop1 - + # Create a graph object. graph <- igraph::graph_from_data_frame(network, vertices = nodeAttrs, directed = FALSE) V(graph)$type <- V(graph)$name %in% network$tf diff --git a/man/PlotNetwork.Rd b/man/PlotNetwork.Rd index d36319c1..27f08536 100644 --- a/man/PlotNetwork.Rd +++ b/man/PlotNetwork.Rd @@ -8,18 +8,14 @@ and additional genes.} PlotNetwork( network, genesOfInterest, - geneOfInterestColor = "red", tfColor = "blue", - otherGenesColor = "gray", nodeSize = 1, edgeWidth = 0.5, vertexLabels = NA, vertexLabelSize = 0.7, vertexLabelOffset = 0.5, layoutBipartite = TRUE, - edgeColor = "gray", - edgeWidthHop1 = 0.5, - edgeColorHop1 = "gray" + geneColorMapping = NULL ) } \arguments{ @@ -28,12 +24,8 @@ tf,gene} \item{genesOfInterest}{Which genes of interest to highlight} -\item{geneOfInterestColor}{Color for the genes of interest} - \item{tfColor}{Color for the transcription factors} -\item{otherGenesColor}{Color for the other genes} - \item{nodeSize}{Size of node} \item{edgeWidth}{Width of edges} @@ -47,11 +39,10 @@ Default is TRUE.} \item{layoutBipartite}{Whether or not to layout as a bipartite graph.} -\item{edgeColor}{Color of edges} - -\item{edgeWidthHop1}{Width of edges 1 hop away from genes of interest.} - -\item{edgeColorHop1}{Color of edges 1 hop away from genes of interest.} +\item{geneColorMapping}{Color mapping from a set of genes to a color. The +nodes and edges connected to them will be this color. If NULL, all genes and +their edges will be gray. The format is a data frame, where the first column ("gene") +is the name of the gene and the second ("color") is the color.} } \value{ A bipartite plot of the network From cefd15830e40a9d04f4317dbd6840e1d458d020d Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Tue, 18 Jun 2024 16:08:39 -0400 Subject: [PATCH 10/17] Delete tests/testthat/test-blobfish.R Deleted the test file as requested to see if we are able to build. --- tests/testthat/test-blobfish.R | 314 --------------------------------- 1 file changed, 314 deletions(-) delete mode 100644 tests/testthat/test-blobfish.R diff --git a/tests/testthat/test-blobfish.R b/tests/testthat/test-blobfish.R deleted file mode 100644 index 12013b40..00000000 --- a/tests/testthat/test-blobfish.R +++ /dev/null @@ -1,314 +0,0 @@ -# unit-tests for BLOBFISH -context("test BLOBFISH functions") - -test_that("[BLOBFISH] SignificantBreadthFirstSearch() function yields expected results", { - - # Construct a starting network, which will be modified. - # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D - # to be connected after 1 hop via TF3, and genes A and C to be connected after 2 - # hops via TF4. - startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)), - gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)), - score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3)) - addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), - rep("geneB", 100), - rep("geneC", 100), - rep("geneD", 100)), - score = stats::rnorm(400) / 1000) - addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), - rep("geneB", 100), - rep("geneC", 100), - rep("geneD", 100)), - score = stats::rnorm(400) / 10000) - addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), - rep("geneB", 100), - rep("geneC", 100), - rep("geneD", 100)), - score = stats::rnorm(400) / 10000) - fullNetwork1 <- rbind(startingNetwork, addedNoise1) - rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__") - - # Create a set of networks close to the original. - fullNetworks <- fullNetwork1 - fullNetworks[,4] <- fullNetwork1$score - 0.00005 - fullNetworks[,5] <- fullNetwork1$score + 0.00005 - - # Use all combined scores as the null distribution. - null <- stats::rnorm(n = nrow(fullNetworks) * 3) / 100 - - # Calculate the p-values. - pvalues <- CalculatePValues(network = fullNetworks, pValueChunks = 2, - nullDistribution = null, verbose = TRUE) - whichSig <- which(pvalues < 0.1) - significantEdges <- rownames(fullNetworks)[whichSig] - subnetwork <- fullNetworks[significantEdges, c(1:2)] - pvalues = pvalues[significantEdges] - - # Test that errors are thrown when appropriate. - expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, - startingNodes = c("blob", "fish"), - nodesToExclude = c(), startFromTF = TRUE), - "ERROR: Starting nodes do not overlap with network nodes") - expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, - startingNodes = c("geneA", "geneB"), - nodesToExclude = c(), startFromTF = TRUE), - "ERROR: Starting nodes do not overlap with network nodes") - expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, - startingNodes = c("tf1", "tf2"), - nodesToExclude = c(), startFromTF = FALSE), - "ERROR: Starting nodes do not overlap with network nodes") - expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, - startingNodes = c("geneA", "geneB", "geneC"), - nodesToExclude = c("blob", "fish"), startFromTF = FALSE), - "ERROR: List of nodes to exclude does not overlap with network nodes") - expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, - startingNodes = c("geneA", "geneB", "geneC"), - nodesToExclude = c("geneA", "geneB"), startFromTF = FALSE), - "ERROR: Starting nodes cannot overlap with nodes to exclude") - - # Ensure that, when starting from genes A, B, and C, we obtain the correct values. - expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf4__geneC"), - rownames(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, - startingNodes = c("geneA", "geneB", "geneC"), - nodesToExclude = c(), startFromTF = FALSE)))) == 0) - - # Ensure that, when starting from transcription factors TF2, TF3, and TF4 and removing genes A, B, and C, we obtain the correct values. - expect_true(length(setdiff(c("tf3__geneD", "tf4__geneD"), - rownames(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, - startingNodes = c("tf2", "tf3", "tf4"), - nodesToExclude = c("geneA", "geneB", "geneC"), - startFromTF = TRUE)))) == 0) -}) -test_that("[BLOBFISH] FindConnectionsForAllHopCounts() function yields expected results", { - - # Set up the subnetwork from the previous example. - geneANetHop1 <- data.frame(tf = c("tf2", "tf3"), gene = c("geneA", "geneA")) - geneBNetHop1 <- data.frame(tf = "tf2", gene = "geneB") - geneCNetHop1 <- data.frame(tf = "tf4", gene = "geneC") - geneDNetHop1 <- data.frame(tf = c("tf3", "tf4"), gene = c("geneD", "geneD")) - geneANetHop2 <- rbind(geneBNetHop1, geneDNetHop1[1,]) - geneBNetHop2 <- geneANetHop1[1,] - geneCNetHop2 <- geneDNetHop1[2,] - geneDNetHop2 <- rbind(geneANetHop1[2,], geneCNetHop1) - geneANetHop3 <- geneDNetHop1[2,] - geneBNetHop3 <- geneANetHop1[2,] - geneCNetHop3 <- geneDNetHop1[1,] - geneDNetHop3 <- geneANetHop1[1,] - subnetworks <- list(geneA = list(geneANetHop1, geneANetHop2, geneANetHop3), - geneB = list(geneBNetHop1, geneBNetHop2, geneBNetHop3), - geneC = list(geneCNetHop1, geneCNetHop2, geneCNetHop3), - geneD = list(geneDNetHop1, geneDNetHop2, geneDNetHop3)) - - # Obtain the overlaps for 1, 2, and 3 hops. - expect_equal(rownames(FindConnectionsForAllHopCounts(subnetworks)), - c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD")) - - # Obtain the overlaps for only the first two hops, for only A and C. - subnetworks <- list(geneA = list(geneANetHop1, geneANetHop2), - geneC = list(geneCNetHop1, geneCNetHop2)) - expect_equal(rownames(FindConnectionsForAllHopCounts(subnetworks)), - c("tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD")) -}) -test_that("[BLOBFISH] FindSignificantEdgesForHop() function yields expected results",{ - - # Construct a starting network, which will be modified. - # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D - # to be connected after 1 hop via TF3, and genes A and C to be connected after 2 - # hops via TF4. - startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)), - gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)), - score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3)) - addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), - rep("geneB", 100), - rep("geneC", 100), - rep("geneD", 100)), - score = stats::rnorm(400) / 10000) - addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), - rep("geneB", 100), - rep("geneC", 100), - rep("geneD", 100)), - score = stats::rnorm(400) / 10000) - addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), - rep("geneB", 100), - rep("geneC", 100), - rep("geneD", 100)), - score = stats::rnorm(400) / 10000) - fullNetwork1 <- rbind(startingNetwork, addedNoise1) - rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__") - - # Create a set of networks close to the original. - fullNetworks <- fullNetwork1 - fullNetworks[,4] <- fullNetwork1$score - 0.00005 - fullNetworks[,5] <- fullNetwork1$score + 0.00005 - - # Null distribution is a narrower version of the normal distribution. - null <- rnorm(n = nrow(fullNetworks) * 3) / 100 - - # Calculate the p-values. - pvalues <- CalculatePValues(network = fullNetworks, pValueChunks = 2, - nullDistribution = null, verbose = TRUE) - whichSig <- which(pvalues < 0.1) - significantEdges <- rownames(fullNetworks)[whichSig] - subnetwork <- fullNetworks[significantEdges, c(1:2)] - pvalues = pvalues[significantEdges] - - # Set up the subnetwork from the previous example. - geneANetHop1 <- data.frame(tf = c("tf2", "tf3"), gene = c("geneA", "geneA")) - rownames(geneANetHop1) <- paste(geneANetHop1$tf, geneANetHop1$gene, sep = "__") - geneBNetHop1 <- data.frame(tf = "tf2", gene = "geneB") - rownames(geneBNetHop1) <- paste(geneBNetHop1$tf, geneBNetHop1$gene, sep = "__") - geneCNetHop1 <- data.frame(tf = "tf4", gene = "geneC") - rownames(geneCNetHop1) <- paste(geneCNetHop1$tf, geneCNetHop1$gene, sep = "__") - geneDNetHop1 <- data.frame(tf = c("tf3", "tf4"), gene = c("geneD", "geneD")) - rownames(geneDNetHop1) <- paste(geneDNetHop1$tf, geneDNetHop1$gene, sep = "__") - geneANetHop2 <- rbind(geneBNetHop1, geneDNetHop1[1,]) - rownames(geneANetHop2) <- paste(geneANetHop2$tf, geneANetHop2$gene, sep = "__") - geneBNetHop2 <- geneANetHop1[1,] - rownames(geneBNetHop2) <- paste(geneBNetHop2$tf, geneBNetHop2$gene, sep = "__") - geneCNetHop2 <- geneDNetHop1[2,] - rownames(geneCNetHop2) <- paste(geneCNetHop2$tf, geneCNetHop2$gene, sep = "__") - geneDNetHop2 <- rbind(geneANetHop1[2,], geneCNetHop1) - rownames(geneDNetHop2) <- paste(geneDNetHop2$tf, geneDNetHop2$gene, sep = "__") - geneANetHop3 <- geneDNetHop1[2,] - rownames(geneANetHop3) <- paste(geneANetHop3$tf, geneANetHop3$gene, sep = "__") - geneBNetHop3 <- geneANetHop1[2,] - rownames(geneBNetHop3) <- paste(geneBNetHop3$tf, geneBNetHop3$gene, sep = "__") - geneCNetHop3 <- geneDNetHop1[1,] - rownames(geneCNetHop3) <- paste(geneCNetHop3$tf, geneCNetHop3$gene, sep = "__") - geneDNetHop3 <- geneANetHop1[1,] - rownames(geneDNetHop3) <- paste(geneDNetHop3$tf, geneDNetHop3$gene, sep = "__") - subnetworksFull <- list(geneA = list(geneANetHop1, geneANetHop2, geneANetHop3), - geneB = list(geneBNetHop1, geneBNetHop2, geneBNetHop3), - geneC = list(geneCNetHop1, geneCNetHop2, geneCNetHop3), - geneD = list(geneDNetHop1, geneDNetHop2, geneDNetHop3)) - - # Test method with all genes as input. - sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC", "geneD"), - combinedNetwork = subnetwork, pValues = pvalues, - hopConstraint = 3) - expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneB[[1]]), rownames(sigEdges$geneB[[1]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneB[[2]]), rownames(sigEdges$geneB[[2]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneB[[3]]), rownames(sigEdges$geneB[[3]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneD[[1]]), rownames(sigEdges$geneD[[1]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneD[[2]]), rownames(sigEdges$geneD[[2]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneD[[3]]), rownames(sigEdges$geneD[[3]]))) == 0) - - # Test method with only genes A, B, C. - sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC"), - combinedNetwork = subnetwork, hopConstraint = 3) - expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneB[[1]]), rownames(sigEdges$geneB[[1]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneB[[2]]), rownames(sigEdges$geneB[[2]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneB[[3]]), rownames(sigEdges$geneB[[3]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0) - - # Test method with only genes A and C. - sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneC"), - combinedNetwork = subnetwork, - hopConstraint = 3) - expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0) - expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0) -}) -test_that("[BLOBFISH] BuildSubnetwork() function yields expected results",{ - - # Construct a starting network, which will be modified. - # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D - # to be connected after 1 hop via TF3, and genes A and C to be connected after 2 - # hops via TF4. - startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)), - gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)), - score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3)) - addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), - rep("geneB", 100), - rep("geneC", 100), - rep("geneD", 100)), - score = stats::rnorm(400) / 10000) - addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), - rep("geneB", 100), - rep("geneC", 100), - rep("geneD", 100)), - score = stats::rnorm(400) / 10000) - addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), - rep("geneB", 100), - rep("geneC", 100), - rep("geneD", 100)), - score = stats::rnorm(400) / 1000) - fullNetwork1 <- rbind(startingNetwork, addedNoise1) - rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__") - - # Create a set of networks close to the original. - fullNetwork2 <- fullNetwork1 - fullNetwork3 <- fullNetwork1 - fullNetwork2$score <- c(fullNetwork1$score - 0.00005) - fullNetwork3$score <- c(fullNetwork1$score + 0.00005) - - # Paste together the networks. - combinedNetwork <- fullNetwork1 - combinedNetwork[,4] <- fullNetwork2$score - combinedNetwork[,5] <- fullNetwork3$score - - # Null distribution is a narrower version of the normal distribution. - null <- rnorm(n = nrow(combinedNetwork) * 3) / 100 - - # Verify that FindConnectionsForAllHopCounts() and FindSignificantEdgesForHop() are - # working in tandem. - # Obtain the overlaps for 1, 2, and 3 hops. - expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"), - rownames(BuildSubnetwork(geneSet = c("geneA", "geneB", "geneC", "geneD"), - networks = list(fullNetwork1, fullNetwork2, fullNetwork3), - alpha = 0.1, hopConstraint = 4, nullDistribution = null)))) == 0) - - # Obtain the overlaps for only the first two hops, for only A and C. - expect_true(length(setdiff(c("tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"), - rownames(BuildSubnetwork(geneSet = c("geneA", "geneC"), - networks = list(fullNetwork1, fullNetwork2, fullNetwork3), - alpha = 0.1, hopConstraint = 4, nullDistribution = null)))) == 0) -}) -test_that("[BLOBFISH] RunBLOBFISH() function yields expected results",{ - - # Check errors. - expect_error(RunBLOBFISH(geneSet = 4, networks = list(), alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), - paste("Wrong input type! geneSet must be a character vector. networks must be a list.", - "alpha and hopConstraint must be scalar numeric values.")) - expect_error(RunBLOBFISH(geneSet = "g1", networks = 67, alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), - paste("Wrong input type! geneSet must be a character vector. networks must be a list.", - "alpha and hopConstraint must be scalar numeric values.")) - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(), alpha = "hi", hopConstraint = 5, nullDistribution = c(0,0,0)), - paste("Wrong input type! geneSet must be a character vector. networks must be a list.", - "alpha and hopConstraint must be scalar numeric values.")) - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(), alpha = 0.5, hopConstraint = "hi", nullDistribution = c(0,0,0)), - paste("Wrong input type! geneSet must be a character vector. networks must be a list.", - "alpha and hopConstraint must be scalar numeric values.")) - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(1,2,3), alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), - "Each network must be a data frame.") - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA, blah = NA)), - alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), - paste("Each network must have transcription factors in the first column,", - "target genes in the second column, and scores in the third column.")) - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), - alpha = -1, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.") - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), - alpha = 0, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.") - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), - alpha = 1.2, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.") - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), - alpha = 1, hopConstraint = -4, nullDistribution = c(0,0,0)), "hopConstraint must be an even number of at least 2.") - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), - alpha = 1, hopConstraint = 7, nullDistribution = c(0,0,0)), "hopConstraint must be an even number of at least 2.") - expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), - alpha = 1, hopConstraint = 4, nullDistribution = "hi"), "nullDistribution must be numeric.") -}) \ No newline at end of file From 73de562304fac6f9de609d71e44b628b181ae392 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Fri, 21 Jun 2024 14:02:01 -0400 Subject: [PATCH 11/17] Fixed devtools::check() warnings and notes --- R/BLOBFISH.R | 17 +++++++++-------- man/GenerateNullPANDADistribution.Rd | 10 +++++----- man/SignificantBreadthFirstSearch.Rd | 4 ---- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index e5525b67..91de646c 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -1,3 +1,5 @@ +library(fgsea) + #' Given a set of genes of interest, full bipartite networks with scores (one network for each sample), a significance #' cutoff for statistical testing, and a hop constraint, BLOBFISH finds a subnetwork of #' significant edges connecting the genes. @@ -70,7 +72,7 @@ RunBLOBFISH <- function(geneSet, networks, alpha, hopConstraint, nullDistributio #' @param ppiFile The location of the protein-protein interaction network between transcription factors. #' This should be a TSV file where the first two columns are the transcription #' factors and the third is whether there is a PPI between them. -#' @param motif The location of the motif prior from genes to transcription factors. This should +#' @param motifFile The location of the motif prior from genes to transcription factors. This should #' be a TSV file where the first column is the transcription factors, the #' second is the genes, and the third is whether the transcription factor's #' binding motif is in the gene promoter region. @@ -120,8 +122,7 @@ GenerateNullPANDADistribution <- function(ppiFile, motifFile, sampSize = 20, # Create new edges including other transcription factors. newEdges <- paste(interactingTF, edge, sep = "__") - str(newEdges) - + # Return the old and new edges. return(c(edge, newEdges)) })) @@ -239,12 +240,14 @@ CalculatePValues <- function(network, nullDistribution, pValueChunks = 100, startIndex <- 1 endIndex <- min(startIndex + ceiling(nrow(network) / pValueChunks), nrow(network)) - for(i in 1:pValueChunks){ + i <- 1 + while(i < pValueChunks && startIndex <= endIndex){ # Calculate p-values for this chunk. ourEdgeVals <- network[startIndex:endIndex, 3:ncol(network)] nullEdgeVals <- t(matrix(rep(nullDistribution, nrow(ourEdgeVals)), ncol = nrow(ourEdgeVals))) + pValues[startIndex:endIndex] <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, y = nullEdgeVals, alternative = "greater")$pvalue @@ -258,6 +261,7 @@ CalculatePValues <- function(network, nullDistribution, pValueChunks = 100, startIndex <- endIndex + 1 endIndex <- min(startIndex + ceiling(nrow(network) / pValueChunks), nrow(network)) + i <- i + 1 } # Adjust the p-values. @@ -385,8 +389,6 @@ FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, hopConstraint, #' @param startingNodes The list of nodes from which to start. #' @param nodesToExclude The list of nodes to exclude from the search. #' @param startFromTF Whether to start from transcription factors (TRUE) or genes (FALSE). -#' @param doFDRAdjustment Whether or not to adjust the p-values using FDR. -#' @param nullDistribution The null distribution, specified as a vector of values. #' @param verbose Whether or not to print detailed information about the run. #' @param topX Select the X lowest significant p-values for each gene. NULL by default. #' @returns A bipartite subnetwork in the same format as the original networks. @@ -615,8 +617,7 @@ PlotNetwork <- function(network, genesOfInterest, if(!is.null(geneColorMapping)){ nodeAttrs[rownames(geneColorMapping), "color"] <- geneColorMapping$color } - str(nodeAttrs) - + # Add edge attributes. if(!is.null(geneColorMapping)){ for(gene in rownames(geneColorMapping)){ diff --git a/man/GenerateNullPANDADistribution.Rd b/man/GenerateNullPANDADistribution.Rd index edab243d..38f62cd4 100644 --- a/man/GenerateNullPANDADistribution.Rd +++ b/man/GenerateNullPANDADistribution.Rd @@ -21,14 +21,14 @@ GenerateNullPANDADistribution( This should be a TSV file where the first two columns are the transcription factors and the third is whether there is a PPI between them.} -\item{sampSize}{Number of samples to simulate} - -\item{numberOfPandas}{Number of null PANDA networks to generate} - -\item{motif}{The location of the motif prior from genes to transcription factors. This should +\item{motifFile}{The location of the motif prior from genes to transcription factors. This should be a TSV file where the first column is the transcription factors, the second is the genes, and the third is whether the transcription factor's binding motif is in the gene promoter region.} + +\item{sampSize}{Number of samples to simulate} + +\item{numberOfPandas}{Number of null PANDA networks to generate} } \description{ Generate a null distribution of edge scores for PANDA-like networks; that is, diff --git a/man/SignificantBreadthFirstSearch.Rd b/man/SignificantBreadthFirstSearch.Rd index 2e6e531a..bc401d5f 100644 --- a/man/SignificantBreadthFirstSearch.Rd +++ b/man/SignificantBreadthFirstSearch.Rd @@ -31,10 +31,6 @@ Edges must be specified as "tf__gene".} \item{verbose}{Whether or not to print detailed information about the run.} \item{topX}{Select the X lowest significant p-values for each gene. NULL by default.} - -\item{doFDRAdjustment}{Whether or not to adjust the p-values using FDR.} - -\item{nullDistribution}{The null distribution, specified as a vector of values.} } \value{ A bipartite subnetwork in the same format as the original networks. From f06ad95a0e09dea64fcb8b4b4d0600a4d5a6fedd Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Tue, 25 Jun 2024 13:49:32 -0400 Subject: [PATCH 12/17] First commit --- man/cobra.Rd | 49 ------------------------------------------------- man/prior.pp.Rd | 19 ------------------- 2 files changed, 68 deletions(-) delete mode 100644 man/cobra.Rd delete mode 100644 man/prior.pp.Rd diff --git a/man/cobra.Rd b/man/cobra.Rd deleted file mode 100644 index d3952aba..00000000 --- a/man/cobra.Rd +++ /dev/null @@ -1,49 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/COBRA.R -\name{cobra} -\alias{cobra} -\title{Run COBRA in R} -\usage{ -cobra(X, expressionData, standardize = T) -} -\arguments{ -\item{X}{: design matrix of size (n, q), n = number of samples, q = number of covariates} - -\item{expressionData}{: gene expression as a matrix of size (g, n), g = number of genes} - -\item{standardize}{: boolean flag to standardize the gene expression as a pre-processing step - -Outputs:} -} -\value{ -psi : impact of each covariate on the eigenvalues as a matrix of size (q, n) - -Q : eigenvectors corresponding to non-zero eigenvalues as a matrix of size (g, n) - -D : non-zero eigenvalues as a list of length n - -G : (standardized) gene expression as a matrix of size (g, n) -} -\description{ -Description: - COBRA decomposes a (partial) gene co-expression matrix as a - linear combination of covariate-specific components. - It can be applied for batch correction, differential co-expression - analysis controlling for variables, and to understand the impact of - variables of interest to the observed co-expression. -} -\details{ -Inputs: -} -\examples{ - -g <- 100 # number of genes -n <- 10 # number of samples -q <- 2 # number of covariates -X <- X <- cbind(rep(1, n), rbinom(n, 1, 0.5)) -expressionData=matrix(rnorm(g*n, 1, 1), ncol = n, nrow = g) - -# Run COBRA algorithm -cobra_output <- cobra(X, expressionData) - -} diff --git a/man/prior.pp.Rd b/man/prior.pp.Rd deleted file mode 100644 index 04719523..00000000 --- a/man/prior.pp.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TIGER.R -\name{prior.pp} -\alias{prior.pp} -\title{Filter low confident edge signs in the prior network using GeneNet} -\usage{ -prior.pp(prior, expr) -} -\arguments{ -\item{prior}{A prior network (adjacency matrix) with rows as TFs and columns as genes.} - -\item{expr}{A normalized log-transformed gene expression matrix.} -} -\value{ -A filtered prior network (adjacency matrix). -} -\description{ -Filter low confident edge signs in the prior network using GeneNet -} From 7acea5552cfa5e2b7855da0234bc532459106e3c Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Tue, 25 Jun 2024 14:21:11 -0400 Subject: [PATCH 13/17] man/cobra.Rd --- man/cobra.Rd | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 man/cobra.Rd diff --git a/man/cobra.Rd b/man/cobra.Rd new file mode 100644 index 00000000..b88fcf9a --- /dev/null +++ b/man/cobra.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/COBRA.R +\name{cobra} +\alias{cobra} +\title{Run COBRA in R} +\usage{ +cobra(X, expressionData, method = "pearson") +} +\arguments{ +\item{X}{: design matrix of size (n, q), n = number of samples, q = number of covariates} + +\item{expressionData}{: gene expression as a matrix of size (g, n), g = number of genes} + +\item{method}{: if pearson, the decomposition of the co-expression matrix is compouted. If pcorsh, COBRA decomposes the regularized partial co-expression + +Outputs:} +} +\value{ +psi : impact of each covariate on the eigenvalues as a matrix of size (q, n) + +Q : eigenvectors corresponding to non-zero eigenvalues as a matrix of size (g, n) + +D : non-zero eigenvalues as a list of length n + +G : (standardized) gene expression as a matrix of size (g, n) +} +\description{ +Description: + COBRA decomposes a (partial) gene co-expression matrix as a + linear combination of covariate-specific components. + It can be applied for batch correction, differential co-expression + analysis controlling for variables, and to understand the impact of + variables of interest to the observed co-expression. +} +\details{ +Inputs: +} +\examples{ + +g <- 100 # number of genes +n <- 10 # number of samples +q <- 2 # number of covariates +X <- X <- cbind(rep(1, n), rbinom(n, 1, 0.5)) +expressionData=matrix(rnorm(g*n, 1, 1), ncol = n, nrow = g) + +# Run COBRA algorithm +cobra_output <- cobra(X, expressionData) + +} From 496133f062d4c85e78e88e592a91e60e18dafcd0 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Wed, 26 Jun 2024 10:08:54 -0400 Subject: [PATCH 14/17] Added tests back --- tests/testthat/test-blobfish.R | 314 +++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 tests/testthat/test-blobfish.R diff --git a/tests/testthat/test-blobfish.R b/tests/testthat/test-blobfish.R new file mode 100644 index 00000000..12013b40 --- /dev/null +++ b/tests/testthat/test-blobfish.R @@ -0,0 +1,314 @@ +# unit-tests for BLOBFISH +context("test BLOBFISH functions") + +test_that("[BLOBFISH] SignificantBreadthFirstSearch() function yields expected results", { + + # Construct a starting network, which will be modified. + # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D + # to be connected after 1 hop via TF3, and genes A and C to be connected after 2 + # hops via TF4. + startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)), + gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)), + score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3)) + addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 1000) + addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + fullNetwork1 <- rbind(startingNetwork, addedNoise1) + rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__") + + # Create a set of networks close to the original. + fullNetworks <- fullNetwork1 + fullNetworks[,4] <- fullNetwork1$score - 0.00005 + fullNetworks[,5] <- fullNetwork1$score + 0.00005 + + # Use all combined scores as the null distribution. + null <- stats::rnorm(n = nrow(fullNetworks) * 3) / 100 + + # Calculate the p-values. + pvalues <- CalculatePValues(network = fullNetworks, pValueChunks = 2, + nullDistribution = null, verbose = TRUE) + whichSig <- which(pvalues < 0.1) + significantEdges <- rownames(fullNetworks)[whichSig] + subnetwork <- fullNetworks[significantEdges, c(1:2)] + pvalues = pvalues[significantEdges] + + # Test that errors are thrown when appropriate. + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("blob", "fish"), + nodesToExclude = c(), startFromTF = TRUE), + "ERROR: Starting nodes do not overlap with network nodes") + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("geneA", "geneB"), + nodesToExclude = c(), startFromTF = TRUE), + "ERROR: Starting nodes do not overlap with network nodes") + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("tf1", "tf2"), + nodesToExclude = c(), startFromTF = FALSE), + "ERROR: Starting nodes do not overlap with network nodes") + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("geneA", "geneB", "geneC"), + nodesToExclude = c("blob", "fish"), startFromTF = FALSE), + "ERROR: List of nodes to exclude does not overlap with network nodes") + expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("geneA", "geneB", "geneC"), + nodesToExclude = c("geneA", "geneB"), startFromTF = FALSE), + "ERROR: Starting nodes cannot overlap with nodes to exclude") + + # Ensure that, when starting from genes A, B, and C, we obtain the correct values. + expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf4__geneC"), + rownames(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("geneA", "geneB", "geneC"), + nodesToExclude = c(), startFromTF = FALSE)))) == 0) + + # Ensure that, when starting from transcription factors TF2, TF3, and TF4 and removing genes A, B, and C, we obtain the correct values. + expect_true(length(setdiff(c("tf3__geneD", "tf4__geneD"), + rownames(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues, + startingNodes = c("tf2", "tf3", "tf4"), + nodesToExclude = c("geneA", "geneB", "geneC"), + startFromTF = TRUE)))) == 0) +}) +test_that("[BLOBFISH] FindConnectionsForAllHopCounts() function yields expected results", { + + # Set up the subnetwork from the previous example. + geneANetHop1 <- data.frame(tf = c("tf2", "tf3"), gene = c("geneA", "geneA")) + geneBNetHop1 <- data.frame(tf = "tf2", gene = "geneB") + geneCNetHop1 <- data.frame(tf = "tf4", gene = "geneC") + geneDNetHop1 <- data.frame(tf = c("tf3", "tf4"), gene = c("geneD", "geneD")) + geneANetHop2 <- rbind(geneBNetHop1, geneDNetHop1[1,]) + geneBNetHop2 <- geneANetHop1[1,] + geneCNetHop2 <- geneDNetHop1[2,] + geneDNetHop2 <- rbind(geneANetHop1[2,], geneCNetHop1) + geneANetHop3 <- geneDNetHop1[2,] + geneBNetHop3 <- geneANetHop1[2,] + geneCNetHop3 <- geneDNetHop1[1,] + geneDNetHop3 <- geneANetHop1[1,] + subnetworks <- list(geneA = list(geneANetHop1, geneANetHop2, geneANetHop3), + geneB = list(geneBNetHop1, geneBNetHop2, geneBNetHop3), + geneC = list(geneCNetHop1, geneCNetHop2, geneCNetHop3), + geneD = list(geneDNetHop1, geneDNetHop2, geneDNetHop3)) + + # Obtain the overlaps for 1, 2, and 3 hops. + expect_equal(rownames(FindConnectionsForAllHopCounts(subnetworks)), + c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD")) + + # Obtain the overlaps for only the first two hops, for only A and C. + subnetworks <- list(geneA = list(geneANetHop1, geneANetHop2), + geneC = list(geneCNetHop1, geneCNetHop2)) + expect_equal(rownames(FindConnectionsForAllHopCounts(subnetworks)), + c("tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD")) +}) +test_that("[BLOBFISH] FindSignificantEdgesForHop() function yields expected results",{ + + # Construct a starting network, which will be modified. + # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D + # to be connected after 1 hop via TF3, and genes A and C to be connected after 2 + # hops via TF4. + startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)), + gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)), + score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3)) + addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + fullNetwork1 <- rbind(startingNetwork, addedNoise1) + rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__") + + # Create a set of networks close to the original. + fullNetworks <- fullNetwork1 + fullNetworks[,4] <- fullNetwork1$score - 0.00005 + fullNetworks[,5] <- fullNetwork1$score + 0.00005 + + # Null distribution is a narrower version of the normal distribution. + null <- rnorm(n = nrow(fullNetworks) * 3) / 100 + + # Calculate the p-values. + pvalues <- CalculatePValues(network = fullNetworks, pValueChunks = 2, + nullDistribution = null, verbose = TRUE) + whichSig <- which(pvalues < 0.1) + significantEdges <- rownames(fullNetworks)[whichSig] + subnetwork <- fullNetworks[significantEdges, c(1:2)] + pvalues = pvalues[significantEdges] + + # Set up the subnetwork from the previous example. + geneANetHop1 <- data.frame(tf = c("tf2", "tf3"), gene = c("geneA", "geneA")) + rownames(geneANetHop1) <- paste(geneANetHop1$tf, geneANetHop1$gene, sep = "__") + geneBNetHop1 <- data.frame(tf = "tf2", gene = "geneB") + rownames(geneBNetHop1) <- paste(geneBNetHop1$tf, geneBNetHop1$gene, sep = "__") + geneCNetHop1 <- data.frame(tf = "tf4", gene = "geneC") + rownames(geneCNetHop1) <- paste(geneCNetHop1$tf, geneCNetHop1$gene, sep = "__") + geneDNetHop1 <- data.frame(tf = c("tf3", "tf4"), gene = c("geneD", "geneD")) + rownames(geneDNetHop1) <- paste(geneDNetHop1$tf, geneDNetHop1$gene, sep = "__") + geneANetHop2 <- rbind(geneBNetHop1, geneDNetHop1[1,]) + rownames(geneANetHop2) <- paste(geneANetHop2$tf, geneANetHop2$gene, sep = "__") + geneBNetHop2 <- geneANetHop1[1,] + rownames(geneBNetHop2) <- paste(geneBNetHop2$tf, geneBNetHop2$gene, sep = "__") + geneCNetHop2 <- geneDNetHop1[2,] + rownames(geneCNetHop2) <- paste(geneCNetHop2$tf, geneCNetHop2$gene, sep = "__") + geneDNetHop2 <- rbind(geneANetHop1[2,], geneCNetHop1) + rownames(geneDNetHop2) <- paste(geneDNetHop2$tf, geneDNetHop2$gene, sep = "__") + geneANetHop3 <- geneDNetHop1[2,] + rownames(geneANetHop3) <- paste(geneANetHop3$tf, geneANetHop3$gene, sep = "__") + geneBNetHop3 <- geneANetHop1[2,] + rownames(geneBNetHop3) <- paste(geneBNetHop3$tf, geneBNetHop3$gene, sep = "__") + geneCNetHop3 <- geneDNetHop1[1,] + rownames(geneCNetHop3) <- paste(geneCNetHop3$tf, geneCNetHop3$gene, sep = "__") + geneDNetHop3 <- geneANetHop1[1,] + rownames(geneDNetHop3) <- paste(geneDNetHop3$tf, geneDNetHop3$gene, sep = "__") + subnetworksFull <- list(geneA = list(geneANetHop1, geneANetHop2, geneANetHop3), + geneB = list(geneBNetHop1, geneBNetHop2, geneBNetHop3), + geneC = list(geneCNetHop1, geneCNetHop2, geneCNetHop3), + geneD = list(geneDNetHop1, geneDNetHop2, geneDNetHop3)) + + # Test method with all genes as input. + sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC", "geneD"), + combinedNetwork = subnetwork, pValues = pvalues, + hopConstraint = 3) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[1]]), rownames(sigEdges$geneB[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[2]]), rownames(sigEdges$geneB[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[3]]), rownames(sigEdges$geneB[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneD[[1]]), rownames(sigEdges$geneD[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneD[[2]]), rownames(sigEdges$geneD[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneD[[3]]), rownames(sigEdges$geneD[[3]]))) == 0) + + # Test method with only genes A, B, C. + sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC"), + combinedNetwork = subnetwork, hopConstraint = 3) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[1]]), rownames(sigEdges$geneB[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[2]]), rownames(sigEdges$geneB[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneB[[3]]), rownames(sigEdges$geneB[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0) + + # Test method with only genes A and C. + sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneC"), + combinedNetwork = subnetwork, + hopConstraint = 3) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0) + expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0) +}) +test_that("[BLOBFISH] BuildSubnetwork() function yields expected results",{ + + # Construct a starting network, which will be modified. + # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D + # to be connected after 1 hop via TF3, and genes A and C to be connected after 2 + # hops via TF4. + startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)), + gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)), + score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3)) + addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 10000) + addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), + rep("geneB", 100), + rep("geneC", 100), + rep("geneD", 100)), + score = stats::rnorm(400) / 1000) + fullNetwork1 <- rbind(startingNetwork, addedNoise1) + rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__") + + # Create a set of networks close to the original. + fullNetwork2 <- fullNetwork1 + fullNetwork3 <- fullNetwork1 + fullNetwork2$score <- c(fullNetwork1$score - 0.00005) + fullNetwork3$score <- c(fullNetwork1$score + 0.00005) + + # Paste together the networks. + combinedNetwork <- fullNetwork1 + combinedNetwork[,4] <- fullNetwork2$score + combinedNetwork[,5] <- fullNetwork3$score + + # Null distribution is a narrower version of the normal distribution. + null <- rnorm(n = nrow(combinedNetwork) * 3) / 100 + + # Verify that FindConnectionsForAllHopCounts() and FindSignificantEdgesForHop() are + # working in tandem. + # Obtain the overlaps for 1, 2, and 3 hops. + expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"), + rownames(BuildSubnetwork(geneSet = c("geneA", "geneB", "geneC", "geneD"), + networks = list(fullNetwork1, fullNetwork2, fullNetwork3), + alpha = 0.1, hopConstraint = 4, nullDistribution = null)))) == 0) + + # Obtain the overlaps for only the first two hops, for only A and C. + expect_true(length(setdiff(c("tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"), + rownames(BuildSubnetwork(geneSet = c("geneA", "geneC"), + networks = list(fullNetwork1, fullNetwork2, fullNetwork3), + alpha = 0.1, hopConstraint = 4, nullDistribution = null)))) == 0) +}) +test_that("[BLOBFISH] RunBLOBFISH() function yields expected results",{ + + # Check errors. + expect_error(RunBLOBFISH(geneSet = 4, networks = list(), alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), + paste("Wrong input type! geneSet must be a character vector. networks must be a list.", + "alpha and hopConstraint must be scalar numeric values.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = 67, alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), + paste("Wrong input type! geneSet must be a character vector. networks must be a list.", + "alpha and hopConstraint must be scalar numeric values.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(), alpha = "hi", hopConstraint = 5, nullDistribution = c(0,0,0)), + paste("Wrong input type! geneSet must be a character vector. networks must be a list.", + "alpha and hopConstraint must be scalar numeric values.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(), alpha = 0.5, hopConstraint = "hi", nullDistribution = c(0,0,0)), + paste("Wrong input type! geneSet must be a character vector. networks must be a list.", + "alpha and hopConstraint must be scalar numeric values.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(1,2,3), alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), + "Each network must be a data frame.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA, blah = NA)), + alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)), + paste("Each network must have transcription factors in the first column,", + "target genes in the second column, and scores in the third column.")) + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = -1, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 0, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 1.2, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 1, hopConstraint = -4, nullDistribution = c(0,0,0)), "hopConstraint must be an even number of at least 2.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 1, hopConstraint = 7, nullDistribution = c(0,0,0)), "hopConstraint must be an even number of at least 2.") + expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), + alpha = 1, hopConstraint = 4, nullDistribution = "hi"), "nullDistribution must be numeric.") +}) \ No newline at end of file From 2c48229c0f45f04783ec8576b9fda4ca09041c7b Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Wed, 26 Jun 2024 10:20:32 -0400 Subject: [PATCH 15/17] Fixed warnings in build --- R/BLOBFISH.R | 22 ++++++++++------------ man/GenerateNullPANDADistribution.Rd | 10 +++++----- man/SignificantBreadthFirstSearch.Rd | 4 ---- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index e5525b67..b8cc8a7b 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -70,7 +70,7 @@ RunBLOBFISH <- function(geneSet, networks, alpha, hopConstraint, nullDistributio #' @param ppiFile The location of the protein-protein interaction network between transcription factors. #' This should be a TSV file where the first two columns are the transcription #' factors and the third is whether there is a PPI between them. -#' @param motif The location of the motif prior from genes to transcription factors. This should +#' @param motifFile The location of the motif prior from genes to transcription factors. This should #' be a TSV file where the first column is the transcription factors, the #' second is the genes, and the third is whether the transcription factor's #' binding motif is in the gene promoter region. @@ -84,8 +84,8 @@ GenerateNullPANDADistribution <- function(ppiFile, motifFile, sampSize = 20, set.seed(1) # Read the motif. - motif <- read.table(motifFile, sep = "\t") - ppi <- read.table(ppiFile, sep = "\t") + motif <- utils::read.table(motifFile, sep = "\t") + ppi <- utils::read.table(ppiFile, sep = "\t") # Generate the null PANDA edges. nullPandas <- lapply(1:numberOfPandas, function(i){ @@ -120,8 +120,7 @@ GenerateNullPANDADistribution <- function(ppiFile, motifFile, sampSize = 20, # Create new edges including other transcription factors. newEdges <- paste(interactingTF, edge, sep = "__") - str(newEdges) - + # Return the old and new edges. return(c(edge, newEdges)) })) @@ -239,7 +238,8 @@ CalculatePValues <- function(network, nullDistribution, pValueChunks = 100, startIndex <- 1 endIndex <- min(startIndex + ceiling(nrow(network) / pValueChunks), nrow(network)) - for(i in 1:pValueChunks){ + i <- 1 + while(i < pValueChunks && startIndex <= endIndex){ # Calculate p-values for this chunk. ourEdgeVals <- network[startIndex:endIndex, 3:ncol(network)] @@ -258,6 +258,7 @@ CalculatePValues <- function(network, nullDistribution, pValueChunks = 100, startIndex <- endIndex + 1 endIndex <- min(startIndex + ceiling(nrow(network) / pValueChunks), nrow(network)) + i <- i + 1 } # Adjust the p-values. @@ -385,8 +386,6 @@ FindSignificantEdgesForHop <- function(geneSet, combinedNetwork, hopConstraint, #' @param startingNodes The list of nodes from which to start. #' @param nodesToExclude The list of nodes to exclude from the search. #' @param startFromTF Whether to start from transcription factors (TRUE) or genes (FALSE). -#' @param doFDRAdjustment Whether or not to adjust the p-values using FDR. -#' @param nullDistribution The null distribution, specified as a vector of values. #' @param verbose Whether or not to print detailed information about the run. #' @param topX Select the X lowest significant p-values for each gene. NULL by default. #' @returns A bipartite subnetwork in the same format as the original networks. @@ -431,8 +430,8 @@ SignificantBreadthFirstSearch <- function(networks, pValues, startingNodes, # If topX is specified, filter again. significantEdges <- allEdges - if(!is.null(topX) && length(ourEdgeVals) > topX){ - whichTopX <- order(pValues[ourEdgeVals])[1:topX] + if(!is.null(topX) && length(allEdges) > topX){ + whichTopX <- order(pValues[allEdges])[1:topX] significantEdges <- allEdges[whichTopX] } @@ -615,8 +614,7 @@ PlotNetwork <- function(network, genesOfInterest, if(!is.null(geneColorMapping)){ nodeAttrs[rownames(geneColorMapping), "color"] <- geneColorMapping$color } - str(nodeAttrs) - + # Add edge attributes. if(!is.null(geneColorMapping)){ for(gene in rownames(geneColorMapping)){ diff --git a/man/GenerateNullPANDADistribution.Rd b/man/GenerateNullPANDADistribution.Rd index edab243d..38f62cd4 100644 --- a/man/GenerateNullPANDADistribution.Rd +++ b/man/GenerateNullPANDADistribution.Rd @@ -21,14 +21,14 @@ GenerateNullPANDADistribution( This should be a TSV file where the first two columns are the transcription factors and the third is whether there is a PPI between them.} -\item{sampSize}{Number of samples to simulate} - -\item{numberOfPandas}{Number of null PANDA networks to generate} - -\item{motif}{The location of the motif prior from genes to transcription factors. This should +\item{motifFile}{The location of the motif prior from genes to transcription factors. This should be a TSV file where the first column is the transcription factors, the second is the genes, and the third is whether the transcription factor's binding motif is in the gene promoter region.} + +\item{sampSize}{Number of samples to simulate} + +\item{numberOfPandas}{Number of null PANDA networks to generate} } \description{ Generate a null distribution of edge scores for PANDA-like networks; that is, diff --git a/man/SignificantBreadthFirstSearch.Rd b/man/SignificantBreadthFirstSearch.Rd index 2e6e531a..bc401d5f 100644 --- a/man/SignificantBreadthFirstSearch.Rd +++ b/man/SignificantBreadthFirstSearch.Rd @@ -31,10 +31,6 @@ Edges must be specified as "tf__gene".} \item{verbose}{Whether or not to print detailed information about the run.} \item{topX}{Select the X lowest significant p-values for each gene. NULL by default.} - -\item{doFDRAdjustment}{Whether or not to adjust the p-values using FDR.} - -\item{nullDistribution}{The null distribution, specified as a vector of values.} } \value{ A bipartite subnetwork in the same format as the original networks. From 32ca1c9ad8d5c3c505f6342d7a1930ca762dc4b0 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Wed, 26 Jun 2024 10:31:10 -0400 Subject: [PATCH 16/17] Removed local import --- R/BLOBFISH.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/R/BLOBFISH.R b/R/BLOBFISH.R index 76a78533..b8cc8a7b 100644 --- a/R/BLOBFISH.R +++ b/R/BLOBFISH.R @@ -1,5 +1,3 @@ -library(fgsea) - #' Given a set of genes of interest, full bipartite networks with scores (one network for each sample), a significance #' cutoff for statistical testing, and a hop constraint, BLOBFISH finds a subnetwork of #' significant edges connecting the genes. @@ -247,7 +245,6 @@ CalculatePValues <- function(network, nullDistribution, pValueChunks = 100, ourEdgeVals <- network[startIndex:endIndex, 3:ncol(network)] nullEdgeVals <- t(matrix(rep(nullDistribution, nrow(ourEdgeVals)), ncol = nrow(ourEdgeVals))) - pValues[startIndex:endIndex] <- matrixTests::row_wilcoxon_twosample(x = ourEdgeVals, y = nullEdgeVals, alternative = "greater")$pvalue From 8dc1d3ae5e7a8b0aa1c0595980910e765b702056 Mon Sep 17 00:00:00 2001 From: Tara Eicher Date: Wed, 26 Jun 2024 16:35:45 -0400 Subject: [PATCH 17/17] Added description of BLOBFISH to the README --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 166e6e03..23fc6546 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,11 @@ netZooR currently integrates: COBRA (Co-expression Batch Reduction Adjustment) Micheletti, Schlauch et al. is method for correction of high-order batch effects such as those that persist in co-expression networks. Batch effects and other covariates are known to induce spurious associations in co-expression networks and confound differential gene expression analyses. These effects are corrected for using various methods prior to downstream analyses such as the inference of co-expression networks and computing differences between them. In differential co-expression analysis, the pairwise joint distribution of genes is considered rather than independently analyzing the distribution of expression levels for each individual gene. Computing co-expression matrices after standard batch correction on gene expression data is not sufficient to account for the possibility of batch-induced changes in the correlation between genes as existing batch correction methods act solely on the marginal distribution of each gene. Consequently, uncorrected, artifactual differential co-expression can skew the correlation structure such that network-based methods that use gene co-expression can produce false, nonbiological associations even using data corrected using standard batch correction. Co-expression Batch Reduction Adjustment (COBRA) addresses this question by computing a batch-corrected gene co-expression matrix based on estimating a conditional covariance matrix. COBRA estimates a reduced set of parameters that express the co-expression matrix as a function of the sample covariates and can be used to control for continuous and categorical covariates. The method is computationally fast and makes use of the inherently modular structure of genomic data to estimate accurate gene regulatory associations and enable functional analysis for high-dimensional genomic data. +
+BLOBFISH +BLOBFISH (Bipartite Limited Subnetworks from Multiple Observations using Breadth-First Search with Constrained Hops) Eicher et al. is a method to obtain a subnetwork connecting nodes of interest across observation-specific biological networks. Many biological networks are bipartite, such as expression quantitative trait loci (eQTL) networks, gene regulatory networks, and multi-omic partial correlation networks. However, the size of omics-scale bipartite networks can make them difficult to interpret as a whole; motivating the development of tools that evaluate connectivity between a subset of nodes. In addition, observation-specific networks (i.e., sample-specific or subject-specific networks) introduce the possibility of subsetting robust edges that are consistent across observations. BLOBFISH evaluates connectivity between a subset of nodes in a set of observation-specific bipartite networks by first finding significant edges across observations in comparison to a null distribution, and then using a breadth-first-search to uncover paths between seed nodes limited to a prespecified number of hops. +
+ * Source protein-protein interaction network from [STRINGdb](https://string-db.org/) based on a list of protein of interest. * Plot one PANDA network in [Cytoscape](https://cytoscape.org/).