From 436561fd6179ececdf8ae6faa6da3dc4963a11e6 Mon Sep 17 00:00:00 2001
From: Chris Flerin <ccflerin@gmail.com>
Date: Tue, 24 Nov 2020 11:40:43 +0100
Subject: [PATCH] Skip correlation calculation in ctx if it already exists in
 the adj input file

---
 src/pyscenic/utils.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/pyscenic/utils.py b/src/pyscenic/utils.py
index ce0effe..6fb9784 100644
--- a/src/pyscenic/utils.py
+++ b/src/pyscenic/utils.py
@@ -261,14 +261,18 @@ def iter_modules(adjc, context):
         # Relationship between TF and its target, i.e. activator or repressor, is derived using the original expression
         # profiles. The Pearson product-moment correlation coefficient is used to derive this information.
 
-        # Add correlation column and create two disjoint set of adjacencies.
-        LOGGER.info("Calculating Pearson correlations.")
-        # test for genes present in the adjacencies but not present in the expression matrix:
-        unique_adj_genes = set(adjacencies[COLUMN_NAME_TF]).union(set(adjacencies[COLUMN_NAME_TARGET])) - set(ex_mtx.columns)
-        assert len(unique_adj_genes)==0, f"Found {len(unique_adj_genes)} genes present in the network (adjacencies) output, but missing from the expression matrix. Is this a different gene expression matrix?"
-        LOGGER.warn(f"Note on correlation calculation: the default behaviour for calculating the correlations has changed after pySCENIC verion 0.9.16. Previously, the default was to calculate the correlation between a TF and target gene using only cells with non-zero expression values (mask_dropouts=True). The current default is now to use all cells to match the behavior of the R verision of SCENIC. The original settings can be retained by setting 'rho_mask_dropouts=True' in the modules_from_adjacencies function, or '--mask_dropouts' from the CLI.\n\tDropout masking is currently set to [{rho_mask_dropouts}].")
-        adjacencies = add_correlation(adjacencies, ex_mtx,
-                                  rho_threshold=rho_threshold, mask_dropouts=rho_mask_dropouts)
+        if not {'regulation', 'rho'}.issubset(adjacencies.columns):
+            # Add correlation column and create two disjoint sets of adjacencies.
+            LOGGER.info("Calculating Pearson correlations.")
+            # test for genes present in the adjacencies but not present in the expression matrix:
+            unique_adj_genes = set(adjacencies[COLUMN_NAME_TF]).union(set(adjacencies[COLUMN_NAME_TARGET])) - set(ex_mtx.columns)
+            assert len(unique_adj_genes)==0, f"Found {len(unique_adj_genes)} genes present in the network (adjacencies) output, but missing from the expression matrix. Is this a different gene expression matrix?"
+            LOGGER.warn(f"Note on correlation calculation: the default behaviour for calculating the correlations has changed after pySCENIC verion 0.9.16. Previously, the default was to calculate the correlation between a TF and target gene using only cells with non-zero expression values (mask_dropouts=True). The current default is now to use all cells to match the behavior of the R verision of SCENIC. The original settings can be retained by setting 'rho_mask_dropouts=True' in the modules_from_adjacencies function, or '--mask_dropouts' from the CLI.\n\tDropout masking is currently set to [{rho_mask_dropouts}].")
+            adjacencies = add_correlation(adjacencies, ex_mtx,
+                                      rho_threshold=rho_threshold, mask_dropouts=rho_mask_dropouts)
+        else:
+            LOGGER.info("Using existing Pearson correlations from the adjacencies file.")
+
         activating_modules = adjacencies[adjacencies[COLUMN_NAME_REGULATION] > 0.0]
         if keep_only_activating:
             modules_iter = iter_modules(activating_modules, frozenset([ACTIVATING_MODULE]))