From 436561fd6179ececdf8ae6faa6da3dc4963a11e6 Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Tue, 24 Nov 2020 11:40:43 +0100 Subject: [PATCH] Skip correlation calculation in ctx if it already exists in the adj input file --- src/pyscenic/utils.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/pyscenic/utils.py b/src/pyscenic/utils.py index ce0effe..6fb9784 100644 --- a/src/pyscenic/utils.py +++ b/src/pyscenic/utils.py @@ -261,14 +261,18 @@ def iter_modules(adjc, context): # Relationship between TF and its target, i.e. activator or repressor, is derived using the original expression # profiles. The Pearson product-moment correlation coefficient is used to derive this information. - # Add correlation column and create two disjoint set of adjacencies. - LOGGER.info("Calculating Pearson correlations.") - # test for genes present in the adjacencies but not present in the expression matrix: - unique_adj_genes = set(adjacencies[COLUMN_NAME_TF]).union(set(adjacencies[COLUMN_NAME_TARGET])) - set(ex_mtx.columns) - assert len(unique_adj_genes)==0, f"Found {len(unique_adj_genes)} genes present in the network (adjacencies) output, but missing from the expression matrix. Is this a different gene expression matrix?" - LOGGER.warn(f"Note on correlation calculation: the default behaviour for calculating the correlations has changed after pySCENIC verion 0.9.16. Previously, the default was to calculate the correlation between a TF and target gene using only cells with non-zero expression values (mask_dropouts=True). The current default is now to use all cells to match the behavior of the R verision of SCENIC. The original settings can be retained by setting 'rho_mask_dropouts=True' in the modules_from_adjacencies function, or '--mask_dropouts' from the CLI.\n\tDropout masking is currently set to [{rho_mask_dropouts}].") - adjacencies = add_correlation(adjacencies, ex_mtx, - rho_threshold=rho_threshold, mask_dropouts=rho_mask_dropouts) + if not {'regulation', 'rho'}.issubset(adjacencies.columns): + # Add correlation column and create two disjoint set of adjacencies. + LOGGER.info("Calculating Pearson correlations.") + # test for genes present in the adjacencies but not present in the expression matrix: + unique_adj_genes = set(adjacencies[COLUMN_NAME_TF]).union(set(adjacencies[COLUMN_NAME_TARGET])) - set(ex_mtx.columns) + assert len(unique_adj_genes)==0, f"Found {len(unique_adj_genes)} genes present in the network (adjacencies) output, but missing from the expression matrix. Is this a different gene expression matrix?" + LOGGER.warn(f"Note on correlation calculation: the default behaviour for calculating the correlations has changed after pySCENIC verion 0.9.16. Previously, the default was to calculate the correlation between a TF and target gene using only cells with non-zero expression values (mask_dropouts=True). The current default is now to use all cells to match the behavior of the R verision of SCENIC. The original settings can be retained by setting 'rho_mask_dropouts=True' in the modules_from_adjacencies function, or '--mask_dropouts' from the CLI.\n\tDropout masking is currently set to [{rho_mask_dropouts}].") + adjacencies = add_correlation(adjacencies, ex_mtx, + rho_threshold=rho_threshold, mask_dropouts=rho_mask_dropouts) + else: + LOGGER.info("Using existing Pearson correlations from the adjacencies file.") + activating_modules = adjacencies[adjacencies[COLUMN_NAME_REGULATION] > 0.0] if keep_only_activating: modules_iter = iter_modules(activating_modules, frozenset([ACTIVATING_MODULE]))