Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changeable cutoff for number of genes found in DB for module2df #387

Open
wants to merge 12 commits into
base: dev
Choose a base branch
from
2 changes: 2 additions & 0 deletions .github/ISSUE_TEMPLATE/bug-error-report.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ assignees: ''
**Describe the bug**
A clear and concise description of what the bug is.

> Note that most *errors* are due to the input from the user, and therefore should be treated as questions in the Discussions. Please only report them as bugs if you are quite certain that they are not behaving as expected.

**Steps to reproduce the behavior**
1. Command run when the error occurred:
<!-- Please specify the command used (if applicable, otherwise delete this block): -->
Expand Down
14 changes: 14 additions & 0 deletions .github/ISSUE_TEMPLATE/no-questions-here-please-.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
name: No questions here please!
about: Questions about the results, design, or run strategy for pySCENIC
title: "[results]"
labels: question
assignees: ''

---

For **questions** about using SCENIC, please use the Discussions: https://github.com/aertslab/SCENIC/discussions/

Create an issue only to report **bugs**.

> Note that most *errors* are due to the input from the user, and therefore should be treated as questions in the Discussions. Please only report them as bugs if you are quite certain that they are not behaving as expected.
10 changes: 0 additions & 10 deletions .github/ISSUE_TEMPLATE/results-design-questions.md

This file was deleted.

1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ Additional resources
For more information, please visit LCB_,
the main `SCENIC website <https://scenic.aertslab.org/>`_,
or `SCENIC (R version) <https://github.com/aertslab/SCENIC>`_.
There is a tutorial to `create new cisTarget databases <https://github.com/aertslab/create_cisTarget_databases>`_.
The CLI to pySCENIC has also been streamlined into a pipeline that can be run with a single command, using the Nextflow workflow manager.
There are two Nextflow implementations available:

Expand Down
4 changes: 4 additions & 0 deletions src/pyscenic/cli/pyscenic.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ def prune_targets_command(args):
client_or_address=args.mode,
module_chunksize=args.chunk_size,
num_workers=args.num_workers,
frac_mapping_module=args.frac_mapping_module
)

LOGGER.info("Writing results to file.")
Expand Down Expand Up @@ -372,6 +373,9 @@ def add_module_parameters(parser):
group.add_argument(
'--min_genes', type=int, default=20, help='The minimum number of genes in a module (default: 20).'
)
group.add_argument(
'--frac_mapping_module', type=float, default=0.8, help='Minimum fraction of genes per module needed to be annotated in the database (default: 0.8)'
)
group.add_argument(
'--expression_mtx_fname',
type=argparse.FileType('r'),
Expand Down
3 changes: 2 additions & 1 deletion src/pyscenic/prune.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ def prune2df(
num_workers=None,
module_chunksize=100,
filter_for_annotation=True,
frac_mapping_module=0.8
) -> pd.DataFrame:
"""
Calculate all regulons for a given sequence of ranking databases and a sequence of co-expression modules.
Expand Down Expand Up @@ -391,7 +392,7 @@ def prune2df(
filter_for_annotation=filter_for_annotation,
)
transformation_func = partial(
modules2df, module2features_func=module2features_func, weighted_recovery=weighted_recovery
modules2df, module2features_func=module2features_func, weighted_recovery=weighted_recovery, frac_mapping_module=frac_mapping_module
)
# Create a distributed dataframe from individual delayed objects to avoid out of memory problems.
aggregation_func = (
Expand Down
10 changes: 7 additions & 3 deletions src/pyscenic/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def module2df(
weighted_recovery=False,
return_recovery_curves=False,
module2features_func=module2features,
frac_mapping_module=0.8,
) -> pd.DataFrame:
""" """
# Derive enriched and TF-annotated features for module.
Expand All @@ -241,9 +242,9 @@ def module2df(
# If less than the required fraction of the genes (frac_mapping_module, 80% by default) are mapped to the ranking database, the module is skipped.
n_missing = len(module) - len(genes)
frac_missing = float(n_missing) / len(module)
if frac_missing >= 0.20:
if frac_missing >= (1-frac_mapping_module):
LOGGER.warning(
"Less than 80% of the genes in {} could be mapped to {}. Skipping this module.".format(module.name, db.name)
"Less than {}% of the genes in {} could be mapped to {}. Skipping this module.".format(frac_mapping_module*100,module.name, db.name)
)
return DF_META_DATA

Expand Down Expand Up @@ -293,12 +294,13 @@ def modules2df(
weighted_recovery=False,
return_recovery_curves=False,
module2features_func=module2features,
frac_mapping_module=0.8,
) -> pd.DataFrame:
# Make sure return recovery curves is always set to false because the metadata for the distributed dataframe needs
# to be fixed for the dask framework.
# TODO: Remove this restriction.
return pd.concat(
[module2df(db, module, motif_annotations, weighted_recovery, False, module2features_func) for module in modules]
[module2df(db, module, motif_annotations, weighted_recovery, False, module2features_func, frac_mapping_module) for module in modules]
)


Expand Down Expand Up @@ -443,6 +445,7 @@ def module2regulon(
weighted_recovery=weighted_recovery,
return_recovery_curves=return_recovery_curves,
module2features_func=module2features_func,
frac_mapping_module=0.8,
)
if len(df) == 0:
return None
Expand All @@ -467,5 +470,6 @@ def modules2regulons(
weighted_recovery=weighted_recovery,
return_recovery_curves=return_recovery_curves,
module2features_func=module2features_func,
frac_mapping_module=0.8,
)
return [] if len(df) == 0 else df2regulons(df)