Skip to content

Commit

Permalink
Merge pull request #140 from cflerin/dev
Browse files Browse the repository at this point in the history
GRN multiprocessing and sparse matrix support
  • Loading branch information
bramvds authored Feb 27, 2020
2 parents 41ea9cd + df478d1 commit 3de37cb
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 10 deletions.
1 change: 1 addition & 0 deletions requirements_docker.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
adjustText==0.7.3
anndata==0.6.22.post1
annoy==1.15.2
ansiwrap==0.8.4
Expand Down
130 changes: 130 additions & 0 deletions scripts/arboreto_with_multiprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/usr/bin/env python3

import sys
import time
import loompy as lp
import pandas as pd
from multiprocessing import Pool, cpu_count
import argparse
import tqdm

from arboreto.utils import load_tf_names
from arboreto.algo import genie3, grnboost2, _prepare_input
from arboreto.core import SGBM_KWARGS, RF_KWARGS, EARLY_STOP_WINDOW_LENGTH
from arboreto.core import to_tf_matrix, target_gene_indices, infer_partial_network

from pyscenic.cli.utils import load_exp_matrix


################################################################################
################################################################################

parser_grn = argparse.ArgumentParser(description='Run Arboreto using a multiprocessing pool')

parser_grn.add_argument('expression_mtx_fname',
type=argparse.FileType('r'),
help='The name of the file that contains the expression matrix for the single cell experiment.'
' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
parser_grn.add_argument('tfs_fname',
type=argparse.FileType('r'),
help='The name of the file that contains the list of transcription factors (TXT; one TF per line).')
parser_grn.add_argument('-m', '--method', choices=['genie3', 'grnboost2'],
default='grnboost2',
help='The algorithm for gene regulatory network reconstruction (default: grnboost2).')
parser_grn.add_argument('-o', '--output',
type=argparse.FileType('w'), default=sys.stdout,
help='Output file/stream, i.e. a table of TF-target genes (TSV).')
parser_grn.add_argument('--num_workers',
type=int, default=cpu_count(),
help='The number of workers to use. (default: {}).'.format(cpu_count()))
parser_grn.add_argument('--seed', type=int, required=False, default=None,
help='Seed value for regressor random state initialization (optional)')

parser_grn.add_argument('--cell_id_attribute',
type=str, default='CellID',
help='The name of the column attribute that specifies the identifiers of the cells in the loom file.')
parser_grn.add_argument('--gene_attribute',
type=str, default='Gene',
help='The name of the row attribute that specifies the gene symbols in the loom file.')
parser_grn.add_argument('--sparse', action='store_const', const=True, default=False,
help='If set, load the expression data as a sparse (CSC) matrix.')
parser_grn.add_argument('-t', '--transpose', action='store_const', const = 'yes',
help='Transpose the expression matrix (rows=genes x columns=cells).')

args = parser_grn.parse_args()


################################################################################
################################################################################


if(args.method == 'grnboost2'):
method_params = [
'GBM', # regressor_type
SGBM_KWARGS # regressor_kwargs
]
elif(args.method == 'genie3'):
method_params = [
'RF', # regressor_type
RF_KWARGS # regressor_kwargs
]


def run_infer_partial_network(target_gene_index):
target_gene_name = gene_names[target_gene_index]
target_gene_expression = ex_matrix[:, target_gene_index]

n = infer_partial_network(
regressor_type=method_params[0],
regressor_kwargs=method_params[1],
tf_matrix=tf_matrix,
tf_matrix_gene_names=tf_matrix_gene_names,
target_gene_name=target_gene_name,
target_gene_expression=target_gene_expression,
include_meta=False,
early_stop_window_length=EARLY_STOP_WINDOW_LENGTH,
seed=args.seed)
return( n )


if __name__ == '__main__':

start_time = time.time()
ex_matrix = load_exp_matrix(args.expression_mtx_fname.name,
(args.transpose == 'yes'),
args.sparse,
args.cell_id_attribute,
args.gene_attribute)

if args.sparse:
gene_names = ex_matrix[1]
ex_matrix = ex_matrix[0]
else:
gene_names = ex_matrix.columns

end_time = time.time()
print(f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...', file=sys.stdout)

tf_names = load_tf_names(args.tfs_fname.name)
print(f'Loaded {len(tf_names)} TFs...', file=sys.stdout)

ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names, tf_names)
tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names, tf_names)

print(f'starting {args.method} using {args.num_workers} processes...', file=sys.stdout)
start_time = time.time()

with Pool(args.num_workers) as p:
adjs = list(tqdm.tqdm(p.imap(run_infer_partial_network,
target_gene_indices(gene_names, target_genes='all'),
chunksize=1
),
total=len(gene_names)))

adj = pd.concat(adjs).sort_values(by='importance', ascending=False)

end_time = time.time()
print(f'Done in {end_time - start_time} seconds.', file=sys.stdout)

adj.to_csv(args.output, index=False, sep="\t")

13 changes: 11 additions & 2 deletions src/pyscenic/cli/pyscenic.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def find_adjacencies_command(args):
try:
ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
(args.transpose == 'yes'),
args.sparse,
args.cell_id_attribute,
args.gene_attribute)
except ValueError as e:
Expand All @@ -50,8 +51,12 @@ def find_adjacencies_command(args):

tf_names = load_tf_names(args.tfs_fname.name)

n_total_genes = len(ex_mtx.columns)
n_matching_genes = len(ex_mtx.columns.isin(tf_names))
if args.sparse:
n_total_genes = len(ex_mtx[1])
n_matching_genes = len(ex_mtx[1].isin(tf_names))
else:
n_total_genes = len(ex_mtx.columns)
n_matching_genes = len(ex_mtx.columns.isin(tf_names))
if n_total_genes == 0:
LOGGER.error("The expression matrix supplied does not contain any genes. "
"Make sure the extension of the file matches the format (tab separation for TSV and "
Expand Down Expand Up @@ -86,6 +91,7 @@ def adjacencies2modules(args):
try:
ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
(args.transpose == 'yes'),
False, # sparse loading is disabled here for now
args.cell_id_attribute,
args.gene_attribute)
except ValueError as e:
Expand Down Expand Up @@ -173,6 +179,7 @@ def aucell_command(args):
try:
ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
(args.transpose == 'yes'),
False, # sparse loading is disabled here for now
args.cell_id_attribute,
args.gene_attribute)
except ValueError as e:
Expand Down Expand Up @@ -284,6 +291,8 @@ def add_loom_parameters(parser):
group.add_argument('--gene_attribute',
type=str, default=ATTRIBUTE_NAME_GENE,
help='The name of the row attribute that specifies the gene symbols in the loom file.')
group.add_argument('--sparse', action='store_const', const=True, default=False,
help='If set, load the expression data as a sparse matrix. Currently applies to the grn inference step only.')
return parser


Expand Down
27 changes: 19 additions & 8 deletions src/pyscenic/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def save_df_as_loom(df: pd.DataFrame, fname: str) -> None:


def load_exp_matrix_as_loom(fname,
return_sparse=False,
attribute_name_cell_id: str = ATTRIBUTE_NAME_CELL_IDENTIFIER,
attribute_name_gene: str = ATTRIBUTE_NAME_GENE) -> pd.DataFrame:
"""
Expand All @@ -56,13 +57,21 @@ def load_exp_matrix_as_loom(fname,
:param fname: The name of the loom file to load.
:return: A 2-dimensional dataframe (rows = cells x columns = genes).
"""
with lp.connect(fname,mode='r',validate=False) as ds:
# The orientation of the loom file is always:
# - Columns represent cells or aggregates of cells
# - Rows represent genes
return pd.DataFrame(data=ds[:, :],
index=ds.ra[attribute_name_gene],
columns=ds.ca[attribute_name_cell_id]).T
if return_sparse:
with lp.connect(fname,mode='r',validate=False) as ds:
ex_mtx = ds.layers[''].sparse().T.tocsc()
genes = pd.Series(ds.ra[attribute_name_gene])
cells = ds.ca[attribute_name_cell_id]
return ex_mtx, genes, cells

else:
with lp.connect(fname,mode='r',validate=False) as ds:
# The orientation of the loom file is always:
# - Columns represent cells or aggregates of cells
# - Rows represent genes
return pd.DataFrame(data=ds[:, :],
index=ds.ra[attribute_name_gene],
columns=ds.ca[attribute_name_cell_id]).T


FILE_EXTENSION2SEPARATOR = {
Expand All @@ -72,6 +81,7 @@ def load_exp_matrix_as_loom(fname,


def load_exp_matrix(fname: str, transpose: bool = False,
return_sparse: bool = False,
attribute_name_cell_id: str = ATTRIBUTE_NAME_CELL_IDENTIFIER,
attribute_name_gene: str = ATTRIBUTE_NAME_GENE) -> pd.DataFrame:
"""
Expand All @@ -81,14 +91,15 @@ def load_exp_matrix(fname: str, transpose: bool = False,
:param fname: The name of the file that contains the expression matrix.
:param transpose: Is the expression matrix stored as (rows = genes x columns = cells)?
:param return_sparse: Returns a sparse matrix when loading from loom
:return: A 2-dimensional dataframe (rows = cells x columns = genes).
"""
extension = os.path.splitext(fname)[1].lower()
if extension in FILE_EXTENSION2SEPARATOR.keys():
df = pd.read_csv(fname, sep=FILE_EXTENSION2SEPARATOR[extension], header=0, index_col=0)
return df.T if transpose else df
elif extension == '.loom':
return load_exp_matrix_as_loom(fname, attribute_name_cell_id, attribute_name_gene)
return load_exp_matrix_as_loom(fname, return_sparse, attribute_name_cell_id, attribute_name_gene)
else:
raise ValueError("Unknown file format \"{}\".".format(fname))

Expand Down

0 comments on commit 3de37cb

Please sign in to comment.