Skip to content

Commit 3de37cb

Browse files
authored
Merge pull request #140 from cflerin/dev
GRN multiprocessing and sparse matrix support
2 parents 41ea9cd + df478d1 commit 3de37cb

File tree

4 files changed

+161
-10
lines changed

4 files changed

+161
-10
lines changed

requirements_docker.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
adjustText==0.7.3
12
anndata==0.6.22.post1
23
annoy==1.15.2
34
ansiwrap==0.8.4
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#!/usr/bin/env python3
2+
3+
import sys
4+
import time
5+
import loompy as lp
6+
import pandas as pd
7+
from multiprocessing import Pool, cpu_count
8+
import argparse
9+
import tqdm
10+
11+
from arboreto.utils import load_tf_names
12+
from arboreto.algo import genie3, grnboost2, _prepare_input
13+
from arboreto.core import SGBM_KWARGS, RF_KWARGS, EARLY_STOP_WINDOW_LENGTH
14+
from arboreto.core import to_tf_matrix, target_gene_indices, infer_partial_network
15+
16+
from pyscenic.cli.utils import load_exp_matrix
17+
18+
19+
################################################################################
20+
################################################################################
21+
22+
def _positive_int(value):
    """argparse ``type`` callable that accepts only integers >= 1.

    Used for ``--num_workers``: a worker count below one would otherwise only
    be rejected when the process pool is created, i.e. after the expression
    matrix has already been loaded.

    :param value: The raw command-line string.
    :return: The parsed integer.
    :raises argparse.ArgumentTypeError: If *value* is not an integer or is < 1.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError('invalid int value: {!r}'.format(value)) from None
    if number < 1:
        raise argparse.ArgumentTypeError('must be a positive integer, got {}'.format(number))
    return number


parser_grn = argparse.ArgumentParser(description='Run Arboreto using a multiprocessing pool')

# Positional inputs. FileType('r') makes argparse verify that the files can be
# opened; the rest of the script only uses the ``.name`` of these handles.
parser_grn.add_argument('expression_mtx_fname',
                        type=argparse.FileType('r'),
                        help='The name of the file that contains the expression matrix for the single cell experiment.'
                             ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
parser_grn.add_argument('tfs_fname',
                        type=argparse.FileType('r'),
                        help='The name of the file that contains the list of transcription factors (TXT; one TF per line).')

# Inference options.
parser_grn.add_argument('-m', '--method', choices=['genie3', 'grnboost2'],
                        default='grnboost2',
                        help='The algorithm for gene regulatory network reconstruction (default: grnboost2).')
parser_grn.add_argument('-o', '--output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help='Output file/stream, i.e. a table of TF-target genes (TSV).')
parser_grn.add_argument('--num_workers',
                        type=_positive_int, default=cpu_count(),
                        help='The number of workers to use. (default: {}).'.format(cpu_count()))
parser_grn.add_argument('--seed', type=int, required=False, default=None,
                        help='Seed value for regressor random state initialization (optional)')

# Expression-matrix loading options.
parser_grn.add_argument('--cell_id_attribute',
                        type=str, default='CellID',
                        help='The name of the column attribute that specifies the identifiers of the cells in the loom file.')
parser_grn.add_argument('--gene_attribute',
                        type=str, default='Gene',
                        help='The name of the row attribute that specifies the gene symbols in the loom file.')
parser_grn.add_argument('--sparse', action='store_true',
                        help='If set, load the expression data as a sparse (CSC) matrix.')
# The value 'yes' (rather than True) is kept because the caller compares
# ``args.transpose == 'yes'``.
parser_grn.add_argument('-t', '--transpose', action='store_const', const='yes',
                        help='Transpose the expression matrix (rows=genes x columns=cells).')
53+
54+
args = parser_grn.parse_args()


################################################################################
################################################################################


# Regressor settings per inference method, stored as
# [regressor_type, regressor_kwargs]. The keys mirror the values that the
# ``--method`` option accepts, so the lookup below always succeeds.
_REGRESSOR_SETTINGS = {
    'grnboost2': ['GBM', SGBM_KWARGS],
    'genie3': ['RF', RF_KWARGS],
}
method_params = _REGRESSOR_SETTINGS[args.method]
71+
72+
73+
def run_infer_partial_network(target_gene_index):
    """Run the partial-network inference for a single target gene.

    This is the function mapped over the worker pool. Besides its argument it
    reads the module-level names ``gene_names``, ``ex_matrix``, ``tf_matrix``,
    ``tf_matrix_gene_names``, ``method_params`` and ``args``, so those must be
    defined in the process that executes it.

    :param target_gene_index: Index of the target gene; used both to look up
        its name in ``gene_names`` and to select its column of ``ex_matrix``.
    :return: The result of ``infer_partial_network`` for that gene (the caller
        concatenates these results with ``pd.concat``).
    """
    regressor_type, regressor_kwargs = method_params
    return infer_partial_network(
        regressor_type=regressor_type,
        regressor_kwargs=regressor_kwargs,
        tf_matrix=tf_matrix,
        tf_matrix_gene_names=tf_matrix_gene_names,
        target_gene_name=gene_names[target_gene_index],
        # Column slice: the expression of the target gene across all cells.
        target_gene_expression=ex_matrix[:, target_gene_index],
        include_meta=False,
        early_stop_window_length=EARLY_STOP_WINDOW_LENGTH,
        seed=args.seed,
    )
88+
89+
90+
def _init_worker(shared):
    """Pool initializer: publish the prepared inputs as module globals.

    ``run_infer_partial_network`` reads ``gene_names``, ``ex_matrix``,
    ``tf_matrix`` and ``tf_matrix_gene_names`` from module scope, but they are
    only assigned under the ``__main__`` guard. A worker that does not inherit
    the parent's memory (``spawn`` / ``forkserver`` start methods) would
    otherwise fail with ``NameError``.

    :param shared: Mapping of global name -> object to install in this worker.
    """
    globals().update(shared)


if __name__ == '__main__':

    # Status messages go to stderr: ``--output`` defaults to stdout, so
    # printing them on stdout would interleave them with the TSV result.
    start_time = time.time()
    loaded = load_exp_matrix(args.expression_mtx_fname.name,
                             (args.transpose == 'yes'),
                             args.sparse,
                             args.cell_id_attribute,
                             args.gene_attribute)

    # The loader only returns a tuple (matrix, gene names, cell ids) for a
    # sparse loom load; delimited text files always come back as a DataFrame,
    # even when --sparse was given, so branch on the returned type and not on
    # the flag.
    if isinstance(loaded, tuple):
        ex_matrix = loaded[0]
        gene_names = loaded[1]
    else:
        ex_matrix = loaded
        gene_names = ex_matrix.columns

    end_time = time.time()
    print(f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...', file=sys.stderr)

    tf_names = load_tf_names(args.tfs_fname.name)
    print(f'Loaded {len(tf_names)} TFs...', file=sys.stderr)

    ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names, tf_names)
    tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names, tf_names)

    print(f'starting {args.method} using {args.num_workers} processes...', file=sys.stderr)
    start_time = time.time()

    # Hand the prepared inputs to every worker explicitly instead of relying
    # on them being inherited from the parent process.
    shared = {
        'gene_names': gene_names,
        'ex_matrix': ex_matrix,
        'tf_matrix': tf_matrix,
        'tf_matrix_gene_names': tf_matrix_gene_names,
    }
    with Pool(args.num_workers, initializer=_init_worker, initargs=(shared,)) as p:
        # imap yields results in input order; wrapping it in list() drains it
        # while tqdm reports progress (one tick per target gene).
        adjs = list(tqdm.tqdm(p.imap(run_infer_partial_network,
                                     target_gene_indices(gene_names, target_genes='all'),
                                     chunksize=1),
                              total=len(gene_names)))

    # Merge the per-gene tables and rank all links by importance.
    adj = pd.concat(adjs).sort_values(by='importance', ascending=False)

    end_time = time.time()
    print(f'Done in {end_time - start_time} seconds.', file=sys.stderr)

    adj.to_csv(args.output, index=False, sep="\t")
130+

src/pyscenic/cli/pyscenic.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def find_adjacencies_command(args):
4242
try:
4343
ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
4444
(args.transpose == 'yes'),
45+
args.sparse,
4546
args.cell_id_attribute,
4647
args.gene_attribute)
4748
except ValueError as e:
@@ -50,8 +51,12 @@ def find_adjacencies_command(args):
5051

5152
tf_names = load_tf_names(args.tfs_fname.name)
5253

53-
n_total_genes = len(ex_mtx.columns)
54-
n_matching_genes = len(ex_mtx.columns.isin(tf_names))
54+
if args.sparse:
55+
n_total_genes = len(ex_mtx[1])
56+
n_matching_genes = len(ex_mtx[1].isin(tf_names))
57+
else:
58+
n_total_genes = len(ex_mtx.columns)
59+
n_matching_genes = len(ex_mtx.columns.isin(tf_names))
5560
if n_total_genes == 0:
5661
LOGGER.error("The expression matrix supplied does not contain any genes. "
5762
"Make sure the extension of the file matches the format (tab separation for TSV and "
@@ -86,6 +91,7 @@ def adjacencies2modules(args):
8691
try:
8792
ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
8893
(args.transpose == 'yes'),
94+
False, # sparse loading is disabled here for now
8995
args.cell_id_attribute,
9096
args.gene_attribute)
9197
except ValueError as e:
@@ -173,6 +179,7 @@ def aucell_command(args):
173179
try:
174180
ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
175181
(args.transpose == 'yes'),
182+
False, # sparse loading is disabled here for now
176183
args.cell_id_attribute,
177184
args.gene_attribute)
178185
except ValueError as e:
@@ -284,6 +291,8 @@ def add_loom_parameters(parser):
284291
group.add_argument('--gene_attribute',
285292
type=str, default=ATTRIBUTE_NAME_GENE,
286293
help='The name of the row attribute that specifies the gene symbols in the loom file.')
294+
group.add_argument('--sparse', action='store_const', const=True, default=False,
295+
help='If set, load the expression data as a sparse matrix. Currently applies to the grn inference step only.')
287296
return parser
288297

289298

src/pyscenic/cli/utils.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def save_df_as_loom(df: pd.DataFrame, fname: str) -> None:
4848

4949

5050
def load_exp_matrix_as_loom(fname,
51+
return_sparse=False,
5152
attribute_name_cell_id: str = ATTRIBUTE_NAME_CELL_IDENTIFIER,
5253
attribute_name_gene: str = ATTRIBUTE_NAME_GENE) -> pd.DataFrame:
5354
"""
@@ -56,13 +57,21 @@ def load_exp_matrix_as_loom(fname,
5657
:param fname: The name of the loom file to load.
5758
:return: A 2-dimensional dataframe (rows = cells x columns = genes).
5859
"""
59-
with lp.connect(fname,mode='r',validate=False) as ds:
60-
# The orientation of the loom file is always:
61-
# - Columns represent cells or aggregates of cells
62-
# - Rows represent genes
63-
return pd.DataFrame(data=ds[:, :],
64-
index=ds.ra[attribute_name_gene],
65-
columns=ds.ca[attribute_name_cell_id]).T
60+
if return_sparse:
61+
with lp.connect(fname,mode='r',validate=False) as ds:
62+
ex_mtx = ds.layers[''].sparse().T.tocsc()
63+
genes = pd.Series(ds.ra[attribute_name_gene])
64+
cells = ds.ca[attribute_name_cell_id]
65+
return ex_mtx, genes, cells
66+
67+
else:
68+
with lp.connect(fname,mode='r',validate=False) as ds:
69+
# The orientation of the loom file is always:
70+
# - Columns represent cells or aggregates of cells
71+
# - Rows represent genes
72+
return pd.DataFrame(data=ds[:, :],
73+
index=ds.ra[attribute_name_gene],
74+
columns=ds.ca[attribute_name_cell_id]).T
6675

6776

6877
FILE_EXTENSION2SEPARATOR = {
@@ -72,6 +81,7 @@ def load_exp_matrix_as_loom(fname,
7281

7382

7483
def load_exp_matrix(fname: str, transpose: bool = False,
84+
return_sparse: bool = False,
7585
attribute_name_cell_id: str = ATTRIBUTE_NAME_CELL_IDENTIFIER,
7686
attribute_name_gene: str = ATTRIBUTE_NAME_GENE) -> pd.DataFrame:
7787
"""
@@ -81,14 +91,15 @@ def load_exp_matrix(fname: str, transpose: bool = False,
8191
8292
:param fname: The name of the file that contains the expression matrix.
8393
:param transpose: Is the expression matrix stored as (rows = genes x columns = cells)?
94+
:param return_sparse: Returns a sparse matrix when loading from loom
8495
:return: A 2-dimensional dataframe (rows = cells x columns = genes).
8596
"""
8697
extension = os.path.splitext(fname)[1].lower()
8798
if extension in FILE_EXTENSION2SEPARATOR.keys():
8899
df = pd.read_csv(fname, sep=FILE_EXTENSION2SEPARATOR[extension], header=0, index_col=0)
89100
return df.T if transpose else df
90101
elif extension == '.loom':
91-
return load_exp_matrix_as_loom(fname, attribute_name_cell_id, attribute_name_gene)
102+
return load_exp_matrix_as_loom(fname, return_sparse, attribute_name_cell_id, attribute_name_gene)
92103
else:
93104
raise ValueError("Unknown file format \"{}\".".format(fname))
94105

0 commit comments

Comments
 (0)