Merge pull request #140 from cflerin/dev

bramvds · web-flow · commit 3de37cbd1925 · 2020-02-27T18:31:45.000+01:00
GRN multiprocessing and sparse matrix support
diff --git a/requirements_docker.txt b/requirements_docker.txt
@@ -1,3 +1,4 @@
+adjustText==0.7.3
 anndata==0.6.22.post1
 annoy==1.15.2
 ansiwrap==0.8.4
diff --git a/scripts/arboreto_with_multiprocessing.py b/scripts/arboreto_with_multiprocessing.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+
+import sys
+import time
+import loompy as lp
+import pandas as pd
+from multiprocessing import Pool, cpu_count
+import argparse
+import tqdm
+
+from arboreto.utils import load_tf_names
+from arboreto.algo import genie3, grnboost2, _prepare_input
+from arboreto.core import SGBM_KWARGS, RF_KWARGS, EARLY_STOP_WINDOW_LENGTH
+from arboreto.core import to_tf_matrix, target_gene_indices, infer_partial_network
+
+from pyscenic.cli.utils import load_exp_matrix
+
+
+################################################################################
+################################################################################
+
+parser_grn = argparse.ArgumentParser(description='Run Arboreto using a multiprocessing pool')
+
+parser_grn.add_argument('expression_mtx_fname',
+        type=argparse.FileType('r'),
+        help='The name of the file that contains the expression matrix for the single cell experiment.'
+        ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
+parser_grn.add_argument('tfs_fname',
+        type=argparse.FileType('r'),
+        help='The name of the file that contains the list of transcription factors (TXT; one TF per line).')
+parser_grn.add_argument('-m', '--method', choices=['genie3', 'grnboost2'],
+        default='grnboost2',
+        help='The algorithm for gene regulatory network reconstruction (default: grnboost2).')
+parser_grn.add_argument('-o', '--output',
+        type=argparse.FileType('w'), default=sys.stdout,
+        help='Output file/stream, i.e. a table of TF-target genes (TSV).')
+parser_grn.add_argument('--num_workers',
+        type=int, default=cpu_count(),
+        help='The number of workers to use. (default: {}).'.format(cpu_count()))
+parser_grn.add_argument('--seed', type=int, required=False, default=None,
+        help='Seed value for regressor random state initialization (optional)')
+
+parser_grn.add_argument('--cell_id_attribute',
+        type=str, default='CellID',
+        help='The name of the column attribute that specifies the identifiers of the cells in the loom file.')
+parser_grn.add_argument('--gene_attribute',
+        type=str, default='Gene',
+        help='The name of the row attribute that specifies the gene symbols in the loom file.')
+parser_grn.add_argument('--sparse', action='store_const', const=True, default=False,
+        help='If set, load the expression data as a sparse (CSC) matrix.')
+parser_grn.add_argument('-t', '--transpose', action='store_const', const = 'yes',
+        help='Transpose the expression matrix (rows=genes x columns=cells).')
+
+args = parser_grn.parse_args()
+
+
+################################################################################
+################################################################################
+
+
+if(args.method == 'grnboost2'):
+    method_params = [
+        'GBM',      # regressor_type
+        SGBM_KWARGS # regressor_kwargs
+        ]
+elif(args.method == 'genie3'):
+    method_params = [
+        'RF',       # regressor_type
+        RF_KWARGS   # regressor_kwargs
+        ]
+
+
+def run_infer_partial_network(target_gene_index):
+    target_gene_name = gene_names[target_gene_index]
+    target_gene_expression = ex_matrix[:, target_gene_index]
+
+    n = infer_partial_network(
+        regressor_type=method_params[0],
+        regressor_kwargs=method_params[1],
+        tf_matrix=tf_matrix,
+        tf_matrix_gene_names=tf_matrix_gene_names,
+        target_gene_name=target_gene_name,
+        target_gene_expression=target_gene_expression,
+        include_meta=False,
+        early_stop_window_length=EARLY_STOP_WINDOW_LENGTH,
+        seed=args.seed)
+    return( n )
+
+
+if __name__ == '__main__':
+
+    start_time = time.time()
+    ex_matrix = load_exp_matrix(args.expression_mtx_fname.name,
+                             (args.transpose == 'yes'),
+                             args.sparse,
+                             args.cell_id_attribute,
+                             args.gene_attribute)
+
+    if args.sparse:
+        gene_names = ex_matrix[1]
+        ex_matrix = ex_matrix[0]
+    else:
+        gene_names = ex_matrix.columns
+
+    end_time = time.time()
+    print(f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...', file=sys.stdout)
+
+    tf_names = load_tf_names(args.tfs_fname.name)
+    print(f'Loaded {len(tf_names)} TFs...', file=sys.stdout)
+
+    ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names, tf_names)
+    tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names, tf_names)
+
+    print(f'starting {args.method} using {args.num_workers} processes...', file=sys.stdout)
+    start_time = time.time()
+
+    with Pool(args.num_workers) as p:
+        adjs = list(tqdm.tqdm(p.imap(run_infer_partial_network,
+                                     target_gene_indices(gene_names, target_genes='all'),
+                                     chunksize=1
+                                     ),
+                              total=len(gene_names)))
+
+    adj = pd.concat(adjs).sort_values(by='importance', ascending=False)
+
+    end_time = time.time()
+    print(f'Done in {end_time - start_time} seconds.', file=sys.stdout)
+
+    adj.to_csv(args.output, index=False, sep="\t")
+
diff --git a/src/pyscenic/cli/pyscenic.py b/src/pyscenic/cli/pyscenic.py
@@ -42,6 +42,7 @@ def find_adjacencies_command(args):
     try:
         ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
                                  (args.transpose == 'yes'),
+                                 args.sparse,
                                  args.cell_id_attribute,
                                  args.gene_attribute)
     except ValueError as e:
@@ -50,8 +51,12 @@ def find_adjacencies_command(args):
 
     tf_names = load_tf_names(args.tfs_fname.name)
 
-    n_total_genes = len(ex_mtx.columns)
-    n_matching_genes = len(ex_mtx.columns.isin(tf_names))
+    if args.sparse:
+        n_total_genes = len(ex_mtx[1])
+        n_matching_genes = len(ex_mtx[1].isin(tf_names))
+    else:
+        n_total_genes = len(ex_mtx.columns)
+        n_matching_genes = len(ex_mtx.columns.isin(tf_names))
     if n_total_genes == 0:
         LOGGER.error("The expression matrix supplied does not contain any genes. "
                      "Make sure the extension of the file matches the format (tab separation for TSV and "
@@ -86,6 +91,7 @@ def adjacencies2modules(args):
     try:
         ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
                                  (args.transpose == 'yes'),
+                                 False, # sparse loading is disabled here for now
                                  args.cell_id_attribute,
                                  args.gene_attribute)
     except ValueError as e:
@@ -173,6 +179,7 @@ def aucell_command(args):
     try:
         ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
                                  (args.transpose == 'yes'),
+                                 False, # sparse loading is disabled here for now
                                  args.cell_id_attribute,
                                  args.gene_attribute)
     except ValueError as e:
@@ -284,6 +291,8 @@ def add_loom_parameters(parser):
     group.add_argument('--gene_attribute',
                        type=str, default=ATTRIBUTE_NAME_GENE,
                        help='The name of the row attribute that specifies the gene symbols in the loom file.')
+    group.add_argument('--sparse', action='store_const', const=True, default=False,
+                        help='If set, load the expression data as a sparse matrix. Currently applies to the grn inference step only.')
     return parser
 
 
diff --git a/src/pyscenic/cli/utils.py b/src/pyscenic/cli/utils.py
@@ -48,6 +48,7 @@ def save_df_as_loom(df: pd.DataFrame, fname: str) -> None:
 
 
 def load_exp_matrix_as_loom(fname,
+                            return_sparse=False,
                             attribute_name_cell_id: str = ATTRIBUTE_NAME_CELL_IDENTIFIER,
                             attribute_name_gene: str = ATTRIBUTE_NAME_GENE) -> pd.DataFrame:
     """
@@ -56,13 +57,21 @@ def load_exp_matrix_as_loom(fname,
     :param fname: The name of the loom file to load.
     :return: A 2-dimensional dataframe (rows = cells x columns = genes).
     """
-    with lp.connect(fname,mode='r',validate=False) as ds:
-        # The orientation of the loom file is always:
-        #   - Columns represent cells or aggregates of cells
-        # 	- Rows represent genes
-        return pd.DataFrame(data=ds[:, :],
-                            index=ds.ra[attribute_name_gene],
-                            columns=ds.ca[attribute_name_cell_id]).T
+    if return_sparse:
+        with lp.connect(fname,mode='r',validate=False) as ds:
+            ex_mtx = ds.layers[''].sparse().T.tocsc()
+            genes = pd.Series(ds.ra[attribute_name_gene])
+            cells = ds.ca[attribute_name_cell_id]
+        return ex_mtx, genes, cells
+
+    else:
+        with lp.connect(fname,mode='r',validate=False) as ds:
+            # The orientation of the loom file is always:
+            #   - Columns represent cells or aggregates of cells
+            # 	- Rows represent genes
+            return pd.DataFrame(data=ds[:, :],
+                                index=ds.ra[attribute_name_gene],
+                                columns=ds.ca[attribute_name_cell_id]).T
 
 
 FILE_EXTENSION2SEPARATOR = {
@@ -72,6 +81,7 @@ def load_exp_matrix_as_loom(fname,
 
 
 def load_exp_matrix(fname: str, transpose: bool = False,
+                    return_sparse: bool = False,
                     attribute_name_cell_id: str = ATTRIBUTE_NAME_CELL_IDENTIFIER,
                     attribute_name_gene: str = ATTRIBUTE_NAME_GENE) -> pd.DataFrame:
     """
@@ -81,14 +91,15 @@ def load_exp_matrix(fname: str, transpose: bool = False,
 
     :param fname: The name of the file that contains the expression matrix.
     :param transpose: Is the expression matrix stored as (rows = genes x columns = cells)?
+    :param return_sparse: Returns a sparse matrix when loading from loom
     :return: A 2-dimensional dataframe (rows = cells x columns = genes).
     """
     extension = os.path.splitext(fname)[1].lower()
     if extension in FILE_EXTENSION2SEPARATOR.keys():
         df = pd.read_csv(fname, sep=FILE_EXTENSION2SEPARATOR[extension], header=0, index_col=0)
         return df.T if transpose else df
     elif extension == '.loom':
-        return load_exp_matrix_as_loom(fname, attribute_name_cell_id, attribute_name_gene)
+        return load_exp_matrix_as_loom(fname, return_sparse, attribute_name_cell_id, attribute_name_gene)
     else:
         raise ValueError("Unknown file format \"{}\".".format(fname))
 

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+adjustText==0.7.3`
`1`	`2`	`anndata==0.6.22.post1`
`2`	`3`	`annoy==1.15.2`
`3`	`4`	`ansiwrap==0.8.4`