Added downloading and parsing gene sets.

Currently only supports GMT files
Murali-group · Apr 8, 2020 · dc0b518 · dc0b518
1 parent a07b446
commit dc0b518
Show file tree

Hide file tree

Showing 2 changed files with 190 additions and 22 deletions.
diff --git a/config-files/master-config.yaml b/config-files/master-config.yaml
@@ -28,9 +28,33 @@ dataset_settings:
 
     # Download and parse *gene_sets*, *networks*, *drug targets*, etc.
     datasets_to_download:
-      # TODO 
-        #gene_sets:
-        #  - name: ""
+      #### Gene set options
+      # *name*: the name of this geneset or collection. Will be used as the directory
+      # *file_name*: name to give the parsed file
+      # *url*: the url of the file to download
+      # *file_type*: the format of the file. If not gmt, will be converted to gmt
+      # *unpack_command*: The command to run to unpackage the file/archive. (e.g., unzip, gzip, tar -xvf)
+      # *geneset_collection*: T/F. Specifies if the downloaded file contains many networks
+      # *namespace*: if specified, map to uniprot from the given namespace
+      # *taxon*: the NCBI taxonomy ID of the species to which this network belongs. Currently not used.
+      # *species*: the name of the species (i.e., human, mouse). TODO should be used to line up with the mapping
+      # *prefer_reviewed*: when mapping, if any of the alternate IDs map to a reviewed UniProt ID, keeping only that one. Otherwise keep all UniProt IDs
+      # *sep*: the delimiter of columns in the files
+        genesets:
+          # This is a collection of COVID-19 related gene sets submitted by the crowd
+          - name: "covid19-crowd"
+            file_name: "genesets-uniprotids.gmt"
+            file_type: "gmt"
+            namespace: "gene_name"
+            url: "https://amp.pharm.mssm.edu/covid19/genesets.gmt"
+            # some of these gene sets appear to be from mouse. Their website does not yet have that information available.
+            # see https://github.com/MaayanLab/covid19_crowd_library/issues/58
+            #taxon: 9606
+            #species: "human"
+            mapping_settings:
+              # if there are multiple mappings to UniProt IDs, use only the 'reviewed' one(s) 
+              prefer_reviewed: True
+            sep: "\t"
 
       #### Network options:
       # *name*: name of this network. Will be used as the directory

diff --git a/src/setup_datasets.py b/src/setup_datasets.py
@@ -92,29 +92,184 @@ def setup_dataset_files(datasets_dir, dataset_settings, mapping_settings, **kwar
     if mapping_settings is not None:
         namespace_mappings = setup_mappings(datasets_dir, mapping_settings, **kwargs)
 
+    if 'genesets' in dataset_settings:
+        genesets_dir = "%s/genesets" % (datasets_dir)
+        setup_genesets(genesets_dir, dataset_settings['genesets'], namespace_mappings, **kwargs)
     if 'networks' in dataset_settings:
         networks_dir = "%s/networks" % (datasets_dir)
         setup_networks(networks_dir, dataset_settings['networks'], namespace_mappings, **kwargs)
-    if 'gene_sets' in dataset_settings:
-        print("WARNING: downloading and parsing of 'gene_sets' is not yet implemented. Skipping.")
 
     print("Finished downloading/setting up datasets")
     return
 
 
+def setup_genesets(genesets_dir, geneset_settings, namespace_mappings, **kwargs):
+    """
+    Download all specified geneset files, and map the genes in them to UniProt IDs, if specified.
+    Will write the parsed files in gmt format.
+    Currently supports the following file types: gmt
+    """
+    for geneset in geneset_settings:
+        geneset_file = "%s/%s/%s" % (
+            genesets_dir, geneset['name'], geneset['file_name'])
+
+        if not kwargs.get('force_download') and os.path.isfile(geneset_file):
+            print("%s already exists. Use --force-download to overwrite and re-map to uniprot" % (geneset_file))
+            continue
+        # if this is a geneset collecton, then geneset file should be the name to give to the zip file with the collection inside
+        if geneset.get('geneset_collection') is True:
+            downloaded_file = geneset_file
+        # If this isn't a geneset collection, then download the file and keep the original filename
+        else:
+            downloaded_file = "%s/%s" % (os.path.dirname(geneset_file), geneset['url'].split('/')[-1])
+        # download the file 
+        #try:
+        download_file(geneset['url'], downloaded_file)
+        #except:
+        #    print("Failed to download '%s' using the url '%s'. Skipping" % (geneset_file, geneset['url']))
+        #    continue
+
+        unpack_command = geneset.get('unpack_command')
+        # if its a zip file, unzip first
+        if unpack_command is not None and unpack_command != "":
+            command = "%s %s" % (unpack_command, os.path.basename(downloaded_file))
+            run_command(command, chdir=os.path.dirname(downloaded_file))
+
+        # TODO parse multiple gene sets and put them in a gmt file
+        if geneset.get('geneset_collection') is True:
+            print("WARNING: 'geneset_collection' not yet implemented. Skipping")
+            continue
+        else:
+            mapping_settings = geneset.get('mapping_settings', {})
+            all_mapping_stats = setup_geneset(
+                downloaded_file, geneset_file, namespace_mappings,
+                namespace=geneset.get('namespace'), 
+                prefer_reviewed=mapping_settings.get('prefer_reviewed'),
+                file_type=geneset.get('file_type'),
+                sep=geneset.get('sep')
+            )
+            if all_mapping_stats is not None:
+                write_mapping_stats(all_mapping_stats, os.path.dirname(geneset_file)) 
+
+
+def setup_geneset(
+        geneset_file, new_file, namespace_mappings,
+        namespace=None, prefer_reviewed=True,
+        file_type='gmt', sep='\t'):
+    """
+    *geneset_file*: path/to/original geneset file in edge-list format. First two columns should be the tail and head of the edge
+    *new_file*: path/to/new file to write. Should be a .gmt file since that's the format that will be used
+        i.e., 1st column: geneset name, 2nd column: description, then a tab-separated list of genes 
+    *namespace_mappings*: a dictionary of IDs each mapped to a set of uniprot IDs
+    *namespace*: the namespace of the nodes in the genesets.
+    *prefer_reviewed*: when mapping, if any of the alternate IDs map to a reviewed UniProt ID, keeping only that one. Otherwise keep all UniProt IDs
+    *sep*: the delimiter of columns in the files
+
+    *returns*: a dictionary of mapping statistics
+    """
+    mapping_stats = None
+    if file_type != 'gmt':
+        print("WARNING: file_type '%s' not yet implemented. Skipping" % (file_type))
+        return mapping_stats
+
+    genesets, descriptions = parse_gmt_file(geneset_file)
+
+    if namespace is not None and namespace.lower() not in ['uniprot', 'uniprotkb']:
+        if namespace_mappings is None:
+            print("WARNING: mappings not supplied. Unable to map from '%s' to UniProt IDs." % (namespace))
+            namespace = None
+
+    print("\twriting %s" % (new_file))
+    all_mapping_stats = {}
+    # and re-write the file with the specified columns to keep
+    open_command = gzip.open if '.gz' in new_file else open
+    with open_command(new_file, 'wb' if '.gz' in new_file else 'w') as out:
+        for name, genes in genesets.items():
+            prots = genes
+            if namespace is not None:
+                prots, mapping_stats = map_nodes(genes, namespace_mappings, prefer_reviewed=prefer_reviewed)
+                all_mapping_stats[name] = mapping_stats
+            new_line = "%s\t%s\t%s\n" % (
+                name, descriptions.get(name, ""), '\t'.join(prots))
+            out.write(new_line.encode() if '.gz' in new_file else new_line)
+    # keep track of the mapping statistics
+    return all_mapping_stats if len(all_mapping_stats) > 0 else None
+
+
+def map_nodes(nodes, namespace_mappings, prefer_reviewed=True):
+    num_unmapped_nodes = 0
+    if prefer_reviewed: 
+        # get all of the reviewed prots 
+        reviewed_prots = namespace_mappings.get('reviewed')
+        if reviewed_prots is None:
+            print("\tWarning: 'prefer_reviewed' was specified, but the reviewed status of the UniProt IDs was not given. Skipping this option.")
+    new_nodes = []
+    for n in nodes:
+        new_ns = namespace_mappings.get(n)
+        if new_ns is None or len(new_ns) == 0:
+            num_unmapped_nodes += 1
+            continue
+        if prefer_reviewed and reviewed_prots is not None:
+            # if any of the mapped UniProt IDs are reviewed, keep only those
+            ns_to_keep = new_ns & reviewed_prots
+            new_ns = ns_to_keep if len(ns_to_keep) > 0 else new_ns
+        # keep the nodes in the same order they were originally in
+        new_nodes += list(new_ns)
+    #print("\t%d nodes map to %d new nodes" % (len(nodes), len(new_nodes)))
+    #print("\t%d unmapped edges" % (num_unmapped_edges))
+    mapping_stats = {'num_nodes': len(nodes), 'num_mapped_nodes': len(new_nodes),
+                     'num_unmapped_nodes': num_unmapped_nodes}
+    return new_nodes, mapping_stats
+
+
+def parse_gmt_file(gmt_file):
+    """
+    Parse a gmt file and return a dictionary of the geneset names mapped to the list of genes 
+    """
+    print("Reading %s" % (gmt_file))
+    genesets = {}
+    descriptions = {}
+    open_command = gzip.open if '.gz' in gmt_file else open
+    with open_command(gmt_file, 'r') as f:
+        for line in f:
+            line = line.decode() if '.gz' in gmt_file else line
+            if line[0] == '#':
+                continue
+            line = line.rstrip().split('\t')
+            name, description = line[:2]
+            genes = line[2:]
+            genesets[name] = genes
+            descriptions[name] = description 
+    print("\t%d gene sets" % (len(genesets)))
+    return genesets, descriptions
+
+
+def write_mapping_stats(all_mapping_stats, out_dir):
+    """
+    *all_mapping_stats*: a dictionary from the network name or gene set to the mapping statistics 
+        e.g., # nodes, # edges before and after mapping
+    *out_dir*: folder in which to place the 'mapping-stats.tsv' file
+    """
+    df = pd.DataFrame(all_mapping_stats).T
+    print(df)
+    stats_file = "%s/mapping-stats.tsv" % (out_dir)
+    print("Writing mapping statistics to %s" % (stats_file))
+    df.to_csv(stats_file, sep='\t')
+
+
 def setup_networks(networks_dir, network_settings, namespace_mappings, **kwargs):
     """
-    Download all specified network files, and map the identifiers of the proteins in them to UniProt, if necessary
+    Download all specified network files, and map the identifiers of the proteins in them to UniProt IDs, if necessary
     """
     #global namespace_mappings
     for network in network_settings:
         network_file = "%s/%s/%s" % (
             networks_dir, network['name'], network['file_name'])
 
-        if not kwargs['force_download'] and os.path.isfile(network_file):
+        if not kwargs.get('force_download') and os.path.isfile(network_file):
             print("%s already exists. Use --force-download to overwrite and re-map to uniprot" % (network_file))
             continue
-        elif os.path.isfile(network_file) and kwargs['force_download']:
+        elif kwargs.get('force_download') and os.path.isfile(network_file):
             print("--force-download not yet setup to overwrite the networks. Please manually remove the file(s) and re-run")
             continue
         # if this is a network collecton, then network file should be the name to give to the zip file with the collection inside
@@ -157,11 +312,7 @@ def setup_networks(networks_dir, network_settings, namespace_mappings, **kwargs)
 
             if mapping_stats is not None:
                 all_mapping_stats = {os.path.basename(network_file).split('.')[0]: mapping_stats}
-                df = pd.DataFrame(all_mapping_stats).T
-                print(df)
-                stats_file = "%s/mapping-stats.tsv" % (os.path.dirname(network_file))
-                print("Writing network mapping statistics to %s" % (stats_file))
-                df.to_csv(stats_file, sep='\t')
+                write_mapping_stats(all_mapping_stats, os.path.dirname(network_file)) 
     return 
 
 
@@ -191,6 +342,7 @@ def setup_network_collection(
     files = [f for f in glob.glob("%s/*.*" % (collection_dir)) if f != collection_file]
     # keep track of the network files and the mapping statistics
     net_files = [] 
+    # this will be a dictionary from the network name to the mapping statistics (i.e., # nodes, # edges)
     all_mapping_stats = {}
     for f in files:
         new_f = f
@@ -220,11 +372,7 @@ def setup_network_collection(
     with open('%s/net-files.txt' % (collection_dir), 'w') as out:
         out.write('\n'.join(net_files)+'\n')
     if len(all_mapping_stats) > 0:
-        df = pd.DataFrame(all_mapping_stats).T
-        print(df)
-        stats_file = "%s/mapping-stats.tsv" % (collection_dir)
-        print("Writing network mapping statistics to %s" % (stats_file))
-        df.to_csv(stats_file, sep='\t')
+        write_mapping_stats(all_mapping_stats, collection_dir) 
     print("")
     return df
 
@@ -325,10 +473,6 @@ def map_network(edges, extra_cols, namespace_mappings, prefer_reviewed=True):
     return new_edges, new_extra_cols, mapping_stats
 
 
-def setup_genesets():
-    pass
-
-
 def run_command(command, chdir=None):
     """
     Run the given command using subprocess.