Skip to content

Commit

Permalink
Added downloading and parsing gene sets.
Browse files Browse the repository at this point in the history
Currently only supports GMT files
  • Loading branch information
jlaw9 committed Apr 8, 2020
1 parent a07b446 commit dc0b518
Show file tree
Hide file tree
Showing 2 changed files with 190 additions and 22 deletions.
30 changes: 27 additions & 3 deletions config-files/master-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,33 @@ dataset_settings:

# Download and parse *gene_sets*, *networks*, *drug targets*, etc.
datasets_to_download:
# TODO
#gene_sets:
# - name: ""
#### Gene set options
# *name*: the name of this geneset or collection. Will be used as the directory
# *file_name*: name to give the parsed file
# *url*: the url of the file to download
# *file_type*: the format of the file. If not gmt, will be converted to gmt
# *unpack_command*: The command to run to unpackage the file/archive. (e.g., unzip, gzip, tar -xvf)
# *geneset_collection*: T/F. Specifies if the downloaded file contains many networks
# *namespace*: if specified, map to uniprot from the given namespace
# *taxon*: the NCBI taxonomy ID of the species to which this network belongs. Currently not used.
# *species*: the name of the species (i.e., human, mouse). TODO should be used to line up with the mapping
# *prefer_reviewed*: when mapping, if any of the alternate IDs map to a reviewed UniProt ID, keeping only that one. Otherwise keep all UniProt IDs
# *sep*: the delimiter of columns in the files
genesets:
# This is a collection of COVID-19 related gene sets submitted by the crowd
- name: "covid19-crowd"
file_name: "genesets-uniprotids.gmt"
file_type: "gmt"
namespace: "gene_name"
url: "https://amp.pharm.mssm.edu/covid19/genesets.gmt"
# some of these gene sets appear to be from mouse. Their website does not yet have that information available.
# see https://github.com/MaayanLab/covid19_crowd_library/issues/58
#taxon: 9606
#species: "human"
mapping_settings:
# if there are multiple mappings to UniProt IDs, use only the 'reviewed' one(s)
prefer_reviewed: True
sep: "\t"

#### Network options:
# *name*: name of this network. Will be used as the directory
Expand Down
182 changes: 163 additions & 19 deletions src/setup_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,29 +92,184 @@ def setup_dataset_files(datasets_dir, dataset_settings, mapping_settings, **kwar
if mapping_settings is not None:
namespace_mappings = setup_mappings(datasets_dir, mapping_settings, **kwargs)

if 'genesets' in dataset_settings:
genesets_dir = "%s/genesets" % (datasets_dir)
setup_genesets(genesets_dir, dataset_settings['genesets'], namespace_mappings, **kwargs)
if 'networks' in dataset_settings:
networks_dir = "%s/networks" % (datasets_dir)
setup_networks(networks_dir, dataset_settings['networks'], namespace_mappings, **kwargs)
if 'gene_sets' in dataset_settings:
print("WARNING: downloading and parsing of 'gene_sets' is not yet implemented. Skipping.")

print("Finished downloading/setting up datasets")
return


def setup_genesets(genesets_dir, geneset_settings, namespace_mappings, **kwargs):
"""
Download all specified geneset files, and map the genes in them to UniProt IDs, if specified.
Will write the parsed files in gmt format.
Currently supports the following file types: gmt
"""
for geneset in geneset_settings:
geneset_file = "%s/%s/%s" % (
genesets_dir, geneset['name'], geneset['file_name'])

if not kwargs.get('force_download') and os.path.isfile(geneset_file):
print("%s already exists. Use --force-download to overwrite and re-map to uniprot" % (geneset_file))
continue
# if this is a geneset collecton, then geneset file should be the name to give to the zip file with the collection inside
if geneset.get('geneset_collection') is True:
downloaded_file = geneset_file
# If this isn't a geneset collection, then download the file and keep the original filename
else:
downloaded_file = "%s/%s" % (os.path.dirname(geneset_file), geneset['url'].split('/')[-1])
# download the file
#try:
download_file(geneset['url'], downloaded_file)
#except:
# print("Failed to download '%s' using the url '%s'. Skipping" % (geneset_file, geneset['url']))
# continue

unpack_command = geneset.get('unpack_command')
# if its a zip file, unzip first
if unpack_command is not None and unpack_command != "":
command = "%s %s" % (unpack_command, os.path.basename(downloaded_file))
run_command(command, chdir=os.path.dirname(downloaded_file))

# TODO parse multiple gene sets and put them in a gmt file
if geneset.get('geneset_collection') is True:
print("WARNING: 'geneset_collection' not yet implemented. Skipping")
continue
else:
mapping_settings = geneset.get('mapping_settings', {})
all_mapping_stats = setup_geneset(
downloaded_file, geneset_file, namespace_mappings,
namespace=geneset.get('namespace'),
prefer_reviewed=mapping_settings.get('prefer_reviewed'),
file_type=geneset.get('file_type'),
sep=geneset.get('sep')
)
if all_mapping_stats is not None:
write_mapping_stats(all_mapping_stats, os.path.dirname(geneset_file))


def setup_geneset(
geneset_file, new_file, namespace_mappings,
namespace=None, prefer_reviewed=True,
file_type='gmt', sep='\t'):
"""
*geneset_file*: path/to/original geneset file in edge-list format. First two columns should be the tail and head of the edge
*new_file*: path/to/new file to write. Should be a .gmt file since that's the format that will be used
i.e., 1st column: geneset name, 2nd column: description, then a tab-separated list of genes
*namespace_mappings*: a dictionary of IDs each mapped to a set of uniprot IDs
*namespace*: the namespace of the nodes in the genesets.
*prefer_reviewed*: when mapping, if any of the alternate IDs map to a reviewed UniProt ID, keeping only that one. Otherwise keep all UniProt IDs
*sep*: the delimiter of columns in the files
*returns*: a dictionary of mapping statistics
"""
mapping_stats = None
if file_type != 'gmt':
print("WARNING: file_type '%s' not yet implemented. Skipping" % (file_type))
return mapping_stats

genesets, descriptions = parse_gmt_file(geneset_file)

if namespace is not None and namespace.lower() not in ['uniprot', 'uniprotkb']:
if namespace_mappings is None:
print("WARNING: mappings not supplied. Unable to map from '%s' to UniProt IDs." % (namespace))
namespace = None

print("\twriting %s" % (new_file))
all_mapping_stats = {}
# and re-write the file with the specified columns to keep
open_command = gzip.open if '.gz' in new_file else open
with open_command(new_file, 'wb' if '.gz' in new_file else 'w') as out:
for name, genes in genesets.items():
prots = genes
if namespace is not None:
prots, mapping_stats = map_nodes(genes, namespace_mappings, prefer_reviewed=prefer_reviewed)
all_mapping_stats[name] = mapping_stats
new_line = "%s\t%s\t%s\n" % (
name, descriptions.get(name, ""), '\t'.join(prots))
out.write(new_line.encode() if '.gz' in new_file else new_line)
# keep track of the mapping statistics
return all_mapping_stats if len(all_mapping_stats) > 0 else None


def map_nodes(nodes, namespace_mappings, prefer_reviewed=True):
num_unmapped_nodes = 0
if prefer_reviewed:
# get all of the reviewed prots
reviewed_prots = namespace_mappings.get('reviewed')
if reviewed_prots is None:
print("\tWarning: 'prefer_reviewed' was specified, but the reviewed status of the UniProt IDs was not given. Skipping this option.")
new_nodes = []
for n in nodes:
new_ns = namespace_mappings.get(n)
if new_ns is None or len(new_ns) == 0:
num_unmapped_nodes += 1
continue
if prefer_reviewed and reviewed_prots is not None:
# if any of the mapped UniProt IDs are reviewed, keep only those
ns_to_keep = new_ns & reviewed_prots
new_ns = ns_to_keep if len(ns_to_keep) > 0 else new_ns
# keep the nodes in the same order they were originally in
new_nodes += list(new_ns)
#print("\t%d nodes map to %d new nodes" % (len(nodes), len(new_nodes)))
#print("\t%d unmapped edges" % (num_unmapped_edges))
mapping_stats = {'num_nodes': len(nodes), 'num_mapped_nodes': len(new_nodes),
'num_unmapped_nodes': num_unmapped_nodes}
return new_nodes, mapping_stats


def parse_gmt_file(gmt_file):
"""
Parse a gmt file and return a dictionary of the geneset names mapped to the list of genes
"""
print("Reading %s" % (gmt_file))
genesets = {}
descriptions = {}
open_command = gzip.open if '.gz' in gmt_file else open
with open_command(gmt_file, 'r') as f:
for line in f:
line = line.decode() if '.gz' in gmt_file else line
if line[0] == '#':
continue
line = line.rstrip().split('\t')
name, description = line[:2]
genes = line[2:]
genesets[name] = genes
descriptions[name] = description
print("\t%d gene sets" % (len(genesets)))
return genesets, descriptions


def write_mapping_stats(all_mapping_stats, out_dir):
"""
*all_mapping_stats*: a dictionary from the network name or gene set to the mapping statistics
e.g., # nodes, # edges before and after mapping
*out_dir*: folder in which to place the 'mapping-stats.tsv' file
"""
df = pd.DataFrame(all_mapping_stats).T
print(df)
stats_file = "%s/mapping-stats.tsv" % (out_dir)
print("Writing mapping statistics to %s" % (stats_file))
df.to_csv(stats_file, sep='\t')


def setup_networks(networks_dir, network_settings, namespace_mappings, **kwargs):
"""
Download all specified network files, and map the identifiers of the proteins in them to UniProt, if necessary
Download all specified network files, and map the identifiers of the proteins in them to UniProt IDs, if necessary
"""
#global namespace_mappings
for network in network_settings:
network_file = "%s/%s/%s" % (
networks_dir, network['name'], network['file_name'])

if not kwargs['force_download'] and os.path.isfile(network_file):
if not kwargs.get('force_download') and os.path.isfile(network_file):
print("%s already exists. Use --force-download to overwrite and re-map to uniprot" % (network_file))
continue
elif os.path.isfile(network_file) and kwargs['force_download']:
elif kwargs.get('force_download') and os.path.isfile(network_file):
print("--force-download not yet setup to overwrite the networks. Please manually remove the file(s) and re-run")
continue
# if this is a network collecton, then network file should be the name to give to the zip file with the collection inside
Expand Down Expand Up @@ -157,11 +312,7 @@ def setup_networks(networks_dir, network_settings, namespace_mappings, **kwargs)

if mapping_stats is not None:
all_mapping_stats = {os.path.basename(network_file).split('.')[0]: mapping_stats}
df = pd.DataFrame(all_mapping_stats).T
print(df)
stats_file = "%s/mapping-stats.tsv" % (os.path.dirname(network_file))
print("Writing network mapping statistics to %s" % (stats_file))
df.to_csv(stats_file, sep='\t')
write_mapping_stats(all_mapping_stats, os.path.dirname(network_file))
return


Expand Down Expand Up @@ -191,6 +342,7 @@ def setup_network_collection(
files = [f for f in glob.glob("%s/*.*" % (collection_dir)) if f != collection_file]
# keep track of the network files and the mapping statistics
net_files = []
# this will be a dictionary from the network name to the mapping statistics (i.e., # nodes, # edges)
all_mapping_stats = {}
for f in files:
new_f = f
Expand Down Expand Up @@ -220,11 +372,7 @@ def setup_network_collection(
with open('%s/net-files.txt' % (collection_dir), 'w') as out:
out.write('\n'.join(net_files)+'\n')
if len(all_mapping_stats) > 0:
df = pd.DataFrame(all_mapping_stats).T
print(df)
stats_file = "%s/mapping-stats.tsv" % (collection_dir)
print("Writing network mapping statistics to %s" % (stats_file))
df.to_csv(stats_file, sep='\t')
write_mapping_stats(all_mapping_stats, collection_dir)
print("")
return df

Expand Down Expand Up @@ -325,10 +473,6 @@ def map_network(edges, extra_cols, namespace_mappings, prefer_reviewed=True):
return new_edges, new_extra_cols, mapping_stats


def setup_genesets():
pass


def run_command(command, chdir=None):
"""
Run the given command using subprocess.
Expand Down

0 comments on commit dc0b518

Please sign in to comment.