
Update to download and parse drug target datasets
Currently they are treated as gene sets.
Adds to issue #5
jlaw9 committed Apr 9, 2020
1 parent 05dec00 commit 3e33620
Showing 2 changed files with 179 additions and 48 deletions.
79 changes: 44 additions & 35 deletions config-files/master-config.yaml
@@ -56,6 +56,50 @@ dataset_settings:
prefer_reviewed: True
sep: "\t"

#### Gene-drug dataset options:
# *name*: name of this dataset. Will be used as the directory name
# *file_name*: the name of the file to download
# *unpack_command*: the command to run to unpack the file/archive (e.g., unzip, gzip, tar -xvf)
# *unpacked_file*: the name of the file to parse after unpacking the archive
# *namespace*: the namespace of the nodes in the dataset
# *url*: the URL of the file to download
# *taxon*: the NCBI taxonomy ID of the species to which this dataset belongs. Currently not used.
# *species*: the name of the species (e.g., human, mouse). TODO should be used to line up with the mapping
# *prefer_reviewed*: when mapping, if any of the alternate IDs map to a reviewed UniProt ID, keep only that one; otherwise keep all UniProt IDs
# *columns_to_keep*: a list of column indexes to keep in the new file. Should contain at least 2 (the first two columns should be the tail and head of the edge)
#     If not specified, all columns will be kept
# *sep*: the delimiter of columns in the file
drug-targets:
- name: "pharmgkb"
# the script will also write a gmt file
file_name: "prot-chem-relationships.tsv"
file_type: "table"
# available at https://www.pharmgkb.org/downloads
url: "https://s3.pgkb.org/data/relationships.zip"
unpack_command: "unzip"
unpacked_file: "relationships.tsv"
taxon: 9606
species: "human"
mapping_settings:
namespace: "pharmgkb"
prefer_reviewed: True
col_to_map: "Entity1_id"
# keep all columns for now
#columns_to_keep: []
# columns to use in the output gmt file.
    # Should be 3 column headers: the first is the drug ID, the second is the description, and the third is the proteins (an example line is sketched at the end of this entry)
gmt_cols: ["Entity2_id", "Entity2_name", "Entity1_id"]
sep: "\t"
filters:
# Keep only the drugs that are associated with a gene for now.
- col: "Association"
vals: ["associated"]
# Keep only the Gene - Chemical interactions for now
- col: "Entity1_type"
vals: ["Gene"]
- col: "Entity2_type"
vals: ["Chemical"]

#### Network options:
# *name*: name of this network. Will be used as the directory
# *file_name*: the name of the file to download
@@ -148,41 +192,6 @@ dataset_settings:
columns_to_keep: []
sep: "\t"

-#### Gene-drug dataset options:
-# *name*: name of this network. Will be used as the directory name
-# *file_name*: the name of the file to download
-# *unpack_command*: The command to run to unpackage the file/archive. (e.g., unzip, gzip, tar -xvf)
-# *namespace*: the namespace of the nodes in the networks
-# *url*: the url of the file to download
-# *taxon*: the NCBI taxonomy ID of the species to which this network belongs. Currently not used.
-# *species*: the name of the species (i.e., human, mouse). TODO should be used to line up with the mapping
-# *prefer_reviewed*: when mapping, if any of the alternate IDs map to a reviewed UniProt ID, keeping only that one. Otherwise keep all UniProt IDs
-# *gzip_files*: gzip the individual files
-# *remove_filename_spaces*: if there are spaces in the file names, remove them
-# *columns_to_keep*: a list of indexes of columns to keep in the new file. Should be >= 2 (first two columns should be the tail and head of the edge)
-#     If not specified, all columns will be kept
-# *sep*: the delimiter of columns in the files
-drug-targets:
-  - name: "pharmgkb"
-    file_name: "relationships.zip"
-    network_collection: False
-    collection_settings:
-      gzip_files: True
-      remove_filename_spaces: True
-    weighted: False
-    unpack_command: "unzip"
-    namespace: "ensembl"
-    # available at https://www.pharmgkb.org/downloads
-    url: "https://s3.pgkb.org/data/relationships.zip"
-    taxon: 9606
-    species: "human"
-    mapping_settings:
-      #keep_multiple: False
-      # if there are multiple mappings to UniProt IDs, use only the 'reviewed' one(s)
-      prefer_reviewed: True
-    columns_to_keep: []
-    sep: "\t"

# This section contains settings which will be used to create another config file
# in order to run the FastSinkSource pipeline.
# See https://github.com/jlaw9/FastSinkSource/tree/no-ontology for more info
148 changes: 135 additions & 13 deletions src/setup_datasets.py
@@ -95,6 +95,9 @@ def setup_dataset_files(datasets_dir, dataset_settings, mapping_settings, **kwar
if 'genesets' in dataset_settings:
genesets_dir = "%s/genesets" % (datasets_dir)
setup_genesets(genesets_dir, dataset_settings['genesets'], namespace_mappings, **kwargs)
if 'drug-targets' in dataset_settings:
out_dir = "%s/drug-targets" % (datasets_dir)
setup_drug_targets(out_dir, dataset_settings['drug-targets'], namespace_mappings, **kwargs)
if 'networks' in dataset_settings:
networks_dir = "%s/networks" % (datasets_dir)
setup_networks(networks_dir, dataset_settings['networks'], namespace_mappings, **kwargs)
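    # e.g., with the pharmgkb config entry, the parsed table would land at
    # <datasets_dir>/drug-targets/pharmgkb/prot-chem-relationships.tsv
    # (an assumed layout, following the name/file_name path pieces used in setup_drug_targets below)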
@@ -103,6 +106,53 @@ def setup_dataset_files(datasets_dir, dataset_settings, mapping_settings, **kwar
return


def setup_drug_targets(drug_targets_dir, drug_target_settings, namespace_mappings, **kwargs):
"""
Download the drug target files, and map the genes in them to UniProt IDs, if specified.
"""
for settings in drug_target_settings:
drug_targets_file = "%s/%s/%s" % (
drug_targets_dir, settings['name'], settings['file_name'])
downloaded_file = "%s/%s" % (os.path.dirname(drug_targets_file), settings['url'].split('/')[-1])
download_dir = os.path.dirname(downloaded_file)
mapping_settings = settings.get("mapping_settings", {})
        # merge the mapping settings into the rest of this entry's settings
settings.update(mapping_settings)

if not kwargs.get('force_download') and os.path.isfile(drug_targets_file):
print("%s already exists. Use --force-download to overwrite and re-map to uniprot" % (drug_targets_file))
continue
if kwargs.get('force_download') and os.path.isfile(downloaded_file):
print("Deleting %s and its contents" % (download_dir))
shutil.rmtree(download_dir)
# download the file
#try:
download_file(settings['url'], downloaded_file)
#except:
# print("Failed to download '%s' using the url '%s'. Skipping" % (drug_target_files, drug_target['url']))
# continue
unpack_command = settings.get('unpack_command')
        # if it's a zip file, unzip it first
if unpack_command is not None and unpack_command != "":
command = "%s %s" % (unpack_command, os.path.basename(downloaded_file))
run_command(command, chdir=download_dir)

unpacked_file = settings.get('unpacked_file')
unpacked_file = downloaded_file if unpacked_file is None else \
"%s/%s" % (download_dir, unpacked_file)
all_mapping_stats = setup_geneset(
unpacked_file, drug_targets_file, namespace_mappings, **settings)
if all_mapping_stats is not None:
write_mapping_stats(all_mapping_stats, os.path.dirname(drug_targets_file))
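
# For the pharmgkb config entry, for example, this loop downloads relationships.zip,
# runs `unzip relationships.zip` from within the download directory, and then
# parses the unpacked relationships.tsv via setup_geneset().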


#def setup_drug_target_dataset(
# downloaded_file, drug_target_files, namespace_mappings,
# **kwargs):
#
# return all_mapping_stats


def setup_genesets(genesets_dir, geneset_settings, namespace_mappings, **kwargs):
"""
Download all specified geneset files, and map the genes in them to UniProt IDs, if specified.
@@ -158,7 +208,7 @@ def setup_genesets(genesets_dir, geneset_settings, namespace_mappings, **kwargs)
def setup_geneset(
geneset_file, new_file, namespace_mappings,
namespace=None, prefer_reviewed=True,
-        file_type='gmt', sep='\t'):
file_type='gmt', sep='\t', **kwargs):
"""
*geneset_file*: path/to/original geneset file in edge-list format. First two columns should be the tail and head of the edge
*new_file*: path/to/new file to write. Should be a .gmt file since that's the format that will be used
@@ -171,17 +221,44 @@ def setup_geneset(
*returns*: a dictionary of mapping statistics
"""
mapping_stats = None
-    if file_type != 'gmt':
-        print("WARNING: file_type '%s' not yet implemented. Skipping" % (file_type))
-        return mapping_stats
-
-    genesets, descriptions = parse_gmt_file(geneset_file)

if namespace is not None and namespace.lower() not in ['uniprot', 'uniprotkb']:
if namespace_mappings is None:
print("WARNING: mappings not supplied. Unable to map from '%s' to UniProt IDs." % (namespace))
namespace = None

#if file_type != 'gmt':
# print("WARNING: file_type '%s' not yet implemented. Skipping" % (file_type))
# return mapping_stats
if file_type == 'table':
print("Reading %s" % (geneset_file))
df = pd.read_csv(geneset_file, sep=sep, index_col=None)
df2 = filter_and_map_table(
df, namespace_mappings, namespace=namespace,
prefer_reviewed=prefer_reviewed, **kwargs)
        # now write the table to file
print("\twriting %s" % (new_file))
df2.to_csv(new_file, sep=sep, index=None)

# extract the "genesets" from the table
geneset_name_col, description_col, genes_col = kwargs['gmt_cols']
print(geneset_name_col, description_col, genes_col)
genesets = {}
descriptions = {}
for geneset_name, geneset_df in df2.groupby(geneset_name_col):
genesets[geneset_name] = set(geneset_df[genes_col])
descriptions[geneset_name] = set(geneset_df[description_col]).pop()
print("\t%d genesets, %d total genes" % (len(genesets), len(set(g for gs in genesets.values() for g in gs))))
# don't need to map from a different namespace anymore
namespace = None
gmt_file = kwargs.get('gmt_file')
if gmt_file is None:
file_end = '.'.join(new_file.split('.')[1:])
gmt_file = new_file.replace(file_end, 'gmt')
new_file = gmt_file
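        # For example (hypothetical rows), a mapped table such as
        #   Entity2_id  Entity2_name  Entity1_id
        #   PA448515    ibuprofen     P23219
        #   PA448515    ibuprofen     P35354
        # would yield genesets = {'PA448515': {'P23219', 'P35354'}} and
        # descriptions = {'PA448515': 'ibuprofen'}, with the output written to
        # prot-chem-relationships.gmt alongside prot-chem-relationships.tsv.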

if file_type == 'gmt':
genesets, descriptions = parse_gmt_file(geneset_file)

print("\twriting %s" % (new_file))
all_mapping_stats = {}
# and re-write the file with the specified columns to keep
@@ -190,7 +267,9 @@
for name, genes in genesets.items():
prots = genes
if namespace is not None:
-            prots, mapping_stats = map_nodes(genes, namespace_mappings, prefer_reviewed=prefer_reviewed)
ids_to_prot, mapping_stats = map_nodes(genes, namespace_mappings, prefer_reviewed=prefer_reviewed)
# get just the set of uniprot IDs to which the genes map
prots = set(p for ps in ids_to_prot.values() for p in ps)
all_mapping_stats[name] = mapping_stats
new_line = "%s\t%s\t%s\n" % (
name, descriptions.get(name, ""), '\t'.join(prots))
@@ -199,14 +278,57 @@ def setup_geneset(
return all_mapping_stats if len(all_mapping_stats) > 0 else None


def filter_and_map_table(
df, namespace_mappings, namespace=None,
prefer_reviewed=None, col_to_map=0,
filters=None, **kwargs):
# first apply the filters
if filters is not None:
for filter_to_apply in filters:
col, vals = filter_to_apply['col'], filter_to_apply['vals']
print("\tkeeping only %s values in the '%s' column" % (str(vals), col))
df = df[df[col].isin(vals)]
#print(df.head())
num_rows = len(df.index)
print("\t%d rows in table" % (num_rows))
# now map the ids to UniProt
if namespace is not None:
        # map the ids column to UniProt
col_to_map = df.columns[col_to_map] if isinstance(col_to_map, int) else col_to_map
ids_to_map = set(df[col_to_map])
ids_to_prot, mapping_stats = map_nodes(ids_to_map, namespace_mappings, prefer_reviewed=prefer_reviewed)
print(mapping_stats)
# now update the dataframe with UniProt IDs
df[col_to_map+"_orig"] = df[col_to_map]
df.set_index(col_to_map, inplace=True)
orig_cols = df.columns
df['uniprot'] = pd.Series(ids_to_prot)
# since there could be cases where a single id maps to multiple ids, we need to expand the rows
# TODO what if I keep only one of the mapped uniprot IDs instead of allowing for multiple?
df2 = pd.concat([df['uniprot'].apply(pd.Series), df], axis=1) \
.drop('uniprot', axis=1) \
.melt(id_vars=orig_cols, value_name="uniprot") \
.drop("variable", axis=1) \
.dropna(subset=['uniprot'])
# keep the original column name
df2.rename(columns={'uniprot':col_to_map}, inplace=True)
# keep the original order
df2 = df2[[col_to_map]+list(orig_cols)]
print("\t%d rows mapped to %d rows" % (num_rows, len(df2)))
print(df2.head())
return df2
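
# A self-contained toy trace of the concat/melt idiom above (hypothetical IDs,
# not part of this commit):
#
#   import pandas as pd
#   df = pd.DataFrame({'Entity1_id': ['G1', 'G2'], 'Entity2_id': ['D1', 'D1']})
#   ids_to_prot = {'G1': ['P1', 'P2'], 'G2': ['P3']}
#   df['Entity1_id_orig'] = df['Entity1_id']
#   df.set_index('Entity1_id', inplace=True)
#   orig_cols = df.columns
#   df['uniprot'] = pd.Series(ids_to_prot)
#   df2 = (pd.concat([df['uniprot'].apply(pd.Series), df], axis=1)
#          .drop('uniprot', axis=1)
#          .melt(id_vars=list(orig_cols), value_name='uniprot')
#          .drop('variable', axis=1)
#          .dropna(subset=['uniprot']))
#
# df['uniprot'].apply(pd.Series) spreads each list of mapped IDs into numbered
# columns (P1/P2 for G1, P3/NaN for G2); melt() stacks those columns back into
# rows and dropna() discards the NaN, so G1 becomes two rows while G2 stays one.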

def map_nodes(nodes, namespace_mappings, prefer_reviewed=True):
"""
*returns*: a dictionary of node ID to a set of uniprot IDs
"""
num_unmapped_nodes = 0
if prefer_reviewed:
# get all of the reviewed prots
reviewed_prots = namespace_mappings.get('reviewed')
if reviewed_prots is None:
print("\tWarning: 'prefer_reviewed' was specified, but the reviewed status of the UniProt IDs was not given. Skipping this option.")
-    new_nodes = []
ids_to_prot = {}
for n in nodes:
new_ns = namespace_mappings.get(n)
if new_ns is None or len(new_ns) == 0:
@@ -216,13 +338,13 @@ def map_nodes(nodes, namespace_mappings, prefer_reviewed=True):
# if any of the mapped UniProt IDs are reviewed, keep only those
ns_to_keep = new_ns & reviewed_prots
new_ns = ns_to_keep if len(ns_to_keep) > 0 else new_ns
-            # keep the nodes in the same order they were originally in
-            new_nodes += list(new_ns)
ids_to_prot[n] = list(new_ns)
new_nodes = set(p for ps in ids_to_prot.values() for p in ps)
#print("\t%d nodes map to %d new nodes" % (len(nodes), len(new_nodes)))
#print("\t%d unmapped edges" % (num_unmapped_edges))
mapping_stats = {'num_nodes': len(nodes), 'num_mapped_nodes': len(new_nodes),
'num_unmapped_nodes': num_unmapped_nodes}
-    return new_nodes, mapping_stats
return ids_to_prot, mapping_stats
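
# For illustration (hypothetical mapping): map_nodes({'COX1'}, namespace_mappings)
# could return ({'COX1': ['P23219']},
#               {'num_nodes': 1, 'num_mapped_nodes': 1, 'num_unmapped_nodes': 0}).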


def parse_gmt_file(gmt_file):
@@ -512,7 +634,7 @@ def download_file(url, file_path):
#except urllib.error:
except Exception as e:
print(e)
print("WARNING: wget failed. Attempting to use the requests library")
print("WARNING: wget failed. Attempting to use the requests library to download the file")
        # wget didn't work for STRING (gave a 403 error), but the requests library works
r = requests.get(url, stream=True)
if r.status_code == 200:
