diff --git a/config-files/master-config.yaml b/config-files/master-config.yaml index eaaf87e..5942816 100644 --- a/config-files/master-config.yaml +++ b/config-files/master-config.yaml @@ -56,6 +56,50 @@ dataset_settings: prefer_reviewed: True sep: "\t" + #### Gene-drug dataset options: + # *name*: name of this network. Will be used as the directory name + # *file_name*: the name of the file to download + # *unpack_command*: The command to run to unpackage the file/archive. (e.g., unzip, gzip, tar -xvf) + # *unpacked_file*: The name of the file to parse after unpacking the archive. + # *namespace*: the namespace of the nodes in the networks + # *url*: the url of the file to download + # *taxon*: the NCBI taxonomy ID of the species to which this network belongs. Currently not used. + # *species*: the name of the species (i.e., human, mouse). TODO should be used to line up with the mapping + # *prefer_reviewed*: when mapping, if any of the alternate IDs map to a reviewed UniProt ID, keeping only that one. Otherwise keep all UniProt IDs + # *columns_to_keep*: a list of indexes of columns to keep in the new file. Should be >= 2 (first two columns should be the tail and head of the edge) + # If not specified, all columns will be kept + # *sep*: the delimiter of columns in the files + drug-targets: + - name: "pharmgkb" + # the script will also write a gmt file + file_name: "prot-chem-relationships.tsv" + file_type: "table" + # available at https://www.pharmgkb.org/downloads + url: "https://s3.pgkb.org/data/relationships.zip" + unpack_command: "unzip" + unpacked_file: "relationships.tsv" + taxon: 9606 + species: "human" + mapping_settings: + namespace: "pharmgkb" + prefer_reviewed: True + col_to_map: "Entity1_id" + # keep all columns for now + #columns_to_keep: [] + # columns to use in the output gmt file. + # Should be 3 column headers. The first is the drug ID, the second is the description, third is the proteins + gmt_cols: ["Entity2_id", "Entity2_name", "Entity1_id"] + sep: "\t" + filters: + # Keep only the drugs that are associated with a gene for now. + - col: "Association" + vals: ["associated"] + # Keep only the Gene - Chemical interactions for now + - col: "Entity1_type" + vals: ["Gene"] + - col: "Entity2_type" + vals: ["Chemical"] + #### Network options: # *name*: name of this network. Will be used as the directory # *file_name*: the name of the file to download @@ -148,41 +192,6 @@ dataset_settings: columns_to_keep: [] sep: "\t" - #### Gene-drug dataset options: - # *name*: name of this network. Will be used as the directory name - # *file_name*: the name of the file to download - # *unpack_command*: The command to run to unpackage the file/archive. (e.g., unzip, gzip, tar -xvf) - # *namespace*: the namespace of the nodes in the networks - # *url*: the url of the file to download - # *taxon*: the NCBI taxonomy ID of the species to which this network belongs. Currently not used. - # *species*: the name of the species (i.e., human, mouse). TODO should be used to line up with the mapping - # *prefer_reviewed*: when mapping, if any of the alternate IDs map to a reviewed UniProt ID, keeping only that one. Otherwise keep all UniProt IDs - # *gzip_files*: gzip the individual files - # *remove_filename_spaces*: if there are spaces in the file names, remove them - # *columns_to_keep*: a list of indexes of columns to keep in the new file. Should be >= 2 (first two columns should be the tail and head of the edge) - # If not specified, all columns will be kept - # *sep*: the delimiter of columns in the files - drug-targets: - - name: "pharmgkb" - file_name: "relationships.zip" - network_collection: False - collection_settings: - gzip_files: True - remove_filename_spaces: True - weighted: False - unpack_command: "unzip" - namespace: "ensembl" - # available at https://www.pharmgkb.org/downloads - url: "https://s3.pgkb.org/data/relationships.zip" - taxon: 9606 - species: "human" - mapping_settings: - #keep_multiple: False - # if there are multiple mappings to UniProt IDs, use only the 'reviewed' one(s) - prefer_reviewed: True - columns_to_keep: [] - sep: "\t" - # This section contains settings which will be used to create another config file # in order to run the FastSinkSource pipeline. # See https://github.com/jlaw9/FastSinkSource/tree/no-ontology for more info diff --git a/src/setup_datasets.py b/src/setup_datasets.py index aa75943..a71ab8e 100644 --- a/src/setup_datasets.py +++ b/src/setup_datasets.py @@ -95,6 +95,9 @@ def setup_dataset_files(datasets_dir, dataset_settings, mapping_settings, **kwar if 'genesets' in dataset_settings: genesets_dir = "%s/genesets" % (datasets_dir) setup_genesets(genesets_dir, dataset_settings['genesets'], namespace_mappings, **kwargs) + if 'drug-targets' in dataset_settings: + out_dir = "%s/drug-targets" % (datasets_dir) + setup_drug_targets(out_dir, dataset_settings['drug-targets'], namespace_mappings, **kwargs) if 'networks' in dataset_settings: networks_dir = "%s/networks" % (datasets_dir) setup_networks(networks_dir, dataset_settings['networks'], namespace_mappings, **kwargs) @@ -103,6 +106,53 @@ def setup_dataset_files(datasets_dir, dataset_settings, mapping_settings, **kwar return +def setup_drug_targets(drug_targets_dir, drug_target_settings, namespace_mappings, **kwargs): + """ + Download the drug target files, and map the genes in them to UniProt IDs, if specified. + """ + for settings in drug_target_settings: + drug_targets_file = "%s/%s/%s" % ( + drug_targets_dir, settings['name'], settings['file_name']) + downloaded_file = "%s/%s" % (os.path.dirname(drug_targets_file), settings['url'].split('/')[-1]) + download_dir = os.path.dirname(downloaded_file) + mapping_settings = settings.get("mapping_settings", {}) + # add the mapping settings to all settings + settings.update(mapping_settings) + + if not kwargs.get('force_download') and os.path.isfile(drug_targets_file): + print("%s already exists. Use --force-download to overwrite and re-map to uniprot" % (drug_targets_file)) + continue + if kwargs.get('force_download') and os.path.isfile(downloaded_file): + print("Deleting %s and its contents" % (download_dir)) + shutil.rmtree(download_dir) + # download the file + #try: + download_file(settings['url'], downloaded_file) + #except: + # print("Failed to download '%s' using the url '%s'. Skipping" % (drug_target_files, drug_target['url'])) + # continue + unpack_command = settings.get('unpack_command') + # if its a zip file, unzip first + if unpack_command is not None and unpack_command != "": + command = "%s %s" % (unpack_command, os.path.basename(downloaded_file)) + run_command(command, chdir=download_dir) + + unpacked_file = settings.get('unpacked_file') + unpacked_file = downloaded_file if unpacked_file is None else \ + "%s/%s" % (download_dir, unpacked_file) + all_mapping_stats = setup_geneset( + unpacked_file, drug_targets_file, namespace_mappings, **settings) + if all_mapping_stats is not None: + write_mapping_stats(all_mapping_stats, os.path.dirname(drug_targets_file)) + + +#def setup_drug_target_dataset( +# downloaded_file, drug_target_files, namespace_mappings, +# **kwargs): +# +# return all_mapping_stats + + def setup_genesets(genesets_dir, geneset_settings, namespace_mappings, **kwargs): """ Download all specified geneset files, and map the genes in them to UniProt IDs, if specified. @@ -158,7 +208,7 @@ def setup_genesets(genesets_dir, geneset_settings, namespace_mappings, **kwargs) def setup_geneset( geneset_file, new_file, namespace_mappings, namespace=None, prefer_reviewed=True, - file_type='gmt', sep='\t'): + file_type='gmt', sep='\t', **kwargs): """ *geneset_file*: path/to/original geneset file in edge-list format. First two columns should be the tail and head of the edge *new_file*: path/to/new file to write. Should be a .gmt file since that's the format that will be used @@ -171,17 +221,44 @@ def setup_geneset( *returns*: a dictionary of mapping statistics """ mapping_stats = None - if file_type != 'gmt': - print("WARNING: file_type '%s' not yet implemented. Skipping" % (file_type)) - return mapping_stats - - genesets, descriptions = parse_gmt_file(geneset_file) - if namespace is not None and namespace.lower() not in ['uniprot', 'uniprotkb']: if namespace_mappings is None: print("WARNING: mappings not supplied. Unable to map from '%s' to UniProt IDs." % (namespace)) namespace = None + #if file_type != 'gmt': + # print("WARNING: file_type '%s' not yet implemented. Skipping" % (file_type)) + # return mapping_stats + if file_type == 'table': + print("Reading %s" % (geneset_file)) + df = pd.read_csv(geneset_file, sep=sep, index_col=None) + df2 = filter_and_map_table( + df, namespace_mappings, namespace=namespace, + prefer_reviewed=prefer_reviewed, **kwargs) + # now write to the table to file + print("\twriting %s" % (new_file)) + df2.to_csv(new_file, sep=sep, index=None) + + # extract the "genesets" from the table + geneset_name_col, description_col, genes_col = kwargs['gmt_cols'] + print(geneset_name_col, description_col, genes_col) + genesets = {} + descriptions = {} + for geneset_name, geneset_df in df2.groupby(geneset_name_col): + genesets[geneset_name] = set(geneset_df[genes_col]) + descriptions[geneset_name] = set(geneset_df[description_col]).pop() + print("\t%d genesets, %d total genes" % (len(genesets), len(set(g for gs in genesets.values() for g in gs)))) + # don't need to map from a different namespace anymore + namespace = None + gmt_file = kwargs.get('gmt_file') + if gmt_file is None: + file_end = '.'.join(new_file.split('.')[1:]) + gmt_file = new_file.replace(file_end, 'gmt') + new_file = gmt_file + + if file_type == 'gmt': + genesets, descriptions = parse_gmt_file(geneset_file) + print("\twriting %s" % (new_file)) all_mapping_stats = {} # and re-write the file with the specified columns to keep @@ -190,7 +267,9 @@ def setup_geneset( for name, genes in genesets.items(): prots = genes if namespace is not None: - prots, mapping_stats = map_nodes(genes, namespace_mappings, prefer_reviewed=prefer_reviewed) + ids_to_prot, mapping_stats = map_nodes(genes, namespace_mappings, prefer_reviewed=prefer_reviewed) + # get just the set of uniprot IDs to which the genes map + prots = set(p for ps in ids_to_prot.values() for p in ps) all_mapping_stats[name] = mapping_stats new_line = "%s\t%s\t%s\n" % ( name, descriptions.get(name, ""), '\t'.join(prots)) @@ -199,14 +278,57 @@ def setup_geneset( return all_mapping_stats if len(all_mapping_stats) > 0 else None +def filter_and_map_table( + df, namespace_mappings, namespace=None, + prefer_reviewed=None, col_to_map=0, + filters=None, **kwargs): + # first apply the filters + if filters is not None: + for filter_to_apply in filters: + col, vals = filter_to_apply['col'], filter_to_apply['vals'] + print("\tkeeping only %s values in the '%s' column" % (str(vals), col)) + df = df[df[col].isin(vals)] + #print(df.head()) + num_rows = len(df.index) + print("\t%d rows in table" % (num_rows)) + # now map the ids to UniProt + if namespace is not None: + # the map the ids column to uniprot + col_to_map = df.columns[col_to_map] if isinstance(col_to_map, int) else col_to_map + ids_to_map = set(df[col_to_map]) + ids_to_prot, mapping_stats = map_nodes(ids_to_map, namespace_mappings, prefer_reviewed=prefer_reviewed) + print(mapping_stats) + # now update the dataframe with UniProt IDs + df[col_to_map+"_orig"] = df[col_to_map] + df.set_index(col_to_map, inplace=True) + orig_cols = df.columns + df['uniprot'] = pd.Series(ids_to_prot) + # since there could be cases where a single id maps to multiple ids, we need to expand the rows + # TODO what if I keep only one of the mapped uniprot IDs instead of allowing for multiple? + df2 = pd.concat([df['uniprot'].apply(pd.Series), df], axis=1) \ + .drop('uniprot', axis=1) \ + .melt(id_vars=orig_cols, value_name="uniprot") \ + .drop("variable", axis=1) \ + .dropna(subset=['uniprot']) + # keep the original column name + df2.rename(columns={'uniprot':col_to_map}, inplace=True) + # keep the original order + df2 = df2[[col_to_map]+list(orig_cols)] + print("\t%d rows mapped to %d rows" % (num_rows, len(df2))) + print(df2.head()) + return df2 + def map_nodes(nodes, namespace_mappings, prefer_reviewed=True): + """ + *returns*: a dictionary of node ID to a set of uniprot IDs + """ num_unmapped_nodes = 0 if prefer_reviewed: # get all of the reviewed prots reviewed_prots = namespace_mappings.get('reviewed') if reviewed_prots is None: print("\tWarning: 'prefer_reviewed' was specified, but the reviewed status of the UniProt IDs was not given. Skipping this option.") - new_nodes = [] + ids_to_prot = {} for n in nodes: new_ns = namespace_mappings.get(n) if new_ns is None or len(new_ns) == 0: @@ -216,13 +338,13 @@ def map_nodes(nodes, namespace_mappings, prefer_reviewed=True): # if any of the mapped UniProt IDs are reviewed, keep only those ns_to_keep = new_ns & reviewed_prots new_ns = ns_to_keep if len(ns_to_keep) > 0 else new_ns - # keep the nodes in the same order they were originally in - new_nodes += list(new_ns) + ids_to_prot[n] = list(new_ns) + new_nodes = set(p for ps in ids_to_prot.values() for p in ps) #print("\t%d nodes map to %d new nodes" % (len(nodes), len(new_nodes))) #print("\t%d unmapped edges" % (num_unmapped_edges)) mapping_stats = {'num_nodes': len(nodes), 'num_mapped_nodes': len(new_nodes), 'num_unmapped_nodes': num_unmapped_nodes} - return new_nodes, mapping_stats + return ids_to_prot, mapping_stats def parse_gmt_file(gmt_file): @@ -512,7 +634,7 @@ def download_file(url, file_path): #except urllib.error: except Exception as e: print(e) - print("WARNING: wget failed. Attempting to use the requests library") + print("WARNING: wget failed. Attempting to use the requests library to download the file") # wget didn't work for STRING, gave a 403 error. Using the requests library is working r = requests.get(url, stream=True) if r.status_code == 200: