Skip to content

Commit

Permalink
Merge pull request #181 from RobokopU24/realign_bindingdb
Browse files Browse the repository at this point in the history
aligned bindingdb
  • Loading branch information
beasleyjonm authored Sep 18, 2023
2 parents 89183d8 + e2eee0c commit 6167ba6
Showing 1 changed file with 67 additions and 44 deletions.
111 changes: 67 additions & 44 deletions parsers/BINDING/src/loadBINDINGDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,14 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
#We may not even use the thresholds, that way all data can be captured.
self.affinity_threshold = LOG_SCALE_AFFINITY_THRESHOLD

# self.KI_predicate = 'biolink:binds'
# self.IC50_predicate = 'biolink:negatively_regulates_activity_of'
# self.KD_predicate = 'biolink:binds'
# self.EC50_predicate = 'biolink:regulates_activity_of'
# self.KON_predicate = 'biolink:binds'
# self.KOFF_predicate = 'biolink:binds'
self.measure_to_predicate = {
"pKi": "biolink:binds",
"pIC50": "CTD:decreases_activity_of",
"pKd": "biolink:binds",
"pEC50": "CTD:increases_activity_of",
"k_on": "biolink:binds",
"k_off": "biolink:binds"
}

self.bindingdb_version = '202307' # TODO temporarily hard coded until renci connection bug is resolved
self.bindingdb_version = self.get_latest_source_version()
Expand Down Expand Up @@ -108,6 +110,8 @@ def get_data(self) -> int:
def parse_data(self) -> dict:
"""
Parses the data file for graph nodes/edges
We are going to merge rows that have the same ligand, protein, and affinity type. This will allow us to
calculate a single affinity value for each edge.
:return: ret_val: load_metadata
"""
Expand All @@ -129,56 +133,75 @@ def parse_data(self) -> dict:
if (ligand == '') or (protein == ''): # Check if Pubchem or UniProt ID is missing.
n+=1
continue
ligand_protein_key = f"{ligand}~{protein}"
# The section below checks whether this ligand~protein key was seen before and reuses the existing entry
found_key = False
index = None
if ligand_protein_key in data_store: #TODO start here
entry = data_store[ligand_protein_key]
found_key = True
else:
entry = {}
entry.update({'ligand':f"PUBCHEM.COMPOUND:{ligand}"})
entry.update({'protein':f"UniProtKB:{protein}"})

publications = [x for x in [f"pmid:{row[BD_EDGEUMAN.PMID.value]}",f"pubchem_aid:{row[BD_EDGEUMAN.PUBCHEM_AID.value]}",f"patent:{row[BD_EDGEUMAN.PATENT_NUMBER.value]}"] if x not in ['pmid:','pubchem_aid:','patent:']]
if row[BD_EDGEUMAN.pKi.value] != '':
publication = f"PMID:{row[BD_EDGEUMAN.PMID.value]}"
else:
publication = None

for column in columns:

if row[column[0]] != '':
measure_type = column[1]
if measure_type not in entry.keys():
entry.update({measure_type:[]})
try:
if measure_type in ["k_on", "k_off"]:
value = round(float(row[column[0]].replace('>','').replace('<','').replace(' ','')),2)
elif measure_type in ["pKi", "pKd", "pIC50", "pEC50"]:
value = round(negative_log(float(row[column[0]].replace('>','').replace('<','').replace(' ',''))),2)
except Exception as e:
self.logger.info(f"Error:{e} on value: {row[column[0]]} {measure_type}")
value = "undefined"


entry[measure_type].append({
AFFINITY:value,
PUBLICATIONS:publications
})

if PUBLICATIONS not in entry.keys():
entry.update({PUBLICATIONS: []})
entry[PUBLICATIONS] = list(set(entry[PUBLICATIONS] + publications))

if found_key:
data_store[ligand_protein_key] = entry
else:
data_store.update({ligand_protein_key:entry})
if measure_type in ["k_on", "k_off"]:
# JMB says:
# These are just rate terms used to calculate Kd/Ki so each row with a k_on/k_off value
# already has another measurement type in the row, and that other measurement has far more value.
continue
ligand_protein_measure_key = f"{ligand}~{protein}~{measure_type}"
                        # The section below checks whether this ligand~protein~measure key was seen before and reuses the existing entry
if ligand_protein_measure_key in data_store: # TODO start here
entry = data_store[ligand_protein_measure_key]
found_key = True
else:
entry = {}
entry.update({'ligand': f"PUBCHEM.COMPOUND:{ligand}"})
entry.update({'protein': f"UniProtKB:{protein}"})
entry.update({'predicate': self.measure_to_predicate[measure_type]})
entry.update({'affinity_parameter': measure_type})
entry.update({'supporting_affinities': []})
entry.update({'publications': []})
data_store[ligand_protein_measure_key] = entry
                        # If there's a > in the result, it means that this is a dead compound, i.e. it won't pass
                        # our activity/inhibition threshold
if ">" in row[column[0]]:
continue
sa = float(row[column[0]].replace('>','').replace('<','').replace(' ',''))
# I don't see how 0 would be a valid affinity value, so we'll skip it
if sa == 0:
continue
entry["supporting_affinities"].append(sa)
if publication is not None and publication not in entry["publications"]:
entry["publications"].append(publication)

n+=1

bad_entries = set()
for key, entry in data_store.items():
if len(entry["supporting_affinities"]) == 0:
bad_entries.add(key)
continue
if len(entry["publications"]) == 0:
del entry["publications"]
try:
average_affinity = sum(entry["supporting_affinities"])/len(entry["supporting_affinities"])
entry["affinity"] = round(negative_log(average_affinity),2)
entry["supporting_affinities"] = [round(negative_log(x),2) for x in entry["supporting_affinities"]]
except:
bad_entries.add(key)

import json
for badkey in bad_entries:
bad_entry = data_store.pop(badkey)
if len(bad_entry["supporting_affinities"]) == 0:
continue
print(json.dumps(bad_entry,indent=4))

extractor = Extractor(file_writer=self.output_file_writer)
extractor.json_extract(data_store,
lambda item: data_store[item]['ligand'], # subject id
lambda item: data_store[item]['protein'], # object id
lambda item: "biolink:binds",
lambda item: data_store[item]['predicate'], # predicate
lambda item: {}, #Node 1 props
lambda item: {}, #Node 2 props
lambda item: {key:value for key,value in data_store[item].items() if key not in ['ligand','protein']} #Edge props
Expand Down

0 comments on commit 6167ba6

Please sign in to comment.