Skip to content

Commit

Permalink
Merge pull request #181 from RobokopU24/realign_bindingdb
Browse files Browse the repository at this point in the history
aligned bindingdb
  • Loading branch information
beasleyjonm authored Sep 18, 2023
2 parents 89183d8 + e2eee0c commit 6167ba6
Showing 1 changed file with 67 additions and 44 deletions.
111 changes: 67 additions & 44 deletions parsers/BINDING/src/loadBINDINGDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,14 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
#We may not even use the thresholds, that way all data can be captured.
self.affinity_threshold = LOG_SCALE_AFFINITY_THRESHOLD

# self.KI_predicate = 'biolink:binds'
# self.IC50_predicate = 'biolink:negatively_regulates_activity_of'
# self.KD_predicate = 'biolink:binds'
# self.EC50_predicate = 'biolink:regulates_activity_of'
# self.KON_predicate = 'biolink:binds'
# self.KOFF_predicate = 'biolink:binds'
self.measure_to_predicate = {
"pKi": "biolink:binds",
"pIC50": "CTD:decreases_activity_of",
"pKd": "biolink:binds",
"pEC50": "CTD:increases_activity_of",
"k_on": "biolink:binds",
"k_off": "biolink:binds"
}

self.bindingdb_version = '202307' # TODO temporarily hard coded until renci connection bug is resolved
self.bindingdb_version = self.get_latest_source_version()
Expand Down Expand Up @@ -108,6 +110,8 @@ def get_data(self) -> int:
def parse_data(self) -> dict:
"""
Parses the data file for graph nodes/edges
We are going to merge rows that have the same ligand, protein, and affinity type. This will allow us to
calculate a single affinity value for each edge.
:return: ret_val: load_metadata
"""
Expand All @@ -129,56 +133,75 @@ def parse_data(self) -> dict:
if (ligand == '') or (protein == ''): # Check if Pubchem or UniProt ID is missing.
n+=1
continue
ligand_protein_key = f"{ligand}~{protein}"
# The section below checks whether this ligand~protein key was seen before and reuses the existing entry
found_key = False
index = None
if ligand_protein_key in data_store: #TODO start here
entry = data_store[ligand_protein_key]
found_key = True
else:
entry = {}
entry.update({'ligand':f"PUBCHEM.COMPOUND:{ligand}"})
entry.update({'protein':f"UniProtKB:{protein}"})

publications = [x for x in [f"pmid:{row[BD_EDGEUMAN.PMID.value]}",f"pubchem_aid:{row[BD_EDGEUMAN.PUBCHEM_AID.value]}",f"patent:{row[BD_EDGEUMAN.PATENT_NUMBER.value]}"] if x not in ['pmid:','pubchem_aid:','patent:']]
if row[BD_EDGEUMAN.pKi.value] != '':
publication = f"PMID:{row[BD_EDGEUMAN.PMID.value]}"
else:
publication = None

for column in columns:

if row[column[0]] != '':
measure_type = column[1]
if measure_type not in entry.keys():
entry.update({measure_type:[]})
try:
if measure_type in ["k_on", "k_off"]:
value = round(float(row[column[0]].replace('>','').replace('<','').replace(' ','')),2)
elif measure_type in ["pKi", "pKd", "pIC50", "pEC50"]:
value = round(negative_log(float(row[column[0]].replace('>','').replace('<','').replace(' ',''))),2)
except Exception as e:
self.logger.info(f"Error:{e} on value: {row[column[0]]} {measure_type}")
value = "undefined"


entry[measure_type].append({
AFFINITY:value,
PUBLICATIONS:publications
})

if PUBLICATIONS not in entry.keys():
entry.update({PUBLICATIONS: []})
entry[PUBLICATIONS] = list(set(entry[PUBLICATIONS] + publications))

if found_key:
data_store[ligand_protein_key] = entry
else:
data_store.update({ligand_protein_key:entry})
if measure_type in ["k_on", "k_off"]:
# JMB says:
# These are just rate terms used to calculate Kd/Ki so each row with a k_on/k_off value
# already has another measurement type in the row, and that other measurement has far more value.
continue
ligand_protein_measure_key = f"{ligand}~{protein}~{measure_type}"
                        # The section below checks whether this ligand~protein~measure key was seen before and reuses the existing entry
if ligand_protein_measure_key in data_store: # TODO start here
entry = data_store[ligand_protein_measure_key]
found_key = True
else:
entry = {}
entry.update({'ligand': f"PUBCHEM.COMPOUND:{ligand}"})
entry.update({'protein': f"UniProtKB:{protein}"})
entry.update({'predicate': self.measure_to_predicate[measure_type]})
entry.update({'affinity_parameter': measure_type})
entry.update({'supporting_affinities': []})
entry.update({'publications': []})
data_store[ligand_protein_measure_key] = entry
                        # If there's a > in the result, it means that this is a dead compound, i.e. it won't pass
                        # our activity/inhibition threshold
if ">" in row[column[0]]:
continue
sa = float(row[column[0]].replace('>','').replace('<','').replace(' ',''))
# I don't see how 0 would be a valid affinity value, so we'll skip it
if sa == 0:
continue
entry["supporting_affinities"].append(sa)
if publication is not None and publication not in entry["publications"]:
entry["publications"].append(publication)

n+=1

bad_entries = set()
for key, entry in data_store.items():
if len(entry["supporting_affinities"]) == 0:
bad_entries.add(key)
continue
if len(entry["publications"]) == 0:
del entry["publications"]
try:
average_affinity = sum(entry["supporting_affinities"])/len(entry["supporting_affinities"])
entry["affinity"] = round(negative_log(average_affinity),2)
entry["supporting_affinities"] = [round(negative_log(x),2) for x in entry["supporting_affinities"]]
except:
bad_entries.add(key)

import json
for badkey in bad_entries:
bad_entry = data_store.pop(badkey)
if len(bad_entry["supporting_affinities"]) == 0:
continue
print(json.dumps(bad_entry,indent=4))

extractor = Extractor(file_writer=self.output_file_writer)
extractor.json_extract(data_store,
lambda item: data_store[item]['ligand'], # subject id
lambda item: data_store[item]['protein'], # object id
lambda item: "biolink:binds",
lambda item: data_store[item]['predicate'], # predicate
lambda item: {}, #Node 1 props
lambda item: {}, #Node 2 props
lambda item: {key:value for key,value in data_store[item].items() if key not in ['ligand','protein']} #Edge props
Expand Down

0 comments on commit 6167ba6

Please sign in to comment.