fix find_nodes() and formatting

RomanoLab · Aug 9, 2024 · 83ab717 · 83ab717
1 parent 18c5620
commit 83ab717
Show file tree

Hide file tree

Showing 5 changed files with 223 additions and 69 deletions.
diff --git a/comptox_ai/chemical_featurizer/generate_vectors.py b/comptox_ai/chemical_featurizer/generate_vectors.py
@@ -163,6 +163,7 @@ def retrieve_smiles(
 
     return smiles_list, found_chemical_id_list
 
+
 def generate_3d_conformers(smiles_list):
     """
     Generates a list of mol objects with 3D conformer data from input list of SMILES strings.
@@ -171,7 +172,7 @@ def generate_3d_conformers(smiles_list):
     ----------
     smiles_list : List[str]
         List of SMILES strings.
-        
+
     Returns
     -------
     List[rdkit.Chem.rdchem.Mol]
@@ -189,7 +190,7 @@ def generate_3d_conformers(smiles_list):
     for smiles in smiles_list:
         mol = Chem.MolFromSmiles(smiles)
         if mol is not None:
-            mol = Chem.AddHs(mol)  
+            mol = Chem.AddHs(mol)
             AllChem.EmbedMolecule(mol, randomSeed=0)
             AllChem.UFFOptimizeMolecule(mol)
             mols_with_3d.append(mol)
@@ -220,7 +221,7 @@ def create_vector_table(
     rdkit_descriptors : bool
         Whether full set of rdkit_descriptors should be calculated and incorporated as vectors.
     dtype : type
-        Data type of output df (e.g. float, np.float32, etc.). 
+        Data type of output df (e.g. float, np.float32, etc.).
     molfeat_descriptors : List[str]
         List of features to generate. For possible features, see https://molfeat.datamol.io/featurizers.
      use_original_chemical_ids_for_df_index : bool
@@ -265,29 +266,60 @@ def create_vector_table(
         for feature in molfeat_descriptors:
             print(f"Calculating {feature} descriptors")
 
-            if feature in {"Roberta-Zinc480M-102M", "GPT2-Zinc480M-87M", "ChemGPT-1.2B", "ChemGPT-19M", "ChemGPT-4.7M", "MolT5", "ChemBERTa-77M-MTR", "ChemBERTa-77M-MLM"}:
-                featurizer =  PretrainedHFTransformer(kind=feature, notation='smiles', dtype=dtype) 
-
-            elif feature in {"gin_supervised_masking", "gin_supervised_infomax", "gin_supervised_edgepred", "jtvae_zinc_no_kl", "gin_supervised_contextpred"}:
+            if feature in {
+                "Roberta-Zinc480M-102M",
+                "GPT2-Zinc480M-87M",
+                "ChemGPT-1.2B",
+                "ChemGPT-19M",
+                "ChemGPT-4.7M",
+                "MolT5",
+                "ChemBERTa-77M-MTR",
+                "ChemBERTa-77M-MLM",
+            }:
+                featurizer = PretrainedHFTransformer(
+                    kind=feature, notation="smiles", dtype=dtype
+                )
+
+            elif feature in {
+                "gin_supervised_masking",
+                "gin_supervised_infomax",
+                "gin_supervised_edgepred",
+                "jtvae_zinc_no_kl",
+                "gin_supervised_contextpred",
+            }:
                 featurizer = PretrainedDGLTransformer(kind=feature, dtype=dtype)
-     
+
             else:
-                featurizer = MoleculeTransformer(featurizer=feature, dtype=dtype, verbose=True)
-                if feature in {"desc3D", "desc2D", "electroshape", "usrcat", "usr", "cats3d", "pharm3D-cats", "pharm3D-gobbi", "pharm3D-pmapper"}:
+                featurizer = MoleculeTransformer(
+                    featurizer=feature, dtype=dtype, verbose=True
+                )
+                if feature in {
+                    "desc3D",
+                    "desc2D",
+                    "electroshape",
+                    "usrcat",
+                    "usr",
+                    "cats3d",
+                    "pharm3D-cats",
+                    "pharm3D-gobbi",
+                    "pharm3D-pmapper",
+                }:
                     mol_list = generate_3d_conformers(smiles_list)
                     conformer_3D_flag = True
 
             chemical_list = mol_list if conformer_3D_flag else smiles_list
             vectors.append(featurizer(chemical_list).tolist())
-           
+
             df_column_names.append(feature)
 
     if rdkit_descriptors:
         print(f"Calculating rdkit descriptors")
         mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
         rdkit_features = np.array(
             [
-                np.array(list(Descriptors.CalcMolDescriptors(mol).values()), dtype=dtype)
+                np.array(
+                    list(Descriptors.CalcMolDescriptors(mol).values()), dtype=dtype
+                )
                 for mol in mols
             ]
         )

diff --git a/comptox_ai/db/graph_db.py b/comptox_ai/db/graph_db.py
@@ -428,72 +428,116 @@ def find_node(self, name=None, properties=None):
 
         return node_response[0]["n"]
 
-    def find_nodes(self, properties={}, node_types=[]):
+    def find_nodes(self, search_dict):
         """
         Find multiple nodes by node properties and/or labels.
 
         Parameters
         ----------
-        properties : dict
-        Dict of property values to match in the database query. Each key of
-        `properties` should be a (case-sensitive) node property, and each value
-        should be the value of that property (case- and type-sensitive).
-        node_types : list of str
-        Case sensitive list of strings representing node labels (node types) to
-        include in the results. Two or more node types in a single query may
-        significantly increase runtime. When multiple node labels are given, the
-        results will be the union of all property queries when applied
+        search_dict : Dict[str, Dict[str, Union[str, int, list]]]
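+            Case-sensitive dictionary of nodes to search for, in the format {node_type : {property : property_value(s)}}. When several properties are given for one node type, a node is returned if it matches ANY of them (conditions are OR-ed); a list value matches any of its elements.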
 
         Returns
         -------
-        generator of dict
-        A generator containing dict representations of nodes matching the given
-        query.
+        Dict[str, List[Dict[str, Any]]]
+            Nodes found from the query in the format {node_type : [{node_property : property_value}]}
 
-        Notes
-        -----
-        The value returned in the event of a successful query can be extremely
-        large. To improve performance, the results are returned as a generator
-        rather than a list.
-        """
-        if (not properties) and (len(node_types) == 0):
-            raise ValueError(
-                "Error: Query must contain at least one node property or node type."
-            )
+        Raises
+        ------
+        TypeError
+            If search_dict is not a Dict, if a node_type key is not a str, or if a property value is not a str, int, or list.
 
-        if not properties:
-            warnings.warn(
-                "Warning: No property filters given - the query result may be very large!"
-            )
+        Examples
+        --------
+        >>> db.find_nodes({"Chemical" : {'commonName': ["Hydroxychloroquine", "Warfarin"]}, "Gene" : {"xrefNcbiGene" : 1031, "geneSymbol" : "CDKN1A"}})
+        {'Chemical': [{'commonName': 'Hydroxychloroquine',
+        'synonyms': "+-|Ethanol, 2-[[4-[(7-chloro-4-quinolinyl)amino]pentyl]ethylamino]-|(.+-.)-Hydroxychloroquine|7-Chloro-4-[4-(N-ethyl-N-β-hydroxyethylamino)-1-methylbutylamino]quinoline|7-Chloro-4-[4'-[ethyl (2''-hydroxyethyl) amino]-1'-methylbutylamino]quinoline|7-Chloro-4-[4-[ethyl(2-hydroxyethyl)amino]-1-methylbutylamino]quinoline|7-Chloro-4-[5-(N-ethyl-N-2-hydroxyethylamino)-2-pentyl]aminoquinoline|Ethanol, 2-[[4-[(7-chloro-4-quinolyl)amino]pentyl]ethylamino]-|hidroxicloroquina|Hydroxychlorochin|Oxichloroquine|Oxychlorochin|Oxychloroquine|Racemic Hydroxychloroquine|5-22-10-00280|BRN 0253894|7-Chloro-4-(4-(N-ethyl-N-beta-hydroxyethylamino)-1-methylbutylamino)quinoline|7-Chloro-4-(5-(N-ethyl-N-2-hydroxyethylamino)-2-pentyl)aminoquinoline|EINECS 204-249-8|Oxichlorochinum|Hydroxychloroquinum|Idrossiclorochina|UNII-4QWG6N8QKH|(+-)-hydroxychloroquine|2-((4-((7-chloro-4-quinolyl)amino)pentyl)ethylamino)ethanol|2-(N-(4-(7-chlor-4-chinolylamino)-4-methylbutyl)ethylamino)ethanol|7-chloro-4-(4-(N-ethyl-N-beta-hydroxyethylamino)-1-methylbutylamino)quinoline|7-chloro-4-(4-(ethyl(2-hydroxyethyl)amino)-1-methylbutylamino)quinoline|7-chloro-4-[4-(N-ethyl-N-beta-hydroxyethylamino)-1-methylbutylamino]quinoline|7-chloro-4-[5-(N-ethyl-N-2-hydroxyethylamino)-2-pentyl]aminoquinoline|NSC4375|oxichlorochine",
+        'xrefDrugbank': 'DB01611',
+        'xrefMeSH': 'MESH:D006886',
+        'xrefDTXSID': 'DTXSID8023135',
+        'xrefPubchemSID': '315673741.0',
+        'xrefCasRN': '118-42-3',
+        'uri': 'http://jdr.bio/ontologies/comptox.owl#chemical_dtxsid8023135',
+        'monoisotopicMass': '335.1764402000',
+        'maccs': '00000000000000000000000000000000000000000000000000000000000000000100000000000000101001110011000100101101110111010011101001100100110111110111001001011101010111101111110',
+        'molFormula': 'C18H26ClN3O',
+        'sMILES': 'CCN(CCO)CCCC(C)NC1=CC=NC2=CC(Cl)=CC=C12',
+        'xrefPubchemCID': '3652',
+        'molWeight': '335.8800000000'},
+        {'commonName': 'Warfarin',
+        'synonyms': "Coumadin|2H-1-Benzopyran-2-one, 4-hydroxy-3-(3-oxo-1-phenylbutyl)-|(.+-.)-Warfarin|(.+-.)-Warfarin-alcohol|(RS)-Warfarin|1-(4'-Hydroxy-3'-coumarinyl)-1-phenyl-3-butanone|3-(1'-Phenyl-2'-acetylethyl)-4-hydroxycoumarin|3-(α-Acetonylbenzyl)-4-hydroxycoumarin|3-(α-Phenyl-β-acetylethyl)-4-hydroxycoumarin|3-(α-Phenyl-β-acetylethyl)-4-hydroxy-coumarin|4-Hydroxy-3-(3-oxo-1-phenylbutyl)-2H-1-benzopyran-2-one|4-Hydroxy-3-(3-oxo-1-phenylbutyl)-2H-chromen-2-one|Athrombine-K|BENZOPYRAN(2H-1)-2-ONE, 4-HYDROXY-3-(3-OXO-1- PHENYLBUTYL)-|Brumolin|Coumafen|Coumafene|Coumaphen|Coumarin, 3-(α-acetonylbenzyl)-4-hydroxy-|Coumefene|Dethmor|DL-3-(α-Acetonylbenzyl)-4-hydroxycoumarin|Kumader|Kumatox|NSC 59813|rac-Warfarin|Ratron G|Rodafarin|Rodafarin C|Temus W|Vampirinip II|Vampirinip III|W.A.R.F. 42|WARF compound 42|warfarina|Warfarine|Zoocoumarin|5-18-04-00162|200 coumarin|Arab Rat Death|BRN 1293536|Caswell No. 903|Compound 42|Coumaphene|Coumarins|Cov-R-Tox|Dethnel|Eastern states duocide|EINECS 201-377-6|EPA Pesticide Chemical Code 086002|Frass-ratron|4-Hydroxy-3-(3-oxo-1-phenylbutyl)coumarin|Kypfarin|Liqua-tox|Maag rattentod cum|Mar-frin|Martin's mar-frin|Maveran|Mouse pak|3-(alpha-Phenyl-beta-acetylethyl)-4-hydroxycoumarin|Rat & mice bait|Rat-o-cide #2|Rat-Gard|Rat-B-gon|Rat-Kill|Rat-Mix|Rat-ola|Ratorex|Ratoxin|Rats-No-More|Ratten-koederrohr|Rattenstreupulver Neu Schacht|Rattenstreupulver new schacht|Rattentraenke|Rat-Trol|Rattunal|Rat-A-way|RCRA waste number P001|Ro-Deth|Rodex blox|Rough & ready mouse mix|Solfarin|Spray-trol brand roden-trol|Tox-Hid|Twin light rat away|Warfarat|Warfarin Q|Warfarin plus|DL-3-(alpha-Acetonylbenzyl)-4-hydroxycoumarin|Dicusat E|(Phenyl-1 acetyl-2 ethyl) 3-hydroxy-4 coumarine|3-(alpha-Phenyl-beta-acetylaethyl)-4-hydroxycumarin|4-Hydroxy-3-(3-oxo-1-fenyl-butyl) cumarine|4-Hydroxy-3-(3-oxo-1-phenyl-butyl)-cumarin|4-Idrossi-3-(3-oxo-1-fenil-butil)-cumarine|Warfarinum|UNII-5Q7ZVV76EI",
+        'xrefDrugbank': 'DB00682',
+        'xrefMeSH': 'MESH:D014859',
+        'xrefDTXSID': 'DTXSID5023742',
+        'xrefPubchemSID': '315674265.0',
+        'uri': 'http://jdr.bio/ontologies/comptox.owl#chemical_dtxsid5023742',
+        'xrefCasRN': '81-81-2',
+        'monoisotopicMass': '308.1048589950',
+        'maccs': '00000000000000000000000000000000000000000000000000000000010000000000000000000000000000000101000000100100010000000101000000010101000010001101100111100010101101011011110',
+        'molFormula': 'C19H16O4',
+        'sMILES': 'CC(=O)CC(C1=CC=CC=C1)C1=C(O)C2=CC=CC=C2OC1=O',
+        'xrefAOPWikiStressorID': 195,
+        'xrefPubchemCID': '54678486',
+        'molWeight': '308.3330000000'}],
+        'Gene': [{'typeOfGene': 'protein-coding',
+        'commonName': 'cyclin dependent kinase inhibitor 2C',
+        'xrefOMIM': '603369',
+        'xrefHGNC': '1789',
+        'xrefEnsembl': 'ENSG00000123080',
+        'geneSymbol': 'CDKN2C',
+        'uri': 'http://jdr.bio/ontologies/comptox.owl#gene_cdkn2c',
+        'xrefNcbiGene': 1031},
+        {'typeOfGene': 'protein-coding',
+        'commonName': 'cyclin dependent kinase inhibitor 1A',
+        'xrefOMIM': '116899',
+        'xrefHGNC': '1784',
+        'xrefEnsembl': 'ENSG00000124762',
+        'geneSymbol': 'CDKN1A',
+        'uri': 'http://jdr.bio/ontologies/comptox.owl#gene_cdkn1a',
+        'xrefNcbiGene': 1026}]}
+        """
+
+        result = dict()
+
+        if not isinstance(search_dict, Dict):
+            raise TypeError("search_dict should be a dictionary.")
+
+        for node_type, property_dict in search_dict.items():
+
+            if not isinstance(node_type, str):
+                raise TypeError("node_type should be str.")
+            property_match_substrings = []
+
+            for property, property_value in property_dict.items():
+                if isinstance(property_value, int):
+                    property_match_substrings.append(
+                        f"((n.{property}) = {property_value})"
+                    )
+                elif isinstance(property_value, str):
+                    property_match_substrings.append(
+                        f"((n.{property}) = '{property_value}')"
+                    )
+                elif isinstance(property_value, list):
+                    property_match_substrings.append(
+                        f"((n.{property}) IN {property_value})"
+                    )
+                else:
+                    raise TypeError(
+                        "Property values should be str, int, or list of str or int."
+                    )
 
-        prop_string = ", ".join(
-            [
-                f"{k}: '{v}'" if type(v) == str else f"{k}: {v}"
-                for k, v in properties.items()
-            ]
-        )
+            match_substring = " OR ".join(property_match_substrings)
 
-        # Use a WHERE clause when multiple node types are given
-        if len(node_types) == 1:
-            # Only 1 node label - include it in the MATCH clause
-            match_clause = f"MATCH (n:{node_types[0]} {{ {prop_string} }})"
-            where_clause = ""
-        elif len(node_types) > 1:
-            # Multiple node labels - include them in the WHERE clause
-            match_clause = f"MATCH (n {{ {prop_string} }})"
-            where_clause = " WHERE n:" + " OR n:".join(node_types)
-        else:
-            # No node labels - just use bare MATCH clause and no WHERE clause
-            match_clause = f"MATCH (n {{ {prop_string} }})"
-            where_clause = ""
+            query = f"MATCH (n:{node_type}) WHERE {match_substring} RETURN n"
 
-        query = match_clause + where_clause + " RETURN n;"
+            print(query)
+            response = self.run_cypher(query)
 
-        print(query)
+            response = [node["n"] for node in response]
 
-        nodes_response = self.run_cypher(query)
+            result[node_type] = response
 
-        return (n["n"] for n in nodes_response)
+        return result
 
     def find_relationships(self):
         """

diff --git a/tests/test_chemical_featurizer.py b/tests/test_chemical_featurizer.py
@@ -128,13 +128,20 @@ def test_retrieve_dtxsid(self):
             ["DTXSID8023135", "DTXSID5023742"],
         )
 
+
 class TestGenerate3DConformers:
     def test_generate_3d_conformers(self):
-        mol_list = generate_3d_conformers(["CCN(CCO)CCCC(C)Nc1ccnc2cc(Cl)ccc12", "CC(=O)CC(c1ccccc1)c1c(O)c2ccccc2oc1=O"])
+        mol_list = generate_3d_conformers(
+            [
+                "CCN(CCO)CCCC(C)Nc1ccnc2cc(Cl)ccc12",
+                "CC(=O)CC(c1ccccc1)c1c(O)c2ccccc2oc1=O",
+            ]
+        )
 
         assert Descriptors.MolWt(mol_list[0]) == pytest.approx(335.8789999999995, 0.001)
         assert Descriptors.MolWt(mol_list[1]) == pytest.approx(308.3329999999997, 0.001)
 
+
 class TestCreateVectorTable:
 
     def test_create_vector_table_original_chemical_ids_as_index(self):
@@ -147,7 +154,7 @@ def test_create_vector_table_original_chemical_ids_as_index(self):
         )
         expected_output_df_original_chemical_ids_as_index = pd.read_pickle(df_file_path)
         assert create_vector_table(
-            ["Hydroxychloroquine", "Warfarin"], molfeat_descriptors=['maccs', "erg"]
+            ["Hydroxychloroquine", "Warfarin"], molfeat_descriptors=["maccs", "erg"]
         ).equals(expected_output_df_original_chemical_ids_as_index)
 
     def test_create_vector_table_smiles_as_index(self):
@@ -159,6 +166,6 @@ def test_create_vector_table_smiles_as_index(self):
         expected_output_df_smiles_as_index = pd.read_pickle(df_file_path)
         assert create_vector_table(
             ["Hydroxychloroquine", "Warfarin"],
-            molfeat_descriptors=['maccs', "erg"],
+            molfeat_descriptors=["maccs", "erg"],
             use_original_chemical_ids_for_df_index=False,
         ).equals(expected_output_df_smiles_as_index)