Merge pull request #207 from abearab/main

Improvements in the context of knowledge graph resources
mims-harvard · Mar 12, 2024 · 2ccd732 · 2ccd732
2 parents 0e14a64 + 6b73b78
commit 2ccd732
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,7 @@ __pycache__/
 *.so
 
 # Distribution / packaging
+.idea/
 .Python
 build/
 develop-eggs/

diff --git a/tdc/__init__.py b/tdc/__init__.py
@@ -2,3 +2,4 @@
 from .oracles import Oracle
 from .benchmark_deprecated import BenchmarkGroup
 from .tdc_hf import tdc_hf_interface
+from tdc.utils.knowledge_graph import KnowledgeGraph
diff --git a/tdc/resource/primekg.py b/tdc/resource/primekg.py
@@ -5,14 +5,13 @@
 This file contains a primekg dataloader. 
 """
 
-import pandas as pd
 import numpy as np
-import sys
 import warnings
 
-warnings.filterwarnings("ignore")
-
 from ..utils import general_load
+from ..utils.knowledge_graph import KnowledgeGraph
+
+warnings.filterwarnings("ignore")
 
 
 class PrimeKG:
@@ -46,3 +45,37 @@ def get_node_list(self, node_type):
         df = self.df
         return np.unique(df[(df.x_type == node_type)].x_id.unique().tolist() +
                          df[(df.y_type == node_type)].y_id.unique().tolist())
+
+
+class PrimeKGDev(KnowledgeGraph):
+    """PrimeKG data loader built on top of ``KnowledgeGraph``.
+
+    The edge table is loaded with ``general_load("primekg", path, ",")`` and
+    stored on the instance as ``self.df`` by the base-class constructor.
+    """
+
+    def __init__(self, path="./data"):
+        """Load the PrimeKG table and remember ``path`` for later loads.
+
+        Args:
+            path: directory handed to ``general_load``; also reused by
+                ``get_features``.
+        """
+        # Initialise *this* instance through the base class. Rebinding the
+        # local name ``self`` to a fresh KnowledgeGraph would leave the
+        # object being constructed without ``df`` or ``path``.
+        super().__init__(df=general_load("primekg", path, ","))
+        self.path = path
+
+    def get_data(self):
+        """Return the edge table held in ``self.df``."""
+        return self.df
+
+    def to_nx(self):
+        """Build an undirected ``networkx.Graph`` from the edge table.
+
+        Every row contributes an edge between its ``x_id`` and ``y_id``; the
+        edge carries a ``relation`` attribute. Relations are processed one at
+        a time, so when the same node pair occurs under several relations the
+        attribute of the relation processed last is the one that remains.
+        """
+        # Imported lazily so networkx is only needed when this is called.
+        import networkx as nx
+
+        graph = nx.Graph()
+        for relation in self.df.relation.unique():
+            endpoints = self.df[self.df.relation == relation][["x_id",
+                                                               "y_id"]].values
+            graph.add_edges_from(endpoints, relation=relation)
+        return graph
+
+    def get_features(self, feature_type):
+        """Load the feature table for ``"drug"`` or ``"disease"`` nodes.
+
+        Args:
+            feature_type: either ``"drug"`` or ``"disease"``.
+
+        Returns:
+            Whatever ``general_load`` returns for the dataset named
+            ``"primekg_<feature_type>_feature"``, called with ``self.path``
+            and ``"\t"``.
+
+        Raises:
+            ValueError: if ``feature_type`` is anything else.
+        """
+        if feature_type not in ["drug", "disease"]:
+            raise ValueError("feature_type only supports drug/disease!")
+        return general_load("primekg_" + feature_type + "_feature", self.path,
+                            "\t")
+
+    def get_node_list(self, node_type):
+        """Return the sorted unique ids of all nodes of type ``node_type``.
+
+        Ids are collected from ``x_id`` where ``x_type`` matches and from
+        ``y_id`` where ``y_type`` matches, then de-duplicated by ``np.unique``.
+        """
+        df = self.df
+        x_ids = df[(df.x_type == node_type)].x_id.unique().tolist()
+        y_ids = df[(df.y_type == node_type)].y_id.unique().tolist()
+        return np.unique(x_ids + y_ids)
diff --git a/tdc/utils/knowledge_graph.py b/tdc/utils/knowledge_graph.py
@@ -0,0 +1,34 @@
+"""A python module to build, handle, explore, and manipulate knowledge graphs.
+"""
+
+import pandas as pd
+from copy import copy
+
+# Column names of the edge table; used as the header of the empty frame that
+# KnowledgeGraph creates when it is constructed without data.
+kg_columns = [
+    'relation', 'display_relation', 'x_id', 'x_type', 'x_name', 'x_source',
+    'y_id', 'y_type', 'y_name', 'y_source'
+]
+
+
+class KnowledgeGraph:
+    """Thin wrapper around a pandas DataFrame of knowledge-graph edges.
+
+    The table is available as ``self.df``. After ``run_query`` has been
+    called, ``self.df_raw`` holds the table as it was just before that call.
+    """
+
+    def __init__(self, df=None):
+        """Wrap ``df``, or start from an empty table with ``kg_columns``.
+
+        Args:
+            df: edge table to wrap; stored as-is (not copied). When ``None``
+                an empty DataFrame with the ``kg_columns`` header is created.
+        """
+        if df is not None:
+            self.df = df
+        else:
+            # Build the empty frame from the column names alone: passing a
+            # scalar fill value without an index makes the DataFrame
+            # constructor raise ValueError.
+            self.df = pd.DataFrame(columns=kg_columns)
+
+    def copy(self):
+        """Return a shallow copy; it shares the same ``df`` object."""
+        return copy(self)
+
+    def run_query(self, query):
+        """Build a subgraph in place using the given ``DataFrame.query`` string.
+
+        The current table is kept in ``self.df_raw`` (overwritten on every
+        call) and ``self.df`` is replaced by the matching rows with a fresh
+        0-based index.
+        """
+        self.df_raw = self.df
+        self.df = self.df.query(query).reset_index(drop=True)
+
+    def get_nodes_by_source(self, source):
+        # NOTE(review): as visible here this method only builds ``x_df`` (the
+        # ``x_``-prefixed columns of rows whose x_source or y_source equals
+        # ``source``) and returns nothing - TODO confirm whether the rest of
+        # the implementation is missing.
+        # extract x nodes
+        x_df = self.df.query(
+            f"x_source == '{source}' | y_source == '{source}'")[[
+                col for col in self.df.columns if col.startswith("x_")
+            ]]