Skip to content

Commit

Permalink
adding COHD
Browse files Browse the repository at this point in the history
  • Loading branch information
EvanDietzMorris committed Nov 8, 2024
1 parent a33f245 commit 8f0e8d0
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 0 deletions.
2 changes: 2 additions & 0 deletions Common/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
CHEBI_PROPERTIES = 'CHEBIProps'
CLINICAL_TRIALS_KP = 'ClinicalTrialsKP'
CORD19 = 'Cord19'
COHD = 'COHD'
CTD = 'CTD'
DRUG_CENTRAL = 'DrugCentral'
DRUGMECHDB = 'DrugMechDB'
Expand Down Expand Up @@ -56,6 +57,7 @@
CHEBI_PROPERTIES: ("parsers.chebi.src.loadChebiProperties", "ChebiPropertiesLoader"),
CLINICAL_TRIALS_KP: ("parsers.clinicaltrials.src.loadCTKP", "CTKPLoader"),
CORD19: ("parsers.cord19.src.loadCord19", "Cord19Loader"),
COHD: ("parsers.cohd.src.loadCOHD", "COHDLoader"),
CTD: ("parsers.CTD.src.loadCTD", "CTDLoader"),
DRUG_CENTRAL: ("parsers.drugcentral.src.loaddrugcentral", "DrugCentralLoader"),
DRUGMECHDB: ("parsers.drugmechdb.src.loadDrugMechDB", "DrugMechDBLoader"),
Expand Down
9 changes: 9 additions & 0 deletions graph_specs/default-graph-spec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ graphs:
sources:
- source_id: CAM-KP

- graph_id: COHD_Automat
graph_name: COHD
graph_description:
graph_url:
conflation: False
output_format: neo4j
sources:
- source_id: COHD

- graph_id: CTD_Automat
graph_name: CTD
graph_description: 'The Comparative Toxicogenomics Database (CTD) is an open-source database that provides manually curated information about chemical-gene/protein, chemical-disease, and gene/protein-disease relationships, with additional support for the curated relationships provided by function and pathway data.'
Expand Down
76 changes: 76 additions & 0 deletions parsers/cohd/src/loadCOHD.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@

import os
import requests
import yaml

from Common.loader_interface import SourceDataLoader
from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE
from Common.utils import GetData, quick_jsonl_file_iterator


##############
# Class: COHD source loader
#
# Desc: Class that loads/parses the COHD data.
##############
class COHDLoader(SourceDataLoader):
    """
    Source data loader for COHD.

    Downloads a nodes file and an edges file (both JSON Lines) from
    ``data_url`` and passes their records to ``self.output_file_writer``.
    The source version is read from the ``build`` key of a YAML file hosted
    alongside the data files.
    """

    source_id: str = 'COHD'
    provenance_id: str = 'infores:cohd'
    parsing_version: str = '1.0'

    # Seconds to wait on the version-file request before giving up, so that an
    # unresponsive server cannot stall the version check forever.
    version_request_timeout: int = 60

    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
        """
        :param test_mode - sets the run into test mode
        :param source_data_dir - the specific storage directory to save files in
        """
        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

        # data_url must end with '/' because file names are appended to it
        # directly when the download URLs are built.
        self.data_url = 'https://stars.renci.org/var/data_services/cohd_2/'
        self.version_file = 'cohd.yaml'
        self.cohd_nodes = 'cohd_nodes.jsonl'
        self.cohd_edges = 'cohd_edges.jsonl'
        self.data_files = [self.cohd_nodes, self.cohd_edges]

    def get_latest_source_version(self) -> str:
        """
        Fetches the version file and returns the source build version.

        :return: the value of the 'build' key in the version file, as a string
        :raises requests.HTTPError: if the server answers with an error status
        :raises requests.Timeout: if the server does not answer in time
        :raises KeyError: if the version file has no 'build' key
        """
        version_file_url = f"{self.data_url}{self.version_file}"
        response = requests.get(version_file_url,
                                timeout=self.version_request_timeout)
        # raise_for_status() is a no-op for successful responses, so no
        # separate check of response.ok is needed.
        response.raise_for_status()
        # safe_load only builds plain YAML types, which is all a version file
        # needs; the content comes over the network, so the more permissive
        # loaders are avoided on purpose.
        version_yaml = yaml.safe_load(response.text)
        return str(version_yaml['build'])

    def get_data(self) -> bool:
        """
        Downloads every file listed in self.data_files into self.data_path.

        :return: True once all downloads have been requested
        """
        # One puller is enough for all files.
        data_puller = GetData()
        for data_file in self.data_files:
            source_data_url = f'{self.data_url}{data_file}'
            data_puller.pull_via_http(source_data_url, self.data_path)
        return True

    def parse_data(self) -> dict:
        """
        Parses the data files for graph nodes/edges

        Nodes are written unchanged. For each edge the "sources" list is
        removed and flattened onto the edge, one property per entry, keyed by
        the entry's "resource_role" with its "resource_id" as the value.

        :return: ret_val: load_metadata
        :raises KeyError: if an edge has no "sources" key, or a source entry
            lacks "resource_role" or "resource_id"
        """
        # Only edges are counted as source lines; nodes are not counted.
        record_counter = 0
        # NOTE(review): no record is ever filtered out below, so this stays 0.
        skipped_record_counter = 0

        nodes_file_path: str = os.path.join(self.data_path, self.cohd_nodes)
        for node_json in quick_jsonl_file_iterator(nodes_file_path):
            self.output_file_writer.write_normalized_node(node_json)

        edges_file_path: str = os.path.join(self.data_path, self.cohd_edges)
        for edge_json in quick_jsonl_file_iterator(edges_file_path):
            sources = edge_json.pop("sources")
            # NOTE(review): if two entries share a resource_role, the later
            # one overwrites the earlier - confirm that is acceptable for the
            # COHD edge schema.
            for source in sources:
                edge_json[source["resource_role"]] = source["resource_id"]
            self.output_file_writer.write_normalized_edge(edge_json)
            record_counter += 1

        # load up the metadata
        load_metadata: dict = {
            'num_source_lines': record_counter,
            'unusable_source_lines': skipped_record_counter}
        return load_metadata

0 comments on commit 8f0e8d0

Please sign in to comment.