From f60479102610244df24f73fca0f8ad8aad3f501d Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Thu, 24 Oct 2024 13:23:43 -0400 Subject: [PATCH] updating HGNC file location and version date for new HGNC set up --- parsers/hgnc/src/loadHGNC.py | 48 +++++++++++++++--------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/parsers/hgnc/src/loadHGNC.py b/parsers/hgnc/src/loadHGNC.py index 9eee0d56..09a0df57 100644 --- a/parsers/hgnc/src/loadHGNC.py +++ b/parsers/hgnc/src/loadHGNC.py @@ -1,6 +1,7 @@ import argparse import csv import os +import requests from Common.utils import GetData from Common.loader_interface import SourceDataLoader @@ -10,7 +11,7 @@ ############## -# Class: HGNC metabolites loader +# Class: HGNC loader # # By: Phil Owen # Date: 3/31/2021 @@ -21,10 +22,10 @@ class HGNCLoader(SourceDataLoader): source_id: str = HGNC provenance_id: str = 'infores:hgnc' description = "The HUGO Gene Nomenclature Committee (HGNC) database provides open access to HGNC-approved unique symbols and names for human genes, gene groups, and associated resources, including links to genomic, proteomic and phenotypic information." - source_data_url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/" + source_data_url = "https://www.genenames.org/download/archive/" license = "https://www.genenames.org/about/" attribution = "https://www.genenames.org/about/" - parsing_version: str = '1.2' + parsing_version: str = '1.3' def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ @@ -32,17 +33,10 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): :param source_data_dir - the specific storage directory to save files in """ super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) - + self.source_db = 'HUGO Gene Nomenclature Committee' self.complete_set_file_name = 'hgnc_complete_set.txt' - # self.gene_groups_file_name = 'hgnc_genes_in_groups.txt' - self.data_files: list = [self.complete_set_file_name, - # self.gene_groups_file_name - ] - self.test_mode: bool = test_mode - self.source_db: str = 'HUGO Gene Nomenclature Committee' - - self.ftp_site = 'ftp.ebi.ac.uk' - self.ftp_dir = '/pub/databases/genenames/hgnc/tsv/' + self.data_file = self.complete_set_file_name + self.data_url = "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/" def get_latest_source_version(self) -> str: """ @@ -50,27 +44,25 @@ def get_latest_source_version(self) -> str: :return: the data version """ - data_puller = GetData() - # HGNC files change very frequently, excluding the day makes sure we only update it once per month - data_file_date = data_puller.get_ftp_file_date(self.ftp_site, - self.ftp_dir, - self.data_files[0], - exclude_day=True) - return data_file_date + headers = {"Accept": "application/json"} + info_response = requests.get('https://www.genenames.org/rest/info', headers=headers) + if info_response.ok: + info_json = info_response.json() + modified_date = info_json['lastModified'] + latest_version = modified_date.split('T')[0] + return latest_version + else: + info_response.raise_for_status() def get_data(self) -> int: """ Gets the HGNC data from two sources. """ - # get a reference to the data gathering class - gd: GetData = GetData(self.logger.level) - file_count: int = gd.pull_via_ftp(self.ftp_site, self.ftp_dir, [self.complete_set_file_name], self.data_path) - - # get the gene groups dataset - # byte_count: int = gd.pull_via_http('https://www.genenames.org/cgi-bin/genegroup/download-all/' + self.self.gene_groups_file_name, self.data_path) - - return file_count + gd: GetData = GetData() + data_file_url = self.data_url + self.data_file + gd.pull_via_http(url=data_file_url, data_dir=self.data_path) + return True def parse_data(self) -> dict: """