Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updating HGNC file location and version date for new HGNC set up #263

Merged
merged 1 commit into from
Oct 24, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 20 additions & 28 deletions parsers/hgnc/src/loadHGNC.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import csv
import os
import requests

from Common.utils import GetData
from Common.loader_interface import SourceDataLoader
Expand All @@ -10,7 +11,7 @@


##############
# Class: HGNC metabolites loader
# Class: HGNC loader
#
# By: Phil Owen
# Date: 3/31/2021
Expand All @@ -21,56 +22,47 @@ class HGNCLoader(SourceDataLoader):
source_id: str = HGNC
provenance_id: str = 'infores:hgnc'
description = "The HUGO Gene Nomenclature Committee (HGNC) database provides open access to HGNC-approved unique symbols and names for human genes, gene groups, and associated resources, including links to genomic, proteomic and phenotypic information."
source_data_url = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/"
source_data_url = "https://www.genenames.org/download/archive/"
license = "https://www.genenames.org/about/"
attribution = "https://www.genenames.org/about/"
parsing_version: str = '1.2'
parsing_version: str = '1.3'

def __init__(self, test_mode: bool = False, source_data_dir: str = None):
"""
:param test_mode - sets the run into test mode
:param source_data_dir - the specific storage directory to save files in
"""
super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

self.source_db = 'HUGO Gene Nomenclature Committee'
self.complete_set_file_name = 'hgnc_complete_set.txt'
# self.gene_groups_file_name = 'hgnc_genes_in_groups.txt'
self.data_files: list = [self.complete_set_file_name,
# self.gene_groups_file_name
]
self.test_mode: bool = test_mode
self.source_db: str = 'HUGO Gene Nomenclature Committee'

self.ftp_site = 'ftp.ebi.ac.uk'
self.ftp_dir = '/pub/databases/genenames/hgnc/tsv/'
self.data_file = self.complete_set_file_name
self.data_url = "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/"

def get_latest_source_version(self) -> str:
"""
gets the version of the data

:return: the data version
"""
data_puller = GetData()
# HGNC files change very frequently, excluding the day makes sure we only update it once per month
data_file_date = data_puller.get_ftp_file_date(self.ftp_site,
self.ftp_dir,
self.data_files[0],
exclude_day=True)
return data_file_date
headers = {"Accept": "application/json"}
info_response = requests.get('https://www.genenames.org/rest/info', headers=headers)
if info_response.ok:
info_json = info_response.json()
modified_date = info_json['lastModified']
latest_version = modified_date.split('T')[0]
return latest_version
else:
info_response.raise_for_status()

def get_data(self) -> int:
"""
Gets the HGNC data from two sources.

"""
# get a reference to the data gathering class
gd: GetData = GetData(self.logger.level)
file_count: int = gd.pull_via_ftp(self.ftp_site, self.ftp_dir, [self.complete_set_file_name], self.data_path)

# get the gene groups dataset
# byte_count: int = gd.pull_via_http('https://www.genenames.org/cgi-bin/genegroup/download-all/' + self.self.gene_groups_file_name, self.data_path)

return file_count
gd: GetData = GetData()
data_file_url = self.data_url + self.data_file
gd.pull_via_http(url=data_file_url, data_dir=self.data_path)
return True

def parse_data(self) -> dict:
"""
Expand Down
Loading