From ff714afb0b49f8079d80458c2ae6f06a1877dd30 Mon Sep 17 00:00:00 2001
From: tokebe <43009413+tokebe@users.noreply.github.com>
Date: Wed, 11 Aug 2021 15:15:11 -0400
Subject: [PATCH] add plugin for AGR

---
 plugins/agr/manifest.json | 17 +++++++++
 plugins/agr/parser.py     | 72 +++++++++++++++++++++++++++++++++++++++
 plugins/agr/version.py    |  3 ++
 3 files changed, 92 insertions(+)
 create mode 100644 plugins/agr/manifest.json
 create mode 100644 plugins/agr/parser.py
 create mode 100644 plugins/agr/version.py

diff --git a/plugins/agr/manifest.json b/plugins/agr/manifest.json
new file mode 100644
index 0000000..1b0cf59
--- /dev/null
+++ b/plugins/agr/manifest.json
@@ -0,0 +1,17 @@
+{
+    "version": "0.1",
+    "__metadata__": {
+        "url": "https://www.alliancegenome.org/downloads",
+        "license_url": "https://creativecommons.org/licenses/by/4.0/",
+        "license": "CC BY 4.0"
+    },
+    "dumper": {
+        "data_url": "https://download.alliancegenome.org/4.0.0/DISEASE-ALLIANCE/COMBINED/DISEASE-ALLIANCE_COMBINED_47.tsv.gz",
+        "uncompress": false,
+        "release": "version:get_release"
+    },
+    "uploader": {
+        "parser": "parser:load_data",
+        "on_duplicates": "merge"
+    }
+}
diff --git a/plugins/agr/parser.py b/plugins/agr/parser.py
new file mode 100644
--- /dev/null
+++ b/plugins/agr/parser.py
@@ -0,0 +1,72 @@
+import re
+import os.path
+import json
+from collections import defaultdict
+from biothings.utils.common import open_anyfile
+from biothings.utils.dataload import dict_sweep
+
+SKIP_ROWS = 15  # number of rows to skip
+HEADER_ROW = 15  # zero-indexed header row
+DESIRED_OBJECT_TYPES = [
+    "gene"
+]
+
+
+def load_data(data_folder):
+    """Yield one merged document per DBObjectID from the AGR disease TSV.
+
+    Rows whose object type (third column) is not in DESIRED_OBJECT_TYPES
+    are dropped. The remaining rows are grouped by DBObjectID, and each
+    association is appended to a list keyed by its AssociationType.
+
+    :param data_folder: folder that contains the downloaded AGR dump
+    :return: generator of ``{"_id": ..., "agr": {...}}`` documents
+    """
+    agr_file = os.path.join(data_folder, "DISEASE-ALLIANCE_COMBINED_47.tsv.gz")
+    entries = defaultdict(dict)
+
+    with open_anyfile(agr_file, "r") as file:
+        for i, line in enumerate(file):
+            if i < SKIP_ROWS:
+                continue
+            if i == HEADER_ROW:
+                # convert headers to lowercase, underscore_delimited
+                header = [
+                    re.sub(r"(.)([A-Z])", r"\1_\2", colname).lower()
+                    for colname in line.rstrip('\r\n').split('\t')
+                ]
+                continue
+            if not line.strip():
+                # a blank line has no columns; indexing it would raise
+                continue
+
+            row = line.rstrip('\r\n').split('\t')
+            if row[2] not in DESIRED_OBJECT_TYPES:
+                continue
+
+            # Comments below correspond to original column names
+            gene_doc = entries[row[3]]
+            gene_doc["_id"] = row[3]  # DBObjectID
+            entry = gene_doc.setdefault("agr", {})
+            entry[header[0]] = row[0]  # Taxon
+            entry[header[1]] = row[1]  # SpeciesName
+            entry["symbol"] = row[4]  # originally DBObjectSymbol
+
+            # one list of association records per AssociationType
+            entry.setdefault(row[5], []).append(dict_sweep({
+                "doid": row[6],  # DOID
+                "term_name": row[7],  # DOtermName
+                header[8]: [
+                    ortholog for ortholog in row[8].split("|") if ortholog
+                ],  # WithOrthologs
+                "inferred_from_id": row[9],  # InferredFromID
+                header[10]: row[10],  # InferredFromSymbol
+                header[11]: row[11],  # EvidenceCode
+                header[12]: row[12],  # EvidenceCodeName
+                header[13]: row[13],  # Reference
+                header[14]: row[14],  # Date
+                header[15]: row[15]  # Source
+            }, remove_invalid_list=True))
+
+    for doc in entries.values():
+        yield dict_sweep(doc, remove_invalid_list=True)
diff --git a/plugins/agr/version.py b/plugins/agr/version.py
new file mode 100644
index 0000000..c31c40a
--- /dev/null
+++ b/plugins/agr/version.py
@@ -0,0 +1,3 @@
+def get_release(self):
+    # hard-coded due to hard-coded file download
+    return "4.0.0"