From 27d8134323bf7c78ea021269e6fda5cfe3a101d5 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 29 Nov 2022 13:41:04 -0800 Subject: [PATCH 01/15] format the code --- parser.py | 190 +++++++++++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 87 deletions(-) diff --git a/parser.py b/parser.py index f357eae..301ac96 100644 --- a/parser.py +++ b/parser.py @@ -4,95 +4,111 @@ import re -def load_data(data_folder): - - def construct_rec(line): - - reg = re.compile('^[C0-9|\.]+$') # CUI format check - - # skip some error records (e.g., predication id:80264847,123980473). - if reg.match(line[8].strip()): - sub_umls = line[4].split("|") - obj_umls = line[8].split("|") - predication_id = line[0] - sub_semtype = line[6] - sub_name = line[5].split("|") - obj_semtype = line[10] - obj_name = line[9].split("|") - pmid = int(line[2]) - pred = line[3] - sub_novelty = int(line[7]) - obj_novelty = int(line[11]) - sub_id_field = "umls" - obj_id_field = "umls" - - sub_semtype_name = None - obj_semtype_name = None - - # Find UMLS mapping - if (sub_semtype in type_label): - sub_semtype_name = type_label[sub_semtype] - - if (obj_semtype in type_label): - obj_semtype_name = type_label[obj_semtype] - - # Define ID field name - if "C" not in line[4]: # one or more gene ids - sub_id_field = "ncbigene" - else: - if '|' in line[4]: - # take first CUI if it contains gene id(s) - sub_umls = [sub_umls[0]] - sub_name = [sub_name[0]] - - if "C" not in line[8]: # one or more gene ids - obj_id_field = "ncbigene" - else: - if '|' in line[8]: # take first CUI if it contains gene id(s) - obj_umls = [obj_umls[0]] - obj_name = [obj_name[0]] - - rec_dict_list = [] - id_count = 0 # loop to get all id combinations if one record has multiple ids - for sub_idx, sub_id in enumerate(sub_umls): - for obj_idx, obj_id in enumerate(obj_umls): - - id_count += 1 - if len(sub_umls) == 1 and len(obj_umls) == 1: - id_value = predication_id - else: - id_value = predication_id + "_" + 
str(id_count) # add sequence id - - rec_dict = { - "_id": id_value, - "predicate": pred, - "predication_id": predication_id, - "pmid": pmid, - "subject": { - sub_id_field: sub_id, - "name": sub_name[sub_idx], - "semantic_type_abbreviation": sub_semtype, - "semantic_type_name": sub_semtype_name, - "novelty": sub_novelty - }, - "object": { - obj_id_field: obj_id, - "name": obj_name[obj_idx], - "semantic_type_abbreviation": obj_semtype, - "semantic_type_name": obj_semtype_name, - "novelty": obj_novelty - } +def construct_rec(line, type_label): + """ + SemMedDB Database Details: https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html + + Name: PREDICATION table + Each record in this table identifies a unique predication. The data fields are as follows: + + PREDICATION_ID : Auto-generated primary key for each unique predication + SENTENCE_ID : Foreign key to the SENTENCE table + PMID : The PubMed identifier of the citation to which the predication belongs + PREDICATE : The string representation of each predicate (for example TREATS, PROCESS_OF) + SUBJECT_CUI : The CUI of the subject of the predication + SUBJECT_NAME : The preferred name of the subject of the predication + SUBJECT_SEMTYPE : The semantic type of the subject of the predication + SUBJECT_NOVELTY : The novelty of the subject of the predication + OBJECT_CUI : The CUI of the object of the predication + OBJECT_NAME : The preferred name of the object of the predication + OBJECT_SEMTYPE : The semantic type of the object of the predication + OBJECT_NOVELTY : The novelty of the object of the predication + """ + reg = re.compile('^[C0-9|\.]+$') # CUI format check + + # skip some error records (e.g., predication id:80264847,123980473). 
+ if reg.match(line[8].strip()): + predication_id = line[0] + # ignore line[1] + pmid = int(line[2]) + pred = line[3] + + sub_umls = line[4].split("|") + sub_name = line[5].split("|") + sub_semtype = line[6] + sub_novelty = int(line[7]) + + obj_umls = line[8].split("|") + obj_name = line[9].split("|") + obj_semtype = line[10] + obj_novelty = int(line[11]) + + # Find UMLS mapping + sub_semtype_name = type_label.get(sub_semtype, None) + obj_semtype_name = type_label.get(obj_semtype, None) + + sub_id_field = "umls" + obj_id_field = "umls" + + # Define ID field name + if "C" not in line[4]: # one or more gene ids + sub_id_field = "ncbigene" + else: + if '|' in line[4]: + # take first CUI if it contains gene id(s) + sub_umls = [sub_umls[0]] + sub_name = [sub_name[0]] + + if "C" not in line[8]: # one or more gene ids + obj_id_field = "ncbigene" + else: + if '|' in line[8]: # take first CUI if it contains gene id(s) + obj_umls = [obj_umls[0]] + obj_name = [obj_name[0]] + + rec_dict_list = [] + id_count = 0 # loop to get all id combinations if one record has multiple ids + for sub_idx, sub_id in enumerate(sub_umls): + for obj_idx, obj_id in enumerate(obj_umls): + + id_count += 1 + if len(sub_umls) == 1 and len(obj_umls) == 1: + id_value = predication_id + else: + id_value = predication_id + "_" + str(id_count) # add sequence id + + rec_dict = { + "_id": id_value, + "predicate": pred, + "predication_id": predication_id, + "pmid": pmid, + "subject": { + sub_id_field: sub_id, + "name": sub_name[sub_idx], + "semantic_type_abbreviation": sub_semtype, + "semantic_type_name": sub_semtype_name, + "novelty": sub_novelty + }, + "object": { + obj_id_field: obj_id, + "name": obj_name[obj_idx], + "semantic_type_abbreviation": obj_semtype, + "semantic_type_name": obj_semtype_name, + "novelty": obj_novelty } + } - # del semtype_name field if we did not any mappings - if not sub_semtype_name: - del rec_dict["subject"]["semantic_type_name"] - if not obj_semtype_name: - del 
rec_dict["object"]["semantic_type_name"] - rec_dict_list.append(rec_dict) + # del semtype_name field if we did not any mappings + if not sub_semtype_name: + del rec_dict["subject"]["semantic_type_name"] + if not obj_semtype_name: + del rec_dict["object"]["semantic_type_name"] + rec_dict_list.append(rec_dict) - return rec_dict_list + return rec_dict_list + +def load_data(data_folder): edges_path = os.path.join(data_folder, "semmed_0821.csv") mapping_path = os.path.join(data_folder, "SemanticTypes_2013AA.txt") names = pd.read_csv(mapping_path, sep="|", names=['abv', 'ID', 'label']) @@ -109,8 +125,8 @@ def construct_rec(line): for _item in csv_reader: count += 1 print("Data Generation Progess:", str(count)+"/"+str(csv_total)) - records = construct_rec(_item) - if(records): + records = construct_rec(_item, type_label) + if records: for record in records: yield record print("=====") From 468ea764aa2825f5de6501c4f26b418f8ddb3065 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 29 Nov 2022 14:06:40 -0800 Subject: [PATCH 02/15] tiny refactor to eliminate pandas dependency --- parser.py | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/parser.py b/parser.py index 301ac96..c1bc5f2 100644 --- a/parser.py +++ b/parser.py @@ -1,6 +1,5 @@ import csv import os -import pandas as pd import re @@ -109,25 +108,17 @@ def construct_rec(line, type_label): def load_data(data_folder): - edges_path = os.path.join(data_folder, "semmed_0821.csv") - mapping_path = os.path.join(data_folder, "SemanticTypes_2013AA.txt") - names = pd.read_csv(mapping_path, sep="|", names=['abv', 'ID', 'label']) - type_label = dict(zip(names.abv, names.label)) - - with open(edges_path) as f: # get total record count - next(f) - csv_total = sum(1 for line in f) - - with open(edges_path) as f: # data prep - csv_reader = csv.reader(f, delimiter=';') - next(csv_reader) - count = 0 - for _item in csv_reader: - count += 1 - print("Data Generation Progess:", 
str(count)+"/"+str(csv_total)) - records = construct_rec(_item, type_label) + semantic_type_filepath = os.path.join(data_folder, "SemanticTypes_2013AA.txt") + with open(semantic_type_filepath) as f: + semantic_type_reader = csv.DictReader(f, delimiter="|", fieldnames=['abv', 'ID', 'label']) + semantic_type_map = dict(zip((row["abv"], row["label"]) for row in semantic_type_reader)) + + semmed_path = os.path.join(data_folder, "semmed_0821.csv") + with open(semmed_path) as f: + semmed_reader = csv.reader(f, delimiter=';') + next(semmed_reader) + for _item in semmed_reader: + records = construct_rec(_item, semantic_type_map) if records: for record in records: yield record - print("=====") - print("Data Generation is Done.") From bbb64f6a7a5bb398084ec39adc6e2ab31cc93d19 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Mon, 5 Dec 2022 11:57:52 -0800 Subject: [PATCH 03/15] major refactor: rename variables and functions; simply object CUI format checking; specify csv reading options --- parser.py | 257 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 158 insertions(+), 99 deletions(-) diff --git a/parser.py b/parser.py index c1bc5f2..f1abc34 100644 --- a/parser.py +++ b/parser.py @@ -1,18 +1,43 @@ -import csv import os -import re +import pandas as pd -def construct_rec(line, type_label): +def is_valid_object_cui(object_cui: str): + """ + Currently in semmedVER43_2022_R_PREDICATION.csv there are a few rows with invalid object CUIs, as below: + + (index) PREDICATION_ID OBJECT_CUI + 7154043 80264874 1|medd + 7154067 80264901 1|medd + 35698397 109882731 235|Patients + 35700796 109885303 1524|Pain + 48339691 123980473 1|anim + 60007185 137779669 1|dsyn + 69460686 149136787 6|gngm + 80202338 160180312 1|humn + 111403674 192912334 1|neop + 114631930 196519528 1|humn + 114631934 196519532 1|humn + + This function checks the format of the input object CUI. + """ + # RE is an overkill (w.r.t. 
semmedVER43_2022_R_PREDICATION.csv); we can simply check if the CUI starts with "C" + + # cui_pattern = re.compile('^[C0-9|.]+$') + # return cui_pattern.match(object_cui.strip()) + + return object_cui.strip().startswith("C") + + +def construct_documents(row: pd.Series, semantic_type_map): """ SemMedDB Database Details: https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html Name: PREDICATION table - Each record in this table identifies a unique predication. The data fields are as follows: + Each record in this table identifies a unique predication. The data fields of our interest are as follows: PREDICATION_ID : Auto-generated primary key for each unique predication SENTENCE_ID : Foreign key to the SENTENCE table - PMID : The PubMed identifier of the citation to which the predication belongs PREDICATE : The string representation of each predicate (for example TREATS, PROCESS_OF) SUBJECT_CUI : The CUI of the subject of the predication SUBJECT_NAME : The preferred name of the subject of the predication @@ -23,102 +48,136 @@ def construct_rec(line, type_label): OBJECT_SEMTYPE : The semantic type of the object of the predication OBJECT_NOVELTY : The novelty of the object of the predication """ - reg = re.compile('^[C0-9|\.]+$') # CUI format check - - # skip some error records (e.g., predication id:80264847,123980473). 
- if reg.match(line[8].strip()): - predication_id = line[0] - # ignore line[1] - pmid = int(line[2]) - pred = line[3] - - sub_umls = line[4].split("|") - sub_name = line[5].split("|") - sub_semtype = line[6] - sub_novelty = int(line[7]) - - obj_umls = line[8].split("|") - obj_name = line[9].split("|") - obj_semtype = line[10] - obj_novelty = int(line[11]) - - # Find UMLS mapping - sub_semtype_name = type_label.get(sub_semtype, None) - obj_semtype_name = type_label.get(obj_semtype, None) - - sub_id_field = "umls" - obj_id_field = "umls" - - # Define ID field name - if "C" not in line[4]: # one or more gene ids - sub_id_field = "ncbigene" - else: - if '|' in line[4]: - # take first CUI if it contains gene id(s) - sub_umls = [sub_umls[0]] - sub_name = [sub_name[0]] - - if "C" not in line[8]: # one or more gene ids - obj_id_field = "ncbigene" - else: - if '|' in line[8]: # take first CUI if it contains gene id(s) - obj_umls = [obj_umls[0]] - obj_name = [obj_name[0]] - - rec_dict_list = [] - id_count = 0 # loop to get all id combinations if one record has multiple ids - for sub_idx, sub_id in enumerate(sub_umls): - for obj_idx, obj_id in enumerate(obj_umls): - - id_count += 1 - if len(sub_umls) == 1 and len(obj_umls) == 1: - id_value = predication_id - else: - id_value = predication_id + "_" + str(id_count) # add sequence id - - rec_dict = { - "_id": id_value, - "predicate": pred, - "predication_id": predication_id, - "pmid": pmid, - "subject": { - sub_id_field: sub_id, - "name": sub_name[sub_idx], - "semantic_type_abbreviation": sub_semtype, - "semantic_type_name": sub_semtype_name, - "novelty": sub_novelty - }, - "object": { - obj_id_field: obj_id, - "name": obj_name[obj_idx], - "semantic_type_abbreviation": obj_semtype, - "semantic_type_name": obj_semtype_name, - "novelty": obj_novelty - } + if not is_valid_object_cui(row["OBJECT_CUI"]): + return + + predication_id = row["PREDICATION_ID"] + pmid = row["PMID"] + predicate = row["PREDICATE"] + + sub_cui = 
row["SUBJECT_CUI"].split("|") + sub_name = row["SUBJECT_NAME"].split("|") + sub_semantic_type_abv = row["SUBJECT_SEMTYPE"] + sub_semantic_type_name = semantic_type_map.get(sub_semantic_type_abv, None) + sub_novelty = row["SUBJECT_NOVELTY"] + + obj_cui = row["OBJECT_CUI"].split("|") + obj_name = row["OBJECT_NAME"].split("|") + obj_semantic_type_abv = row["OBJECT_SEMTYPE"] + obj_semantic_type_name = semantic_type_map.get(obj_semantic_type_abv, None) + obj_novelty = row["OBJECT_NOVELTY"] + + sub_id_field = "umls" + obj_id_field = "umls" + + # Define ID field name + if "C" not in row["SUBJECT_CUI"]: # one or more gene ids + sub_id_field = "ncbigene" + else: + if '|' in row["SUBJECT_CUI"]: + # take first CUI if it contains gene id(s) + sub_cui = [sub_cui[0]] + sub_name = [sub_name[0]] + + if "C" not in row["OBJECT_CUI"]: # one or more gene ids + obj_id_field = "ncbigene" + else: + if '|' in row["OBJECT_CUI"]: # take first CUI if it contains gene id(s) + obj_cui = [obj_cui[0]] + obj_name = [obj_name[0]] + + id_count = 0 # loop to get all id combinations if one record has multiple ids + for sub_idx, sub_id in enumerate(sub_cui): + for obj_idx, obj_id in enumerate(obj_cui): + + id_count += 1 + if len(sub_cui) == 1 and len(obj_cui) == 1: + _id = predication_id + else: + _id = predication_id + "_" + str(id_count) # add sequence id + + doc = { + "_id": _id, + "predicate": predicate, + "predication_id": predication_id, + "pmid": pmid, + "subject": { + sub_id_field: sub_id, + "name": sub_name[sub_idx], + "semantic_type_abbreviation": sub_semantic_type_abv, + "semantic_type_name": sub_semantic_type_name, + "novelty": sub_novelty + }, + "object": { + obj_id_field: obj_id, + "name": obj_name[obj_idx], + "semantic_type_abbreviation": obj_semantic_type_abv, + "semantic_type_name": obj_semantic_type_name, + "novelty": obj_novelty } + } + + # del semtype_name field if we did not any mappings + if not sub_semantic_type_name: + del doc["subject"]["semantic_type_name"] + if not 
obj_semantic_type_name: + del doc["object"]["semantic_type_name"] + yield doc + + +def read_semantic_type_data_frame(data_folder, filename) -> pd.DataFrame: + filepath = os.path.join(data_folder, filename) + column_info = [ + (0, 'abv', str), + # (1, 'ID', str), + (2, 'label', str) + ] + column_indices = [e[0] for e in column_info] + column_names = [e[1] for e in column_info] + column_dtypes = {e[1]: e[2] for e in column_info} + data_frame = pd.read_csv(filepath, sep="|", names=column_names, usecols=column_indices, dtype=column_dtypes) + return data_frame + + +def read_semmed_data_frame(data_folder, filename) -> pd.DataFrame: + filepath = os.path.join(data_folder, filename) + encoding = "latin1" # TODO encode in UTF-8 before outputting + na_value = r"\N" + column_info = [ + # Each element is a tuple of (column_index, column_name, data_type) + # See column description at https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html + # "Int8" is a nullable integer type (while `int` cannot handle NA values), range [-128, 127] + # "UInt32" ranges [0, 4294967295] + # See https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes + (0, "PREDICATION_ID", str), # column 0 (Auto-generated primary key; read as strings for easier concatenation) + # (1, "SENTENCE_ID", str), # column 1 (ignored) + (2, "PMID", "UInt32"), # column 2 (PubMed IDs are 8-digit numbers) + (3, "PREDICATE", str), # column 3 + (4, "SUBJECT_CUI", str), # column 4 + (5, "SUBJECT_NAME", str), # column 5 + (6, "SUBJECT_SEMTYPE", str), # column 6 + (7, "SUBJECT_NOVELTY", "Int8"), # column 7 + (8, "OBJECT_CUI", str), # column 8 + (9, "OBJECT_NAME", str), # column 9 + (10, "OBJECT_SEMTYPE", str), # column 10 + (11, "OBJECT_NOVELTY", "Int8") # column 11 + # (12, "FACT_VALUE", "Int8"), # column 12 (ignored) + # (13, "MOD_SCALE", "Int8"), # column 13 (ignored) + # (14, "MOD_VALUE", "Int8"), # column 14 (ignored) + ] + column_indices = [e[0] for e in column_info] + column_names = [e[1] for e in 
column_info] + column_dtypes = {e[1]: e[2] for e in column_info} + data_frame = pd.read_csv(filepath, names=column_names, sep=",", usecols=column_indices, + dtype=column_dtypes, na_values=[na_value], encoding=encoding) + return data_frame - # del semtype_name field if we did not any mappings - if not sub_semtype_name: - del rec_dict["subject"]["semantic_type_name"] - if not obj_semtype_name: - del rec_dict["object"]["semantic_type_name"] - rec_dict_list.append(rec_dict) - return rec_dict_list +def load_data(data_folder): + semantic_type_df = read_semantic_type_data_frame(data_folder, "SemanticTypes_2013AA.txt") + semantic_type_map = dict(zip(semantic_type_df["abv"], semantic_type_df["label"])) + semmed_df = read_semmed_data_frame(data_folder, "semmedVER43_2022_R_PREDICATION.csv") + for _, row in semmed_df.iterrows(): + yield from construct_documents(row, semantic_type_map) -def load_data(data_folder): - semantic_type_filepath = os.path.join(data_folder, "SemanticTypes_2013AA.txt") - with open(semantic_type_filepath) as f: - semantic_type_reader = csv.DictReader(f, delimiter="|", fieldnames=['abv', 'ID', 'label']) - semantic_type_map = dict(zip((row["abv"], row["label"]) for row in semantic_type_reader)) - - semmed_path = os.path.join(data_folder, "semmed_0821.csv") - with open(semmed_path) as f: - semmed_reader = csv.reader(f, delimiter=';') - next(semmed_reader) - for _item in semmed_reader: - records = construct_rec(_item, semantic_type_map) - if records: - for record in records: - yield record From 2531bb211a4eaf4a32dcd16d55fe17752aadfb77 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Mon, 5 Dec 2022 16:17:09 -0800 Subject: [PATCH 04/15] resort to RE filter again with a simplified pattern; use pd.Series-wise matching operation to accelarate --- parser.py | 72 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/parser.py b/parser.py index f1abc34..0d6b9fe 100644 --- a/parser.py +++ b/parser.py @@ 
-2,33 +2,6 @@ import pandas as pd -def is_valid_object_cui(object_cui: str): - """ - Currently in semmedVER43_2022_R_PREDICATION.csv there are a few rows with invalid object CUIs, as below: - - (index) PREDICATION_ID OBJECT_CUI - 7154043 80264874 1|medd - 7154067 80264901 1|medd - 35698397 109882731 235|Patients - 35700796 109885303 1524|Pain - 48339691 123980473 1|anim - 60007185 137779669 1|dsyn - 69460686 149136787 6|gngm - 80202338 160180312 1|humn - 111403674 192912334 1|neop - 114631930 196519528 1|humn - 114631934 196519532 1|humn - - This function checks the format of the input object CUI. - """ - # RE is an overkill (w.r.t. semmedVER43_2022_R_PREDICATION.csv); we can simply check if the CUI starts with "C" - - # cui_pattern = re.compile('^[C0-9|.]+$') - # return cui_pattern.match(object_cui.strip()) - - return object_cui.strip().startswith("C") - - def construct_documents(row: pd.Series, semantic_type_map): """ SemMedDB Database Details: https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html @@ -48,9 +21,6 @@ def construct_documents(row: pd.Series, semantic_type_map): OBJECT_SEMTYPE : The semantic type of the object of the predication OBJECT_NOVELTY : The novelty of the object of the predication """ - if not is_valid_object_cui(row["OBJECT_CUI"]): - return - predication_id = row["PREDICATION_ID"] pmid = row["PMID"] predicate = row["PREDICATE"] @@ -173,11 +143,53 @@ def read_semmed_data_frame(data_folder, filename) -> pd.DataFrame: return data_frame +def clean_semmed_data_frame(data_frame: pd.DataFrame): + """ + This function exclude rows with "invalid" object CUIs in the Semmed data frame. + + A "valid" object CUI present in "semmedVER43_2022_R_PREDICATION.csv" can be either: + 1. A true CUI (starting with "C", followed by seven numbers, like "C0003725") + 2. A NCBI gene ID (a numerical string, like "1756") + 3. A piped string of a true CUI and multiple NCBI gene IDs (like "C1414968|2597") + 4. 
A piped string of multiple NCBI gene IDs (like "4914|7170") + + Currently in "semmedVER43_2022_R_PREDICATION.csv" there are a few rows with invalid object CUIs, as below: + + (index) PREDICATION_ID OBJECT_CUI + 7154043 80264874 1|medd + 7154067 80264901 1|medd + 35698397 109882731 235|Patients + 35700796 109885303 1524|Pain + 48339691 123980473 1|anim + 60007185 137779669 1|dsyn + 69460686 149136787 6|gngm + 80202338 160180312 1|humn + 111403674 192912334 1|neop + 114631930 196519528 1|humn + 114631934 196519532 1|humn + + Subject CUIs are all valid in "semmedVER43_2022_R_PREDICATION.csv" + """ + + """ + Below is the previous row-wise filter. Issues: + 1. Object CUIs don't contain spaces; `strip()` operation unnecessary + 2. Object CUIs don't contain dots; RE pattern can be simplified + 3. Row-wise RE matching is slow; `pd.Series.str.match()` is much faster + """ + # cui_pattern = re.compile(r'^[C0-9|.]+$') # multiple occurrences of "C", "0" to "9", "|" (vertical bar), or "." (dot) + # return cui_pattern.match(object_cui.strip()) + + cui_pattern = r"^[C0-9|]+$" # multiple occurrences of "C", "0" to "9", or "|" (vertical bar) + return data_frame.loc[data_frame["OBJECT_CUI"].str.match(cui_pattern)] + + def load_data(data_folder): semantic_type_df = read_semantic_type_data_frame(data_folder, "SemanticTypes_2013AA.txt") semantic_type_map = dict(zip(semantic_type_df["abv"], semantic_type_df["label"])) semmed_df = read_semmed_data_frame(data_folder, "semmedVER43_2022_R_PREDICATION.csv") + semmed_df = clean_semmed_data_frame(semmed_df) for _, row in semmed_df.iterrows(): yield from construct_documents(row, semantic_type_map) From 377ddaf371ac2b4fbc86bfa68a0da1960067ba40 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 6 Dec 2022 11:01:41 -0800 Subject: [PATCH 05/15] tiny refactor: rename variables and simplify if conditions --- parser.py | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/parser.py 
b/parser.py index 0d6b9fe..4f7d336 100644 --- a/parser.py +++ b/parser.py @@ -25,43 +25,40 @@ def construct_documents(row: pd.Series, semantic_type_map): pmid = row["PMID"] predicate = row["PREDICATE"] - sub_cui = row["SUBJECT_CUI"].split("|") - sub_name = row["SUBJECT_NAME"].split("|") + sub_cui_list = row["SUBJECT_CUI"].split("|") + sub_name_list = row["SUBJECT_NAME"].split("|") sub_semantic_type_abv = row["SUBJECT_SEMTYPE"] sub_semantic_type_name = semantic_type_map.get(sub_semantic_type_abv, None) sub_novelty = row["SUBJECT_NOVELTY"] - obj_cui = row["OBJECT_CUI"].split("|") - obj_name = row["OBJECT_NAME"].split("|") + obj_cui_list = row["OBJECT_CUI"].split("|") + obj_name_list = row["OBJECT_NAME"].split("|") obj_semantic_type_abv = row["OBJECT_SEMTYPE"] obj_semantic_type_name = semantic_type_map.get(obj_semantic_type_abv, None) obj_novelty = row["OBJECT_NOVELTY"] - sub_id_field = "umls" - obj_id_field = "umls" + # if "C" not present, the CUI field must be one or more gene ids + sub_id_field = "umls" if "C" in row["SUBJECT_CUI"] else "ncbigene" + obj_id_field = "umls" if "C" in row["OBJECT_CUI"] else "ncbigene" - # Define ID field name - if "C" not in row["SUBJECT_CUI"]: # one or more gene ids - sub_id_field = "ncbigene" - else: - if '|' in row["SUBJECT_CUI"]: + if sub_id_field == "umls": + if '|' in row["SUBJECT_CUI"]: # equivalent to `if len(sub_cui_list) > 1` # take first CUI if it contains gene id(s) - sub_cui = [sub_cui[0]] - sub_name = [sub_name[0]] + sub_cui_list = [sub_cui_list[0]] + sub_name_list = [sub_name_list[0]] - if "C" not in row["OBJECT_CUI"]: # one or more gene ids - obj_id_field = "ncbigene" - else: - if '|' in row["OBJECT_CUI"]: # take first CUI if it contains gene id(s) - obj_cui = [obj_cui[0]] - obj_name = [obj_name[0]] + if obj_id_field == "umls": + if '|' in row["OBJECT_CUI"]: # equivalent to `if len(obj_cui_list) > 1` + # take first CUI if it contains gene id(s) + obj_cui_list = [obj_cui_list[0]] + obj_name_list = [obj_name_list[0]] 
id_count = 0 # loop to get all id combinations if one record has multiple ids - for sub_idx, sub_id in enumerate(sub_cui): - for obj_idx, obj_id in enumerate(obj_cui): + for sub_idx, sub_cui in enumerate(sub_cui_list): + for obj_idx, obj_cui in enumerate(obj_cui_list): id_count += 1 - if len(sub_cui) == 1 and len(obj_cui) == 1: + if len(sub_cui_list) == 1 and len(obj_cui_list) == 1: _id = predication_id else: _id = predication_id + "_" + str(id_count) # add sequence id @@ -72,15 +69,15 @@ def construct_documents(row: pd.Series, semantic_type_map): "predication_id": predication_id, "pmid": pmid, "subject": { - sub_id_field: sub_id, - "name": sub_name[sub_idx], + sub_id_field: sub_cui, + "name": sub_name_list[sub_idx], "semantic_type_abbreviation": sub_semantic_type_abv, "semantic_type_name": sub_semantic_type_name, "novelty": sub_novelty }, "object": { - obj_id_field: obj_id, - "name": obj_name[obj_idx], + obj_id_field: obj_cui, + "name": obj_name_list[obj_idx], "semantic_type_abbreviation": obj_semantic_type_abv, "semantic_type_name": obj_semantic_type_name, "novelty": obj_novelty @@ -92,6 +89,7 @@ def construct_documents(row: pd.Series, semantic_type_map): del doc["subject"]["semantic_type_name"] if not obj_semantic_type_name: del doc["object"]["semantic_type_name"] + yield doc From ec678ee7eb2f1917cb6450447e5af1eae82ba179 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 6 Dec 2022 11:07:50 -0800 Subject: [PATCH 06/15] restructure the parser; add functions about MRCUI data --- parser.py | 221 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 90 deletions(-) diff --git a/parser.py b/parser.py index 4f7d336..a6766fd 100644 --- a/parser.py +++ b/parser.py @@ -2,6 +2,137 @@ import pandas as pd +################################### +# PART 1: Load Semantic Type Data # +################################### + +def read_semantic_type_data_frame(data_folder, filename) -> pd.DataFrame: + filepath = os.path.join(data_folder, filename) + 
column_info = [ + (0, 'abv', str), + # (1, 'ID', str), + (2, 'label', str) + ] + column_indices = [e[0] for e in column_info] + column_names = [e[1] for e in column_info] + column_dtypes = {e[1]: e[2] for e in column_info} + data_frame = pd.read_csv(filepath, sep="|", names=column_names, usecols=column_indices, dtype=column_dtypes) + return data_frame + + +################################# +# PART 2: Load Retired CUI Data # +################################# + +def read_mrcui_data_frame(data_folder, filename): + filepath = os.path.join(data_folder, filename) + column_info = [ + # Each element is a tuple of (column_index, column_name, data_type) + # See column description at https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.retired_cui_mapping_file_mrcui_rr/ + (0, "CUI1", str), # column 0 + # (1, "VER", str), # column 1 (ignored) + (2, "REL", str), # column 2 + # (3, "RELA", str), # column 3 (ignored) + # (4, "MAPREASON", str), # column 4 (ignored) + (5, "CUI2", str), # column 5 + # (6, "MAPIN", str) # column 6 (ignored) + ] + column_indices = [e[0] for e in column_info] + column_names = [e[1] for e in column_info] + column_dtypes = {e[1]: e[2] for e in column_info} + data_frame = pd.read_csv(filepath, sep="|", names=column_names, usecols=column_indices, dtype=column_dtypes) + return data_frame + + +def get_deleted_cuis(mrcui_data_frame: pd.DataFrame) -> set: + deleted_flags = mrcui_data_frame["REL"] == "DEL" + deleted_cuis = mrcui_data_frame.loc[deleted_flags, "CUI1"] + return set(deleted_cuis) + + +############################ +# PART 3: Load SemMed Data # +############################ + +def read_semmed_data_frame(data_folder, filename) -> pd.DataFrame: + filepath = os.path.join(data_folder, filename) + encoding = "latin1" # TODO encode in UTF-8 before outputting + na_value = r"\N" + column_info = [ + # Each element is a tuple of (column_index, column_name, data_type) + # See column description at 
https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html + # "Int8" is a nullable integer type (while `int` cannot handle NA values), range [-128, 127] + # "UInt32" ranges [0, 4294967295] + # See https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes + (0, "PREDICATION_ID", str), # column 0 (Auto-generated primary key; read as strings for easier concatenation) + # (1, "SENTENCE_ID", str), # column 1 (ignored) + (2, "PMID", "UInt32"), # column 2 (PubMed IDs are 8-digit numbers) + (3, "PREDICATE", str), # column 3 + (4, "SUBJECT_CUI", str), # column 4 + (5, "SUBJECT_NAME", str), # column 5 + (6, "SUBJECT_SEMTYPE", str), # column 6 + (7, "SUBJECT_NOVELTY", "Int8"), # column 7 + (8, "OBJECT_CUI", str), # column 8 + (9, "OBJECT_NAME", str), # column 9 + (10, "OBJECT_SEMTYPE", str), # column 10 + (11, "OBJECT_NOVELTY", "Int8") # column 11 + # (12, "FACT_VALUE", "Int8"), # column 12 (ignored) + # (13, "MOD_SCALE", "Int8"), # column 13 (ignored) + # (14, "MOD_VALUE", "Int8"), # column 14 (ignored) + ] + column_indices = [e[0] for e in column_info] + column_names = [e[1] for e in column_info] + column_dtypes = {e[1]: e[2] for e in column_info} + data_frame = pd.read_csv(filepath, names=column_names, sep=",", usecols=column_indices, + dtype=column_dtypes, na_values=[na_value], encoding=encoding) + return data_frame + + +def clean_semmed_data_frame(data_frame: pd.DataFrame): + """ + This function exclude rows with "invalid" object CUIs in the Semmed data frame. + + A "valid" object CUI present in "semmedVER43_2022_R_PREDICATION.csv" can be either: + 1. A true CUI (starting with "C", followed by seven numbers, like "C0003725") + 2. A NCBI gene ID (a numerical string, like "1756") + 3. A piped string of a true CUI and multiple NCBI gene IDs (like "C1414968|2597") + 4. 
A piped string of multiple NCBI gene IDs (like "4914|7170") + + Currently in "semmedVER43_2022_R_PREDICATION.csv" there are a few rows with invalid object CUIs, as below: + + (index) PREDICATION_ID OBJECT_CUI + 7154043 80264874 1|medd + 7154067 80264901 1|medd + 35698397 109882731 235|Patients + 35700796 109885303 1524|Pain + 48339691 123980473 1|anim + 60007185 137779669 1|dsyn + 69460686 149136787 6|gngm + 80202338 160180312 1|humn + 111403674 192912334 1|neop + 114631930 196519528 1|humn + 114631934 196519532 1|humn + + Subject CUIs are all valid in "semmedVER43_2022_R_PREDICATION.csv" + """ + + """ + Below is the previous row-wise filter. Issues: + 1. Object CUIs don't contain spaces; `strip()` operation unnecessary + 2. Object CUIs don't contain dots; RE pattern can be simplified + 3. Row-wise RE matching is slow; `pd.Series.str.match()` is much faster + """ + # cui_pattern = re.compile(r'^[C0-9|.]+$') # multiple occurrences of "C", "0" to "9", "|" (vertical bar), or "." (dot) + # return cui_pattern.match(object_cui.strip()) + + cui_pattern = r"^[C0-9|]+$" # multiple occurrences of "C", "0" to "9", or "|" (vertical bar) + return data_frame.loc[data_frame["OBJECT_CUI"].str.match(cui_pattern)] + + +################## +# PART 4: Parser # +################## + def construct_documents(row: pd.Series, semantic_type_map): """ SemMedDB Database Details: https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html @@ -93,95 +224,6 @@ def construct_documents(row: pd.Series, semantic_type_map): yield doc -def read_semantic_type_data_frame(data_folder, filename) -> pd.DataFrame: - filepath = os.path.join(data_folder, filename) - column_info = [ - (0, 'abv', str), - # (1, 'ID', str), - (2, 'label', str) - ] - column_indices = [e[0] for e in column_info] - column_names = [e[1] for e in column_info] - column_dtypes = {e[1]: e[2] for e in column_info} - data_frame = pd.read_csv(filepath, sep="|", names=column_names, usecols=column_indices, dtype=column_dtypes) - return 
data_frame - - -def read_semmed_data_frame(data_folder, filename) -> pd.DataFrame: - filepath = os.path.join(data_folder, filename) - encoding = "latin1" # TODO encode in UTF-8 before outputting - na_value = r"\N" - column_info = [ - # Each element is a tuple of (column_index, column_name, data_type) - # See column description at https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html - # "Int8" is a nullable integer type (while `int` cannot handle NA values), range [-128, 127] - # "UInt32" ranges [0, 4294967295] - # See https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes - (0, "PREDICATION_ID", str), # column 0 (Auto-generated primary key; read as strings for easier concatenation) - # (1, "SENTENCE_ID", str), # column 1 (ignored) - (2, "PMID", "UInt32"), # column 2 (PubMed IDs are 8-digit numbers) - (3, "PREDICATE", str), # column 3 - (4, "SUBJECT_CUI", str), # column 4 - (5, "SUBJECT_NAME", str), # column 5 - (6, "SUBJECT_SEMTYPE", str), # column 6 - (7, "SUBJECT_NOVELTY", "Int8"), # column 7 - (8, "OBJECT_CUI", str), # column 8 - (9, "OBJECT_NAME", str), # column 9 - (10, "OBJECT_SEMTYPE", str), # column 10 - (11, "OBJECT_NOVELTY", "Int8") # column 11 - # (12, "FACT_VALUE", "Int8"), # column 12 (ignored) - # (13, "MOD_SCALE", "Int8"), # column 13 (ignored) - # (14, "MOD_VALUE", "Int8"), # column 14 (ignored) - ] - column_indices = [e[0] for e in column_info] - column_names = [e[1] for e in column_info] - column_dtypes = {e[1]: e[2] for e in column_info} - data_frame = pd.read_csv(filepath, names=column_names, sep=",", usecols=column_indices, - dtype=column_dtypes, na_values=[na_value], encoding=encoding) - return data_frame - - -def clean_semmed_data_frame(data_frame: pd.DataFrame): - """ - This function exclude rows with "invalid" object CUIs in the Semmed data frame. - - A "valid" object CUI present in "semmedVER43_2022_R_PREDICATION.csv" can be either: - 1. 
A true CUI (starting with "C", followed by seven numbers, like "C0003725") - 2. A NCBI gene ID (a numerical string, like "1756") - 3. A piped string of a true CUI and multiple NCBI gene IDs (like "C1414968|2597") - 4. A piped string of multiple NCBI gene IDs (like "4914|7170") - - Currently in "semmedVER43_2022_R_PREDICATION.csv" there are a few rows with invalid object CUIs, as below: - - (index) PREDICATION_ID OBJECT_CUI - 7154043 80264874 1|medd - 7154067 80264901 1|medd - 35698397 109882731 235|Patients - 35700796 109885303 1524|Pain - 48339691 123980473 1|anim - 60007185 137779669 1|dsyn - 69460686 149136787 6|gngm - 80202338 160180312 1|humn - 111403674 192912334 1|neop - 114631930 196519528 1|humn - 114631934 196519532 1|humn - - Subject CUIs are all valid in "semmedVER43_2022_R_PREDICATION.csv" - """ - - """ - Below is the previous row-wise filter. Issues: - 1. Object CUIs don't contain spaces; `strip()` operation unnecessary - 2. Object CUIs don't contain dots; RE pattern can be simplified - 3. Row-wise RE matching is slow; `pd.Series.str.match()` is much faster - """ - # cui_pattern = re.compile(r'^[C0-9|.]+$') # multiple occurrences of "C", "0" to "9", "|" (vertical bar), or "." 
(dot) - # return cui_pattern.match(object_cui.strip()) - - cui_pattern = r"^[C0-9|]+$" # multiple occurrences of "C", "0" to "9", or "|" (vertical bar) - return data_frame.loc[data_frame["OBJECT_CUI"].str.match(cui_pattern)] - - def load_data(data_folder): semantic_type_df = read_semantic_type_data_frame(data_folder, "SemanticTypes_2013AA.txt") semantic_type_map = dict(zip(semantic_type_df["abv"], semantic_type_df["label"])) @@ -190,4 +232,3 @@ def load_data(data_folder): semmed_df = clean_semmed_data_frame(semmed_df) for _, row in semmed_df.iterrows(): yield from construct_documents(row, semantic_type_map) - From 20669ddc699dd0955f00bdab01e43a5143b01c2e Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 6 Dec 2022 12:01:19 -0800 Subject: [PATCH 07/15] add filter on novelty scores for SemMed data --- parser.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/parser.py b/parser.py index a6766fd..ad4139f 100644 --- a/parser.py +++ b/parser.py @@ -71,11 +71,11 @@ def read_semmed_data_frame(data_folder, filename) -> pd.DataFrame: (4, "SUBJECT_CUI", str), # column 4 (5, "SUBJECT_NAME", str), # column 5 (6, "SUBJECT_SEMTYPE", str), # column 6 - (7, "SUBJECT_NOVELTY", "Int8"), # column 7 + (7, "SUBJECT_NOVELTY", "Int8"), # column 7 (Currently either 0 or 1) (8, "OBJECT_CUI", str), # column 8 (9, "OBJECT_NAME", str), # column 9 (10, "OBJECT_SEMTYPE", str), # column 10 - (11, "OBJECT_NOVELTY", "Int8") # column 11 + (11, "OBJECT_NOVELTY", "Int8") # column 11 (Currently either 0 or 1) # (12, "FACT_VALUE", "Int8"), # column 12 (ignored) # (13, "MOD_SCALE", "Int8"), # column 13 (ignored) # (14, "MOD_VALUE", "Int8"), # column 14 (ignored) @@ -88,7 +88,7 @@ def read_semmed_data_frame(data_folder, filename) -> pd.DataFrame: return data_frame -def clean_semmed_data_frame(data_frame: pd.DataFrame): +def remove_invalid_object_cuis(data_frame: pd.DataFrame): """ This function exclude rows with "invalid" object CUIs in the Semmed data frame. 
@@ -129,6 +129,15 @@ def clean_semmed_data_frame(data_frame: pd.DataFrame): return data_frame.loc[data_frame["OBJECT_CUI"].str.match(cui_pattern)] +def remove_zero_novelty(data_frame: pd.DataFrame): + """ + Records with novelty score equal to 0 should be removed. + See discussion in https://github.com/biothings/pending.api/issues/63#issuecomment-1100469563 + """ + flags = (data_frame["SUBJECT_NOVELTY"] != 0) & (data_frame["OBJECT_NOVELTY"] != 0) + return data_frame.loc[flags] + + ################## # PART 4: Parser # ################## @@ -229,6 +238,7 @@ def load_data(data_folder): semantic_type_map = dict(zip(semantic_type_df["abv"], semantic_type_df["label"])) semmed_df = read_semmed_data_frame(data_folder, "semmedVER43_2022_R_PREDICATION.csv") - semmed_df = clean_semmed_data_frame(semmed_df) + semmed_df = remove_invalid_object_cuis(semmed_df) + semmed_df = remove_zero_novelty(semmed_df) for _, row in semmed_df.iterrows(): yield from construct_documents(row, semantic_type_map) From f86216b2eb528efcfc263fe1bf082be7884e213d Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Thu, 29 Dec 2022 22:06:27 -0800 Subject: [PATCH 08/15] major refactor: preprocess all predications in Pandas --- parser.py | 550 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 434 insertions(+), 116 deletions(-) diff --git a/parser.py b/parser.py index ad4139f..c06381a 100644 --- a/parser.py +++ b/parser.py @@ -1,25 +1,42 @@ import os +from typing import Dict, Set +import requests import pandas as pd +import numpy as np ################################### # PART 1: Load Semantic Type Data # ################################### -def read_semantic_type_data_frame(data_folder, filename) -> pd.DataFrame: +def read_semantic_type_mappings_data_frame(data_folder, filename) -> pd.DataFrame: filepath = os.path.join(data_folder, filename) column_info = [ - (0, 'abv', str), - # (1, 'ID', str), - (2, 'label', str) + # See column description at 
https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/documentation/SemanticTypesAndGroups.html
+        (0, 'abbreviation', "string"),
+        # (1, 'TUI', "string"),
+        (2, 'fullname', "string")
     ]
     column_indices = [e[0] for e in column_info]
     column_names = [e[1] for e in column_info]
     column_dtypes = {e[1]: e[2] for e in column_info}
     data_frame = pd.read_csv(filepath, sep="|", names=column_names, usecols=column_indices, dtype=column_dtypes)
+
+    data_frame = data_frame.astype({
+        "abbreviation": "string[pyarrow]",
+        "fullname": "string[pyarrow]"
+    })
+
     return data_frame


+def get_semtype_name_map(semantic_type_data_frame: pd.DataFrame) -> Dict:
+    """
+    Get a map of Semantic Types
+    """
+    return dict(zip(semantic_type_data_frame["abbreviation"], semantic_type_data_frame["fullname"]))
+
+
 #################################
 # PART 2: Load Retired CUI Data #
 #################################
@@ -29,34 +46,105 @@ def read_mrcui_data_frame(data_folder, filename):
     column_info = [
         # Each element is a tuple of (column_index, column_name, data_type)
         # See column description at https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.retired_cui_mapping_file_mrcui_rr/
-        (0, "CUI1", str),  # column 0
-        # (1, "VER", str),  # column 1 (ignored)
-        (2, "REL", str),  # column 2
-        # (3, "RELA", str),  # column 3 (ignored)
-        # (4, "MAPREASON", str),  # column 4 (ignored)
-        (5, "CUI2", str),  # column 5
-        # (6, "MAPIN", str)  # column 6 (ignored)
+        (0, "CUI1", "string"),  # column 0
+        # (1, "VER", "string"),  # column 1 (ignored)
+        (2, "REL", "category"),  # column 2
+        # (3, "RELA", "string"),  # column 3 (ignored)
+        # (4, "MAPREASON", "string"),  # column 4 (ignored)
+        (5, "CUI2", "string"),  # column 5
+        # (6, "MAPIN", "string")  # column 6 (ignored). 
We confirmed that CUI1 and CUI2 columns have no CUIs in common
     ]
     column_indices = [e[0] for e in column_info]
     column_names = [e[1] for e in column_info]
     column_dtypes = {e[1]: e[2] for e in column_info}
     data_frame = pd.read_csv(filepath, sep="|", names=column_names, usecols=column_indices, dtype=column_dtypes)
+
+    data_frame = data_frame.astype({
+        "CUI1": "string[pyarrow]",
+        "CUI2": "string[pyarrow]"
+    })
+
     return data_frame


-def get_deleted_cuis(mrcui_data_frame: pd.DataFrame) -> set:
-    deleted_flags = mrcui_data_frame["REL"] == "DEL"
-    deleted_cuis = mrcui_data_frame.loc[deleted_flags, "CUI1"]
+def get_retired_cuis_for_deletion(mrcui_data_frame: pd.DataFrame) -> Set:
+    deletion_flags = (mrcui_data_frame["REL"] == "DEL")
+    deleted_cuis = mrcui_data_frame.loc[deletion_flags, "CUI1"].unique()
     return set(deleted_cuis)


+def get_retirement_mapping_data_frame(mrcui_data_frame: pd.DataFrame) -> pd.DataFrame:
+    # Exclude rows whose "CUI2" is empty
+    mapping_data_frame = mrcui_data_frame.loc[~mrcui_data_frame["CUI2"].isnull(), ["CUI1", "CUI2"]]
+    return mapping_data_frame
+
+
+def add_cui_name_and_semtype_to_retirement_mapping(retirement_mapping_data_frame, semmed_cui_name_semtype_data_frame, umls_cui_name_semtype_data_frame):
+    """
+    Given a replacement CUI (for a retired CUI), its name and semtype should be looked up in the SemMedDB data frame at first.
+    If not present, look up in the external "UMLS_CUI_Semtype.tsv" file.
+
+    This function will match the CUI names and semtypes to the replacement CUIs (as in the "CUI2" column) of the "retirement_mapping_data_frame". CUI names and
+    semtypes from SemMedDB will be preferred for matching. 
+ """ + + new_cuis = set(retirement_mapping_data_frame["CUI2"].unique()) + semmed_cui_info = semmed_cui_name_semtype_data_frame.loc[semmed_cui_name_semtype_data_frame["CUI"].isin(new_cuis)] + umls_cui_info = umls_cui_name_semtype_data_frame.loc[umls_cui_name_semtype_data_frame["CUI"].isin(new_cuis)] + preferred_cui_info = pd.concat([semmed_cui_info, umls_cui_info], ignore_index=True, copy=False) + # because SemMed values are put above UMLS values in "preferred_cui_info", so keep="first" will preserve the SemMed values if duplicates are found + preferred_cui_info.drop_duplicates(subset=["CUI", "SEMTYPE"], keep="first", inplace=True) + + """ + Why left join here? Because "retirement_mapping_df" ("MRCUI.RRF") may contain a replacement CUI having no preferred English name and thus not + included in "umls_cui_name_semtype_data_frame" ("UMLS_CUI_Semtype.tsv") + E.g. C4082455 is replaced by C4300557, according to "MRCUI.RRF". However C4300557 has only one preferred name in French, "Non-disjonction mitotique" + Left join would cause NaN values (which means a failed replacement) for C4082455. Such retired CUIs will be deleted directly. Therefore, for now we should + keep C4082455 in the result. 
+ """ + retirement_mapping_data_frame = retirement_mapping_data_frame.merge(preferred_cui_info, how="left", left_on="CUI2", right_on="CUI") + retirement_mapping_data_frame.drop(columns="CUI", inplace=True) + retirement_mapping_data_frame.rename(columns={"CONCEPT_NAME": "CUI2_NAME", "SEMTYPE": "CUI2_SEMTYPE"}, inplace=True) + + return retirement_mapping_data_frame + + +######################################################### +# PART 3: Load CUI Names/Semantic Types for Replacement # +######################################################### + + +def read_cui_name_and_semtype_from_umls(data_folder, filename) -> pd.DataFrame: + filepath = os.path.join(data_folder, filename) + column_info = [ + # Each element is a tuple of (column_index, column_name, data_type) + (0, "CUI", "string"), + (1, "CONCEPT_NAME", "string"), + # (2, "SEMTYPE_FULLNAME", "string"), # we will map semantic type abbreviations to fullnames when constructing documents later, no need to read this column + (3, "SEMTYPE", "string") + ] + column_indices = [e[0] for e in column_info] + column_names = [e[1] for e in column_info] + column_dtypes = {e[1]: e[2] for e in column_info} + # Ignore the original header, use column names defined above + data_frame = pd.read_csv(filepath, sep="\t", header=0, names=column_names, usecols=column_indices, dtype=column_dtypes) + + data_frame = data_frame.astype({ + "CUI": "string[pyarrow]", + "CONCEPT_NAME": "string[pyarrow]", + "SEMTYPE": "string[pyarrow]" + }) + + return data_frame + + ############################ -# PART 3: Load SemMed Data # +# PART 4: Load SemMed Data # ############################ def read_semmed_data_frame(data_folder, filename) -> pd.DataFrame: filepath = os.path.join(data_folder, filename) - encoding = "latin1" # TODO encode in UTF-8 before outputting + encoding = "latin1" # TODO encode in UTF-8 before outputting? Once read in strings, it's UTF (to be confirmed)? 
na_value = r"\N" column_info = [ # Each element is a tuple of (column_index, column_name, data_type) @@ -64,33 +152,45 @@ def read_semmed_data_frame(data_folder, filename) -> pd.DataFrame: # "Int8" is a nullable integer type (while `int` cannot handle NA values), range [-128, 127] # "UInt32" ranges [0, 4294967295] # See https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes - (0, "PREDICATION_ID", str), # column 0 (Auto-generated primary key; read as strings for easier concatenation) - # (1, "SENTENCE_ID", str), # column 1 (ignored) - (2, "PMID", "UInt32"), # column 2 (PubMed IDs are 8-digit numbers) - (3, "PREDICATE", str), # column 3 - (4, "SUBJECT_CUI", str), # column 4 - (5, "SUBJECT_NAME", str), # column 5 - (6, "SUBJECT_SEMTYPE", str), # column 6 + (0, "PREDICATION_ID", "UInt32"), # column 0 (Auto-generated primary key; current max is 199,713,830) + # (1, "SENTENCE_ID", "string"), # column 1 (ignored) + (2, "PMID", "UInt32"), # column 2 (PubMed IDs are 8-digit numbers) + (3, "PREDICATE", "string"), # column 3 + (4, "SUBJECT_CUI", "string"), # column 4 + (5, "SUBJECT_NAME", "string"), # column 5 + (6, "SUBJECT_SEMTYPE", "string"), # column 6 (7, "SUBJECT_NOVELTY", "Int8"), # column 7 (Currently either 0 or 1) - (8, "OBJECT_CUI", str), # column 8 - (9, "OBJECT_NAME", str), # column 9 - (10, "OBJECT_SEMTYPE", str), # column 10 - (11, "OBJECT_NOVELTY", "Int8") # column 11 (Currently either 0 or 1) - # (12, "FACT_VALUE", "Int8"), # column 12 (ignored) - # (13, "MOD_SCALE", "Int8"), # column 13 (ignored) - # (14, "MOD_VALUE", "Int8"), # column 14 (ignored) + (8, "OBJECT_CUI", "string"), # column 8 + (9, "OBJECT_NAME", "string"), # column 9 + (10, "OBJECT_SEMTYPE", "string"), # column 10 + (11, "OBJECT_NOVELTY", "Int8") # column 11 (Currently either 0 or 1) + # (12, "FACT_VALUE", "Int8"), # column 12 (ignored) + # (13, "MOD_SCALE", "Int8"), # column 13 (ignored) + # (14, "MOD_VALUE", "Int8"), # column 14 (ignored) ] column_indices = [e[0] for e in 
column_info]
     column_names = [e[1] for e in column_info]
     column_dtypes = {e[1]: e[2] for e in column_info}
-    data_frame = pd.read_csv(filepath, names=column_names, sep=",", usecols=column_indices,
+    data_frame = pd.read_csv(filepath, sep=",", names=column_names, usecols=column_indices,
                              dtype=column_dtypes, na_values=[na_value], encoding=encoding)
+
+    data_frame = data_frame.astype({
+        "PREDICATE": "string[pyarrow]",
+        "SUBJECT_CUI": "string[pyarrow]",
+        "SUBJECT_NAME": "string[pyarrow]",
+        "SUBJECT_SEMTYPE": "string[pyarrow]",
+        "OBJECT_CUI": "string[pyarrow]",
+        "OBJECT_NAME": "string[pyarrow]",
+        "OBJECT_SEMTYPE": "string[pyarrow]"
+    })
+
     return data_frame


-def remove_invalid_object_cuis(data_frame: pd.DataFrame):
+def delete_invalid_object_cuis(semmed_data_frame: pd.DataFrame):
     """
-    This function exclude rows with "invalid" object CUIs in the Semmed data frame.
+    This function removes rows with "invalid" object CUIs in the Semmed data frame.
+    Note this operation must be done BEFORE "explode_pipes()" is called.

     A "valid" object CUI present in "semmedVER43_2022_R_PREDICATION.csv" can be either:
     1. A true CUI (starting with "C", followed by seven numbers, like "C0003725")
@@ -113,7 +213,7 @@ def remove_invalid_object_cuis(data_frame: pd.DataFrame):
     114631930       196519528       1|humn
     114631934       196519532       1|humn

-    Subject CUIs are all valid in "semmedVER43_2022_R_PREDICATION.csv"
+    Subject CUIs are all valid in "semmedVER43_2022_R_PREDICATION.csv". 
""" """ @@ -126,22 +226,262 @@ def remove_invalid_object_cuis(data_frame: pd.DataFrame): # return cui_pattern.match(object_cui.strip()) cui_pattern = r"^[C0-9|]+$" # multiple occurrences of "C", "0" to "9", or "|" (vertical bar) - return data_frame.loc[data_frame["OBJECT_CUI"].str.match(cui_pattern)] + valid_flags = semmed_data_frame["OBJECT_CUI"].str.match(cui_pattern) + invalid_index = semmed_data_frame.index[~valid_flags] + semmed_data_frame.drop(index=invalid_index, inplace=True) + semmed_data_frame.reset_index(drop=True, inplace=True) + return semmed_data_frame -def remove_zero_novelty(data_frame: pd.DataFrame): +def delete_zero_novelty_scores(semmed_data_frame: pd.DataFrame): """ - Records with novelty score equal to 0 should be removed. + Rows with novelty score equal to 0 should be removed. See discussion in https://github.com/biothings/pending.api/issues/63#issuecomment-1100469563 """ - flags = (data_frame["SUBJECT_NOVELTY"] != 0) & (data_frame["OBJECT_NOVELTY"] != 0) - return data_frame.loc[flags] + zero_novelty_flags = semmed_data_frame["SUBJECT_NOVELTY"].eq(0) | semmed_data_frame["OBJECT_NOVELTY"].eq(0) + zero_novelty_index = semmed_data_frame.index[zero_novelty_flags] + semmed_data_frame.drop(index=zero_novelty_index, inplace=True) + semmed_data_frame.reset_index(drop=True, inplace=True) + return semmed_data_frame + + +def explode_pipes(semmed_data_frame: pd.DataFrame): + """ + Split "SUBJECT_CUI", "SUBJECT_NAME", "OBJECT_CUI", and "OBJECT_NAME" by pipes. Then transform the split values into individual rows. + + E.g. 
given the original data + + PREDICATION_ID SUBJECT_CUI SUBJECT_NAME OBJECT_CUI OBJECT_NAME + 11021926 2212|2213|9103 FCGR2A|FCGR2B|FCGR2C C1332714|920 CD4 gene|CD4 + + After splitting by pipes, we have + + PREDICATION_ID SUBJECT_CUI SUBJECT_NAME OBJECT_CUI OBJECT_NAME + 11021926 [2212,2213,9103] [FCGR2A,FCGR2B,FCGR2C] [C1332714,920] [CD4 gene,CD4] + + After the "explode" operations (see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html), we have + + PREDICATION_ID SUBJECT_CUI SUBJECT_NAME OBJECT_CUI OBJECT_NAME + 11021926 2212 FCGR2A C1332714 CD4 gene + 11021926 2213 FCGR2B C1332714 CD4 gene + 11021926 9103 FCGR2C C1332714 CD4 gene + 11021926 2212 FCGR2A 920 CD4 + 11021926 2213 FCGR2B 920 CD4 + 11021926 9103 FCGR2C 920 CD4 + """ + + sub_piped_flags = semmed_data_frame["SUBJECT_CUI"].str.contains(r"\|") + obj_piped_flags = semmed_data_frame["OBJECT_CUI"].str.contains(r"\|") + piped_flags = sub_piped_flags | obj_piped_flags + + semmed_data_frame["IS_PIPED"] = piped_flags + semmed_data_frame.set_index("IS_PIPED", append=False, inplace=True) # use "IS_PIPED" as the new index; discard the original integer index + + piped_predications = semmed_data_frame.loc[True] + + piped_predications = piped_predications.assign( + OBJECT_CUI=piped_predications["OBJECT_CUI"].str.split(r"\|"), + OBJECT_NAME=piped_predications["OBJECT_NAME"].str.split(r"\|"), + SUBJECT_CUI=piped_predications["SUBJECT_CUI"].str.split(r"\|"), + SUBJECT_NAME=piped_predications["SUBJECT_NAME"].str.split(r"\|") + ) + + piped_predications = piped_predications.explode(["OBJECT_CUI", "OBJECT_NAME"]) + piped_predications = piped_predications.explode(["SUBJECT_CUI", "SUBJECT_NAME"]) + # These 4 columns' dtypes are changed to "object" after the above "assign" and "explode" operations + # Convert them to "string[pyarrow]" for less memory usage + piped_predications = piped_predications.astype({ + "SUBJECT_CUI": "string[pyarrow]", + "SUBJECT_NAME": "string[pyarrow]", + 
"OBJECT_CUI": "string[pyarrow]", + "OBJECT_NAME": "string[pyarrow]", + }) + + """ + "CUI" columns may contain empty strings and "NAME" columns may contain "None" strings, e.g.: + + PREDICATION_ID SUBJECT_CUI SUBJECT_NAME OBJECT_CUI OBJECT_NAME + 72530597 C0757738||100329167 m-AAA protease|None|AAA1 C1330957 Cytokinesis of the fertilized ovum + 75458336 C1167321 inner membrane C0757738||100329167 m-AAA protease|None|AAA1 + + Rows containing such values after "explode" operations should be dropped. + """ + piped_predications.reset_index(drop=False, inplace=True) + empty_value_flags = \ + piped_predications["SUBJECT_CUI"].eq('') | piped_predications["SUBJECT_NAME"].eq('None') | \ + piped_predications["OBJECT_CUI"].eq('') | piped_predications["OBJECT_NAME"].eq('None') + empty_value_index = piped_predications.index[empty_value_flags] + piped_predications.drop(index=empty_value_index, inplace=True) + piped_predications.set_index("IS_PIPED", append=False, inplace=True) + + semmed_data_frame.drop(index=True, inplace=True) # drop the original piped predications (marked by True values in "IS_PIPED" index) + semmed_data_frame = pd.concat([semmed_data_frame, piped_predications], copy=False) # append the "exploded" piped predications + semmed_data_frame.reset_index(drop=False, inplace=True) + + return semmed_data_frame + + +def delete_retired_cuis(semmed_data_frame: pd.DataFrame, retired_cuis: Set): + """ + Remove rows containing deleted CUIs specified in "MRCUI.RRF" file. + Note this operation must be done AFTER "explode_pipes()" is called. 
+ """ + deleted_flags = semmed_data_frame["OBJECT_CUI"].isin(retired_cuis) | semmed_data_frame["SUBJECT_CUI"].isin(retired_cuis) + deleted_index = semmed_data_frame.index[deleted_flags] + semmed_data_frame.drop(index=deleted_index, inplace=True) + semmed_data_frame.reset_index(drop=True, inplace=True) + return semmed_data_frame + + +def add_prefix_columns(semmed_data_frame: pd.DataFrame): + """ + Add 2 columns, "SUBJECT_PREFIX" and "OBJECT_PREFIX" to the SemMedDB data frame. + If a "CUI" is a real CUI starting with the letter "C", its prefix would be "umls"; + otherwise the "CUI" should be a NCBIGene ID, and its prefix would be "ncbigene". + """ + subject_prefix_series = pd.Series(np.where(semmed_data_frame["SUBJECT_CUI"].str.startswith("C"), "umls", "ncbigene"), dtype="category") + object_prefix_series = pd.Series(np.where(semmed_data_frame["OBJECT_CUI"].str.startswith("C"), "umls", "ncbigene"), dtype="category") + + semmed_data_frame = semmed_data_frame.assign( + SUBJECT_PREFIX=subject_prefix_series, + OBJECT_PREFIX=object_prefix_series + ) + + return semmed_data_frame + + +def get_cui_name_and_semtype_from_semmed(semmed_data_frame: pd.DataFrame): + sub_cui_flags = semmed_data_frame["SUBJECT_PREFIX"].eq("umls") + obj_cui_flags = semmed_data_frame["OBJECT_PREFIX"].eq("umls") + + sub_cui_semtype_data_frame = semmed_data_frame.loc[sub_cui_flags, ["SUBJECT_CUI", "SUBJECT_NAME", "SUBJECT_SEMTYPE"]] + obj_cui_semtype_data_frame = semmed_data_frame.loc[obj_cui_flags, ["OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE"]] + + sub_cui_semtype_data_frame.drop_duplicates(subset=["SUBJECT_CUI", "SUBJECT_SEMTYPE"], inplace=True) + obj_cui_semtype_data_frame.drop_duplicates(subset=["OBJECT_CUI", "OBJECT_SEMTYPE"], inplace=True) + + unified_column_names = ["CUI", "CONCEPT_NAME", "SEMTYPE"] + sub_cui_semtype_data_frame.columns = unified_column_names + obj_cui_semtype_data_frame.columns = unified_column_names + + cui_semtype_data_frame = pd.concat([sub_cui_semtype_data_frame, 
obj_cui_semtype_data_frame], ignore_index=True, copy=False) + cui_semtype_data_frame.drop_duplicates(subset=["CUI", "SEMTYPE"], inplace=True) + return cui_semtype_data_frame + + +def map_retired_cuis(semmed_data_frame: pd.DataFrame, retirement_mapping_data_frame: pd.DataFrame): + """ + Let's rename: + + - semmed_data_frame as table A(SUBJECT_CUI, SUBJECT_NAME, SUBJECT_SEMTYPE, OBJECT_CUI, OBJECT_NAME, OBJECT_SEMTYPE), the target of replacement, + - retirement_mapping_data_frame as table B(CUI1, CUI2, CUI2_NAME, CUI2_SEMTYPE) where CUI1 is the retired CUI column while CUI2 is new CUI column, and + + The replacement is carried out in the following steps: + + 1. Find from A all predications with retired subjects or objects, resulting in table X + 2. Replace predications with retired subjects + 2.1 X.merge(B, how="left", left_on=[SUBJECT_CUI, SUBJECT_SEMTYPE], right_on=[CUI1, CUI2_SEMTYPE]), i.e. find the same semantic-typed new CUI2 + for each retired SUBJECT_CUI, resulting in table Y + 2.2 Replace columns (SUBJECT_CUI, SUBJECT_NAME, SUBJECT_SEMTYPE) in Y with matched (CUI2, CUI2_NAME, CUI2_SEMTYPE), resulting in table Z + 3. Replace predications with retired objects + 3.1 Z.merge(B, how="left", left_on=[SUBJECT_CUI, SUBJECT_SEMTYPE], right_on=[CUI1, CUI2_SEMTYPE]), i.e. find the same semantic-typed new CUI2 + for each retired SUBJECT_CUI, resulting in table W + 3.2 Replace columns (SUBJECT_CUI, SUBJECT_NAME, SUBJECT_SEMTYPE) in W with matched (CUI2, CUI2_NAME, CUI2_SEMTYPE), resulting in table V + 4. Drop X from A, and then append V to A, resulting in table U. Return U as result. + """ + ########## + # Step 1 # + ########## + """ + Find all retired CUIs to be replaced. + + P.S. Do not use set(mapping_data_frame["CUI1"].unique()). See comments below. + E.g. CUI C4082455 should be replaced to C4300557. 
However C4300557 is not in the "mapping_data_frame" + In this case, all predications with CUI C4082455 should also be marked by "replaced_sub_flags" and be deleted later + """ + retired_cuis = set(retirement_mapping_data_frame["CUI1"].unique()) + sub_retired_flags = semmed_data_frame["SUBJECT_CUI"].isin(retired_cuis) + obj_retired_flags = semmed_data_frame["OBJECT_CUI"].isin(retired_cuis) + retired_flags = sub_retired_flags | obj_retired_flags + + semmed_data_frame["IS_SUBJECT_RETIRED"] = sub_retired_flags + semmed_data_frame["IS_OBJECT_RETIRED"] = obj_retired_flags + + # It does not matter if "retired_predications" is a view or a copy of "semmed_data_frame" + # since the below "merge" operation always returns a new dataframe. + # Therefore, operations on "retired_predications" won't alter "semmed_data_frame". + retired_predications = semmed_data_frame.loc[retired_flags] + + ########## + # Step 2 # + ########## + retired_predications = retired_predications.merge(retirement_mapping_data_frame, how="left", + left_on=["SUBJECT_CUI", "SUBJECT_SEMTYPE"], + right_on=["CUI1", "CUI2_SEMTYPE"]) # match by CUIs and semtypes together + # Overwrite retired SUBJECT_* values with matched CUI2_* values + retired_predications["SUBJECT_CUI"] = np.where(retired_predications["IS_SUBJECT_RETIRED"], + retired_predications["CUI2"], retired_predications["SUBJECT_CUI"]) + retired_predications["SUBJECT_NAME"] = np.where(retired_predications["IS_SUBJECT_RETIRED"], + retired_predications["CUI2_NAME"], retired_predications["SUBJECT_NAME"]) + retired_predications["SUBJECT_SEMTYPE"] = np.where(retired_predications["IS_SUBJECT_RETIRED"], + retired_predications["CUI2_SEMTYPE"], retired_predications["SUBJECT_SEMTYPE"]) + + # Drop all merged columns from retirement_mapping_data_frame + retired_predications.drop(columns=retirement_mapping_data_frame.columns, inplace=True) + # Drop all predications whose retired subjects are unmatched + retired_predications.dropna(axis=0, how="any", 
subset=["SUBJECT_CUI", "SUBJECT_NAME", "SUBJECT_SEMTYPE"], inplace=True) + + ########## + # Step 3 # + ########## + retired_predications = retired_predications.merge(retirement_mapping_data_frame, how="left", + left_on=["OBJECT_CUI", "OBJECT_SEMTYPE"], + right_on=["CUI1", "CUI2_SEMTYPE"]) # match by CUIs and semtypes together + # Overwrite retired OBJECT_* values with new CUI2_* values + retired_predications["OBJECT_CUI"] = np.where(retired_predications["IS_OBJECT_RETIRED"], + retired_predications["CUI2"], retired_predications["OBJECT_CUI"]) + retired_predications["OBJECT_NAME"] = np.where(retired_predications["IS_OBJECT_RETIRED"], + retired_predications["CUI2_NAME"], retired_predications["OBJECT_NAME"]) + retired_predications["OBJECT_SEMTYPE"] = np.where(retired_predications["IS_OBJECT_RETIRED"], + retired_predications["CUI2_SEMTYPE"], retired_predications["OBJECT_SEMTYPE"]) + + # Drop all merged columns from retirement_mapping_data_frame + retired_predications.drop(columns=retirement_mapping_data_frame.columns, inplace=True) + # Drop all predications whose retired objects are unmatched + retired_predications.dropna(axis=0, how="any", subset=["OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE"], inplace=True) + + ########## + # Step 4 # + ########## + # Now these two columns are not necessary. 
Drop them to save memory + retired_predications.drop(columns=["IS_SUBJECT_RETIRED", "IS_OBJECT_RETIRED"], inplace=True) + semmed_data_frame.drop(columns=["IS_SUBJECT_RETIRED", "IS_OBJECT_RETIRED"], inplace=True) + + # Drop the original retired predications + retired_index = semmed_data_frame.index[retired_flags] + semmed_data_frame.drop(index=retired_index, inplace=True) + + # Append the matched new predications + semmed_data_frame = pd.concat([semmed_data_frame, retired_predications], ignore_index=True, copy=False) + semmed_data_frame.sort_values(by="PREDICATION_ID", ignore_index=True) + + return semmed_data_frame ################## -# PART 4: Parser # +# PART 5: Parser # ################## + +# def query_node_normalizer(cui: str): +# # TODO batch query for the whole data frame? (POST with `curies`) +# # TODO or single queries row by row? (GET with `curie`) +# curie = f"UMLS:{cui}" +# conflate = False # "conflate" means "the conflated data will be returned", see https://github.com/TranslatorSRI/Babel/wiki/Babel-output-formats#conflation +# url = f"https://nodenorm.transltr.io/get_normalized_nodes?curie={curie}&conflate={conflate}" +# resp = requests.get(url) + + def construct_documents(row: pd.Series, semantic_type_map): """ SemMedDB Database Details: https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html @@ -161,84 +501,62 @@ def construct_documents(row: pd.Series, semantic_type_map): OBJECT_SEMTYPE : The semantic type of the object of the predication OBJECT_NOVELTY : The novelty of the object of the predication """ - predication_id = row["PREDICATION_ID"] - pmid = row["PMID"] - predicate = row["PREDICATE"] - - sub_cui_list = row["SUBJECT_CUI"].split("|") - sub_name_list = row["SUBJECT_NAME"].split("|") - sub_semantic_type_abv = row["SUBJECT_SEMTYPE"] - sub_semantic_type_name = semantic_type_map.get(sub_semantic_type_abv, None) - sub_novelty = row["SUBJECT_NOVELTY"] - - obj_cui_list = row["OBJECT_CUI"].split("|") - obj_name_list = 
row["OBJECT_NAME"].split("|") - obj_semantic_type_abv = row["OBJECT_SEMTYPE"] - obj_semantic_type_name = semantic_type_map.get(obj_semantic_type_abv, None) - obj_novelty = row["OBJECT_NOVELTY"] - - # if "C" not present, the CUI field must be one or more gene ids - sub_id_field = "umls" if "C" in row["SUBJECT_CUI"] else "ncbigene" - obj_id_field = "umls" if "C" in row["OBJECT_CUI"] else "ncbigene" - - if sub_id_field == "umls": - if '|' in row["SUBJECT_CUI"]: # equivalent to `if len(sub_cui_list) > 1` - # take first CUI if it contains gene id(s) - sub_cui_list = [sub_cui_list[0]] - sub_name_list = [sub_name_list[0]] - - if obj_id_field == "umls": - if '|' in row["OBJECT_CUI"]: # equivalent to `if len(obj_cui_list) > 1` - # take first CUI if it contains gene id(s) - obj_cui_list = [obj_cui_list[0]] - obj_name_list = [obj_name_list[0]] - - id_count = 0 # loop to get all id combinations if one record has multiple ids - for sub_idx, sub_cui in enumerate(sub_cui_list): - for obj_idx, obj_cui in enumerate(obj_cui_list): - - id_count += 1 - if len(sub_cui_list) == 1 and len(obj_cui_list) == 1: - _id = predication_id - else: - _id = predication_id + "_" + str(id_count) # add sequence id - - doc = { - "_id": _id, - "predicate": predicate, - "predication_id": predication_id, - "pmid": pmid, - "subject": { - sub_id_field: sub_cui, - "name": sub_name_list[sub_idx], - "semantic_type_abbreviation": sub_semantic_type_abv, - "semantic_type_name": sub_semantic_type_name, - "novelty": sub_novelty - }, - "object": { - obj_id_field: obj_cui, - "name": obj_name_list[obj_idx], - "semantic_type_abbreviation": obj_semantic_type_abv, - "semantic_type_name": obj_semantic_type_name, - "novelty": obj_novelty - } - } - - # del semtype_name field if we did not any mappings - if not sub_semantic_type_name: - del doc["subject"]["semantic_type_name"] - if not obj_semantic_type_name: - del doc["object"]["semantic_type_name"] - - yield doc + doc = { + "_id": row["_ID"], + "predication_id": 
row["PREDICATION_ID"], + "pmid": row["PMID"], + "predicate": row["PREDICATE"], + "subject": { + row["SUBJECT_PREFIX"]: row["SUBJECT_CUI"], + "name": row["SUBJECT_NAME"], + "semantic_type_abbreviation": row["SUBJECT_SEMTYPE"], + "semantic_type_name": semantic_type_map.get(row["SUBJECT_SEMTYPE"], None), + "novelty": row["SUBJECT_NOVELTY"] + }, + "object": { + row["OBJECT_PREFIX"]: row["OBJECT_CUI"], + "name": row["OBJECT_NAME"], + "semantic_type_abbreviation": row["OBJECT_SEMTYPE"], + "semantic_type_name": semantic_type_map.get(row["OBJECT_SEMTYPE"], None), + "novelty": row["OBJECT_NOVELTY"] + } + } + + # del semtype_name field if we did not any mappings + if not doc["subject"]["semantic_type_name"]: + del doc["subject"]["semantic_type_name"] + if not doc["object"]["semantic_type_name"]: + del doc["object"]["semantic_type_name"] + + yield doc def load_data(data_folder): - semantic_type_df = read_semantic_type_data_frame(data_folder, "SemanticTypes_2013AA.txt") - semantic_type_map = dict(zip(semantic_type_df["abv"], semantic_type_df["label"])) - semmed_df = read_semmed_data_frame(data_folder, "semmedVER43_2022_R_PREDICATION.csv") - semmed_df = remove_invalid_object_cuis(semmed_df) - semmed_df = remove_zero_novelty(semmed_df) + semmed_df = delete_zero_novelty_scores(semmed_df) + semmed_df = delete_invalid_object_cuis(semmed_df) + semmed_df = explode_pipes(semmed_df) + + mrcui_df = read_mrcui_data_frame(data_folder, "MRCUI.RRF") + deleted_cuis = get_retired_cuis_for_deletion(mrcui_df) + semmed_df = delete_retired_cuis(semmed_df, deleted_cuis) + + semmed_df = add_prefix_columns(semmed_df) + semmed_cui_name_semtype_df = get_cui_name_and_semtype_from_semmed(semmed_df) + umls_cui_name_semtype_df = read_cui_name_and_semtype_from_umls(data_folder, "UMLS_CUI_Semtype.tsv") + retirement_mapping_df = get_retirement_mapping_data_frame(mrcui_df) + retirement_mapping_df = add_cui_name_and_semtype_to_retirement_mapping(retirement_mapping_df, semmed_cui_name_semtype_df, 
umls_cui_name_semtype_df) + semmed_df = map_retired_cuis(semmed_df, retirement_mapping_df) + + semtype_mappings_df = read_semantic_type_mappings_data_frame(data_folder, "SemanticTypes_2018AB.txt") + semtype_name_map = get_semtype_name_map(semtype_mappings_df) + + # TODO query node normalizer in each GroupBy(PredicateID) for _, row in semmed_df.iterrows(): - yield from construct_documents(row, semantic_type_map) + yield from construct_documents(row, semtype_name_map) + +# TODO load_data(data_folder, use_intermediate=False) +""" +if not use_intermediate: + go thru data cleaning +""" \ No newline at end of file From 856268cfa46454d45b50b6439b8e4e883051e801 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Sat, 31 Dec 2022 15:49:48 -0800 Subject: [PATCH 09/15] add two marker columns, IS_SUBJECT_PIPED and IS_OBJECT_PIPED; change retired predication dtype to Pyarrow string before concatenation; add the client to query Node Normalizer --- parser.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 16 deletions(-) diff --git a/parser.py b/parser.py index c06381a..acd7825 100644 --- a/parser.py +++ b/parser.py @@ -1,9 +1,14 @@ import os -from typing import Dict, Set -import requests +import aiohttp +import asyncio import pandas as pd import numpy as np +from typing import Dict, Set +from collections.abc import Collection # for type hints + +from biothings.utils.common import iter_n + ################################### # PART 1: Load Semantic Type Data # @@ -272,8 +277,11 @@ def explode_pipes(semmed_data_frame: pd.DataFrame): sub_piped_flags = semmed_data_frame["SUBJECT_CUI"].str.contains(r"\|") obj_piped_flags = semmed_data_frame["OBJECT_CUI"].str.contains(r"\|") - piped_flags = sub_piped_flags | obj_piped_flags + # These two indices are necessary to locate equivalent NCBIGene IDs + semmed_data_frame["IS_SUBJECT_PIPED"] = sub_piped_flags + semmed_data_frame["IS_OBJECT_PIPED"] = obj_piped_flags + piped_flags = sub_piped_flags | 
obj_piped_flags semmed_data_frame["IS_PIPED"] = piped_flags semmed_data_frame.set_index("IS_PIPED", append=False, inplace=True) # use "IS_PIPED" as the new index; discard the original integer index @@ -306,17 +314,17 @@ def explode_pipes(semmed_data_frame: pd.DataFrame): Rows containing such values after "explode" operations should be dropped. """ - piped_predications.reset_index(drop=False, inplace=True) + piped_predications.reset_index(drop=False, inplace=True) # switch to the integer index, for ".drop(index=?)" operation below empty_value_flags = \ piped_predications["SUBJECT_CUI"].eq('') | piped_predications["SUBJECT_NAME"].eq('None') | \ piped_predications["OBJECT_CUI"].eq('') | piped_predications["OBJECT_NAME"].eq('None') empty_value_index = piped_predications.index[empty_value_flags] piped_predications.drop(index=empty_value_index, inplace=True) - piped_predications.set_index("IS_PIPED", append=False, inplace=True) + piped_predications.set_index("IS_PIPED", append=False, inplace=True) # switch back to the "IS_PIPED" index, for ".concat()" operation below semmed_data_frame.drop(index=True, inplace=True) # drop the original piped predications (marked by True values in "IS_PIPED" index) semmed_data_frame = pd.concat([semmed_data_frame, piped_predications], copy=False) # append the "exploded" piped predications - semmed_data_frame.reset_index(drop=False, inplace=True) + semmed_data_frame.reset_index(drop=True, inplace=True) # drop the "IS_PIPED" index (no longer needed) return semmed_data_frame @@ -451,6 +459,15 @@ def map_retired_cuis(semmed_data_frame: pd.DataFrame, retirement_mapping_data_fr # Drop all predications whose retired objects are unmatched retired_predications.dropna(axis=0, how="any", subset=["OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE"], inplace=True) + retired_predications = retired_predications.astype({ + "SUBJECT_CUI": "string[pyarrow]", + "SUBJECT_NAME": "string[pyarrow]", + "SUBJECT_SEMTYPE": "string[pyarrow]", + "OBJECT_CUI": 
"string[pyarrow]", + "OBJECT_NAME": "string[pyarrow]", + "OBJECT_SEMTYPE": "string[pyarrow]", + }) + ########## # Step 4 # ########## @@ -468,19 +485,69 @@ def map_retired_cuis(semmed_data_frame: pd.DataFrame, retirement_mapping_data_fr return semmed_data_frame -################## -# PART 5: Parser # -################## +################################## +# PART 5: Node Normalizer Client # +################################## + +async def query_node_normalizer_for_equivalent_ncbigene_ids(cui_collection: Collection, chunk_size: int, connector_limit: int) -> Dict: + """ + Given a collection of CUIs, query Node Normalizer to fetch their equivalent NCBIGene IDs. -# def query_node_normalizer(cui: str): -# # TODO batch query for the whole data frame? (POST with `curies`) -# # TODO or single queries row by row? (GET with `curie`) -# curie = f"UMLS:{cui}" -# conflate = False # "conflate" means "the conflated data will be returned", see https://github.com/TranslatorSRI/Babel/wiki/Babel-output-formats#conflation -# url = f"https://nodenorm.transltr.io/get_normalized_nodes?curie={curie}&conflate={conflate}" -# resp = requests.get(url) + To avoid timeout issues, the CUI collection will be partitioned into chunks. + Each chunk of CUIs will be passed to the Node Normalizer's POST endpoint for querying. + To avoid other traffic errors, use `connector_limit` to control the number of parallel connections to the endpoint. + """ + + # Define the querying task for each chunk of CUIs + async def _query(aio_session: aiohttp.ClientSession, cui_chunk: Collection) -> dict: + cui_gene_id_map = {} + + cui_prefix = "UMLS:" + gene_id_prefix = "NCBIGene:" + + url = f"https://nodenorm.transltr.io/get_normalized_nodes" + payload = { + # {"conflate": True} means "the conflated data will be returned by the endpoint", which is not necessary here. 
+ # See https://github.com/TranslatorSRI/Babel/wiki/Babel-output-formats#conflation + "conflate": False, + "curies": [f"{cui_prefix}{cui}" for cui in cui_chunk] + } + + async with aio_session.post(url, json=payload) as resp: + json_resp = await resp.json() + + for curie, curie_result in json_resp.items(): + if curie_result is None: + continue + + for eq_id in curie_result.get("equivalent_identifiers"): + identifier = eq_id["identifier"] + if identifier.startswith(gene_id_prefix): + cui = curie[len(cui_prefix):] # trim out the prefix "UMLS:" + cui_gene_id_map[cui] = identifier[len(gene_id_prefix):] # trim out the prefix "NCBIGene:" + break + + return cui_gene_id_map + + # Create a querying task for each chunk of CUI collections, run them, collect and combine the results + connector = aiohttp.TCPConnector(limit=connector_limit) # + async with aiohttp.ClientSession(connector=connector, raise_for_status=True) as session: + tasks = [_query(session, cui_chunk) for cui_chunk in iter_n(cui_collection, chunk_size)] + + # "asyncio.gather()" will wait on the entire task set to be completed. 
+ # If you want to process results greedily as they come in, loop over asyncio.as_completed() + cui_gene_id_maps = await asyncio.gather(*tasks, return_exceptions=True) # "cui_gene_id_maps" is a list of dictionaries + + # Merge all dictionaries in "cui_gene_id_maps" + merged_map = {cui: gene_id for cg_map in cui_gene_id_maps for cui, gene_id in cg_map.items()} + return merged_map + + +################## +# PART 6: Parser # +################## def construct_documents(row: pd.Series, semantic_type_map): """ From 37f78eec7b18783d0a81b95434967a81e2e934f5 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 3 Jan 2023 22:46:02 -0800 Subject: [PATCH 10/15] add functions to locate and delete equivalent NCBIGene IDs for piped CUIs; add a function to determine the document ID for each predication; change node normalizer parameter conflate to True --- parser.py | 107 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 96 insertions(+), 11 deletions(-) diff --git a/parser.py b/parser.py index acd7825..7fd1dd1 100644 --- a/parser.py +++ b/parser.py @@ -365,6 +365,7 @@ def get_cui_name_and_semtype_from_semmed(semmed_data_frame: pd.DataFrame): sub_cui_semtype_data_frame = semmed_data_frame.loc[sub_cui_flags, ["SUBJECT_CUI", "SUBJECT_NAME", "SUBJECT_SEMTYPE"]] obj_cui_semtype_data_frame = semmed_data_frame.loc[obj_cui_flags, ["OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE"]] + # TODO add notes sub_cui_semtype_data_frame.drop_duplicates(subset=["SUBJECT_CUI", "SUBJECT_SEMTYPE"], inplace=True) obj_cui_semtype_data_frame.drop_duplicates(subset=["OBJECT_CUI", "OBJECT_SEMTYPE"], inplace=True) @@ -486,6 +487,93 @@ def map_retired_cuis(semmed_data_frame: pd.DataFrame, retirement_mapping_data_fr return semmed_data_frame +async def delete_equivalent_ncbigene_ids(semmed_data_frame: pd.DataFrame): + async def get_cui_to_gene_id_maps(sub_cui_flags: pd.Series, obj_cui_flags: pd.Series): + sub_cuis = set(semmed_data_frame.loc[sub_cui_flags, "SUBJECT_CUI"].unique()) + obj_cuis = 
set(semmed_data_frame.loc[obj_cui_flags, "OBJECT_CUI"].unique()) + + cuis = sub_cuis.union(obj_cuis) + chunk_size = 1000 + connector_limit = 10 + # a map where there the key is a source CUI and the value is its equivalent NCBIGene ID + cui_gene_id_map = await query_node_normalizer_for_equivalent_ncbigene_ids(cuis, chunk_size=chunk_size, connector_limit=connector_limit) + + sub_cui_gene_id_map = {cui: gene_id for cui, gene_id in cui_gene_id_map.items() if cui in sub_cuis} + obj_cui_gene_id_map = {cui: gene_id for cui, gene_id in cui_gene_id_map.items() if cui in obj_cuis} + + return sub_cui_gene_id_map, obj_cui_gene_id_map + + def get_pred_id_to_cui_maps(sub_cui_flags: pd.Series, obj_cui_flags: pd.Series): + sub_cui_predications = semmed_data_frame.loc[sub_cui_flags, ["SUBJECT_CUI", "PREDICATION_ID"]] + obj_cui_predications = semmed_data_frame.loc[obj_cui_flags, ["OBJECT_CUI", "PREDICATION_ID"]] + + pred_id_sub_cui_map = dict(zip(sub_cui_predications["PREDICATION_ID"], sub_cui_predications["SUBJECT_CUI"])) + pred_id_obj_cui_map = dict(zip(obj_cui_predications["PREDICATION_ID"], obj_cui_predications["OBJECT_CUI"])) + + return pred_id_sub_cui_map, pred_id_obj_cui_map + + def establish_pred_id_to_gene_id_map(pid_cui_map: Dict, cui_gid_map: Dict): + pid_gid_map = {pid: cui_gid_map[cui] for pid, cui in pid_cui_map.items() if cui in cui_gid_map} + return pid_gid_map + + def get_row_index_of_equivalent_ncbigene_ids(pred_id_sub_gene_id_map: Dict, pred_id_obj_gene_id_map: Dict): + sub_piped_predications = semmed_data_frame.loc[semmed_data_frame["IS_SUBJECT_PIPED"], ["PREDICATION_ID", "SUBJECT_CUI"]] + obj_piped_predications = semmed_data_frame.loc[semmed_data_frame["IS_OBJECT_PIPED"], ["PREDICATION_ID", "OBJECT_CUI"]] + + sub_piped_predications.reset_index(drop=False, inplace=True) # make the integer index a column named "index" + obj_piped_predications.reset_index(drop=False, inplace=True) # make the integer index a column named "index" + + 
sub_piped_predications.set_index(["PREDICATION_ID", "SUBJECT_CUI"], append=False, inplace=True) # do not append the default integer index to columns + obj_piped_predications.set_index(["PREDICATION_ID", "OBJECT_CUI"], append=False, inplace=True) # do not append the default integer index to columns + + sub_piped_predications.sort_index(inplace=True) + obj_piped_predications.sort_index(inplace=True) + + dest_pred_id_sub_gene_id_pairs = [(pid, gid) for (pid, gid) in pred_id_sub_gene_id_map.items() if (pid, gid) in sub_piped_predications.index] + dest_pred_id_obj_gene_id_pairs = [(pid, gid) for (pid, gid) in pred_id_obj_gene_id_map.items() if (pid, gid) in obj_piped_predications.index] + + dest_row_index_of_equivalent_gid_for_sub = set(sub_piped_predications.loc[dest_pred_id_sub_gene_id_pairs, "index"].values) + dest_row_index_of_equivalent_gid_for_obj = set(obj_piped_predications.loc[dest_pred_id_obj_gene_id_pairs, "index"].values) + + dest_row_index = dest_row_index_of_equivalent_gid_for_sub.union(dest_row_index_of_equivalent_gid_for_obj) + return dest_row_index + + candidate_sub_cui_flags = semmed_data_frame["IS_SUBJECT_PIPED"] & semmed_data_frame["SUBJECT_PREFIX"].eq("umls") + candidate_obj_cui_flags = semmed_data_frame["IS_OBJECT_PIPED"] & semmed_data_frame["OBJECT_PREFIX"].eq("umls") + sub_cui_gid_map, obj_cui_gid_map = await get_cui_to_gene_id_maps(candidate_sub_cui_flags, candidate_obj_cui_flags) + + source_sub_cui_flags = semmed_data_frame["IS_SUBJECT_PIPED"] & semmed_data_frame["SUBJECT_CUI"].isin(sub_cui_gid_map) + source_obj_cui_flags = semmed_data_frame["IS_OBJECT_PIPED"] & semmed_data_frame["OBJECT_CUI"].isin(obj_cui_gid_map) + pid_sub_cui_map, pid_obj_cui_map = get_pred_id_to_cui_maps(source_sub_cui_flags, source_obj_cui_flags) + + pid_sub_gid_map = establish_pred_id_to_gene_id_map(pid_sub_cui_map, sub_cui_gid_map) + pid_obj_gid_map = establish_pred_id_to_gene_id_map(pid_obj_cui_map, obj_cui_gid_map) + + dest_equivalent_gid_index = 
get_row_index_of_equivalent_ncbigene_ids(pid_sub_gid_map, pid_obj_gid_map) + + semmed_data_frame.drop(index=dest_equivalent_gid_index, inplace=True) + return semmed_data_frame + + +def add_document_id_column(semmed_data_frame: pd.DataFrame): + # CUIs in descending order so a true CUI precedes a NCBIGene ID + semmed_data_frame.sort_values(by=['PREDICATION_ID', 'SUBJECT_CUI', 'OBJECT_CUI'], + ascending=[True, False, False], ignore_index=True, inplace=True) + + primary_ids = semmed_data_frame["PREDICATION_ID"].astype("string[pyarrow]") + + groupwise_pred_nums = semmed_data_frame.groupby("PREDICATION_ID").cumcount().add(1) + secondary_ids = (f"{pid}-{num}" for pid, num in zip(semmed_data_frame["PREDICATION_ID"], groupwise_pred_nums)) + secondary_ids = pd.Series(data=secondary_ids, dtype="string[pyarrow]", index=semmed_data_frame.index) + + group_sizes = semmed_data_frame.groupby('PREDICATION_ID').transform('size') + _ids = pd.Series(data=np.where(group_sizes.eq(1), primary_ids, secondary_ids), + dtype="string[pyarrow]", index=semmed_data_frame.index) + + semmed_data_frame["_ID"] = _ids + return semmed_data_frame + + ################################## # PART 5: Node Normalizer Client # ################################## @@ -509,9 +597,9 @@ async def _query(aio_session: aiohttp.ClientSession, cui_chunk: Collection) -> d url = f"https://nodenorm.transltr.io/get_normalized_nodes" payload = { - # {"conflate": True} means "the conflated data will be returned by the endpoint", which is not necessary here. + # {"conflate": True} means "the conflated data will be returned by the endpoint". 
# See https://github.com/TranslatorSRI/Babel/wiki/Babel-output-formats#conflation - "conflate": False, + "conflate": True, "curies": [f"{cui_prefix}{cui}" for cui in cui_chunk] } @@ -549,7 +637,7 @@ async def _query(aio_session: aiohttp.ClientSession, cui_chunk: Collection) -> d # PART 6: Parser # ################## -def construct_documents(row: pd.Series, semantic_type_map): +def construct_document(row: pd.Series, semantic_type_map): """ SemMedDB Database Details: https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/dbinfo.html @@ -615,15 +703,12 @@ def load_data(data_folder): retirement_mapping_df = add_cui_name_and_semtype_to_retirement_mapping(retirement_mapping_df, semmed_cui_name_semtype_df, umls_cui_name_semtype_df) semmed_df = map_retired_cuis(semmed_df, retirement_mapping_df) + semmed_df = await delete_equivalent_ncbigene_ids(semmed_df) + + semmed_df = add_document_id_column(semmed_df) + semtype_mappings_df = read_semantic_type_mappings_data_frame(data_folder, "SemanticTypes_2018AB.txt") semtype_name_map = get_semtype_name_map(semtype_mappings_df) - # TODO query node normalizer in each GroupBy(PredicateID) for _, row in semmed_df.iterrows(): - yield from construct_documents(row, semtype_name_map) - -# TODO load_data(data_folder, use_intermediate=False) -""" -if not use_intermediate: - go thru data cleaning -""" \ No newline at end of file + yield from construct_document(row, semtype_name_map) From 7dd09da48efc8924e867d2f30a3391eec7f9bb35 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 10 Jan 2023 13:24:08 -0800 Subject: [PATCH 11/15] add cache reading/writing for Node Normalizer queries; revise equivalent NCBIGene ID row location algorithm (reducing running time from 24min to 40sec --- parser.py | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/parser.py b/parser.py index 7fd1dd1..b6b9931 100644 --- a/parser.py +++ b/parser.py @@ -1,4 +1,5 @@ import os +import pickle import aiohttp 
import asyncio import pandas as pd @@ -487,16 +488,27 @@ def map_retired_cuis(semmed_data_frame: pd.DataFrame, retirement_mapping_data_fr return semmed_data_frame -async def delete_equivalent_ncbigene_ids(semmed_data_frame: pd.DataFrame): +async def delete_equivalent_ncbigene_ids(semmed_data_frame: pd.DataFrame, + node_normalizer_cache: str = None, + node_normalizer_output: str = None): async def get_cui_to_gene_id_maps(sub_cui_flags: pd.Series, obj_cui_flags: pd.Series): sub_cuis = set(semmed_data_frame.loc[sub_cui_flags, "SUBJECT_CUI"].unique()) obj_cuis = set(semmed_data_frame.loc[obj_cui_flags, "OBJECT_CUI"].unique()) - cuis = sub_cuis.union(obj_cuis) - chunk_size = 1000 - connector_limit = 10 - # a map where there the key is a source CUI and the value is its equivalent NCBIGene ID - cui_gene_id_map = await query_node_normalizer_for_equivalent_ncbigene_ids(cuis, chunk_size=chunk_size, connector_limit=connector_limit) + if node_normalizer_cache and os.path.exists(node_normalizer_cache): + with open(node_normalizer_cache, 'rb') as handle: + cui_gene_id_map = pickle.load(handle) + else: + cuis = sub_cuis.union(obj_cuis) + chunk_size = 1000 + connector_limit = 10 + # a map where there the key is a source CUI and the value is its equivalent NCBIGene ID + cui_gene_id_map = await query_node_normalizer_for_equivalent_ncbigene_ids(cuis, chunk_size=chunk_size, connector_limit=connector_limit) + + # Output to the specified pickle file regardless if it's cache or live response + if node_normalizer_output: + with open(node_normalizer_output, 'wb') as handle: + pickle.dump(cui_gene_id_map, handle, protocol=pickle.HIGHEST_PROTOCOL) sub_cui_gene_id_map = {cui: gene_id for cui, gene_id in cui_gene_id_map.items() if cui in sub_cuis} obj_cui_gene_id_map = {cui: gene_id for cui, gene_id in cui_gene_id_map.items() if cui in obj_cuis} @@ -523,20 +535,21 @@ def get_row_index_of_equivalent_ncbigene_ids(pred_id_sub_gene_id_map: Dict, pred sub_piped_predications.reset_index(drop=False, 
inplace=True) # make the integer index a column named "index" obj_piped_predications.reset_index(drop=False, inplace=True) # make the integer index a column named "index" - sub_piped_predications.set_index(["PREDICATION_ID", "SUBJECT_CUI"], append=False, inplace=True) # do not append the default integer index to columns - obj_piped_predications.set_index(["PREDICATION_ID", "OBJECT_CUI"], append=False, inplace=True) # do not append the default integer index to columns + pid_sub_gid_df = pd.DataFrame(data=pred_id_sub_gene_id_map.items(), columns=["PREDICATION_ID", "SUBJECT_CUI"]) + pid_obj_gid_df = pd.DataFrame(data=pred_id_obj_gene_id_map.items(), columns=["PREDICATION_ID", "OBJECT_CUI"]) - sub_piped_predications.sort_index(inplace=True) - obj_piped_predications.sort_index(inplace=True) + dest_sub_gid_df = sub_piped_predications.merge(pid_sub_gid_df, how="inner", + left_on=["PREDICATION_ID", "SUBJECT_CUI"], + right_on=["PREDICATION_ID", "SUBJECT_CUI"]) + dest_obj_gid_df = obj_piped_predications.merge(pid_obj_gid_df, how="inner", + left_on=["PREDICATION_ID", "OBJECT_CUI"], + right_on=["PREDICATION_ID", "OBJECT_CUI"]) - dest_pred_id_sub_gene_id_pairs = [(pid, gid) for (pid, gid) in pred_id_sub_gene_id_map.items() if (pid, gid) in sub_piped_predications.index] - dest_pred_id_obj_gene_id_pairs = [(pid, gid) for (pid, gid) in pred_id_obj_gene_id_map.items() if (pid, gid) in obj_piped_predications.index] + dest_sub_gid_index = set(dest_sub_gid_df["index"].unique()) + dest_obj_gid_index = set(dest_obj_gid_df["index"].unique()) - dest_row_index_of_equivalent_gid_for_sub = set(sub_piped_predications.loc[dest_pred_id_sub_gene_id_pairs, "index"].values) - dest_row_index_of_equivalent_gid_for_obj = set(obj_piped_predications.loc[dest_pred_id_obj_gene_id_pairs, "index"].values) - - dest_row_index = dest_row_index_of_equivalent_gid_for_sub.union(dest_row_index_of_equivalent_gid_for_obj) - return dest_row_index + dest_gid_index = dest_sub_gid_index.union(dest_obj_gid_index) + 
return dest_gid_index candidate_sub_cui_flags = semmed_data_frame["IS_SUBJECT_PIPED"] & semmed_data_frame["SUBJECT_PREFIX"].eq("umls") candidate_obj_cui_flags = semmed_data_frame["IS_OBJECT_PIPED"] & semmed_data_frame["OBJECT_PREFIX"].eq("umls") From facb3b8e213572e8ee9e30d59b7e1abca6f3aab9 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 10 Jan 2023 13:34:12 -0800 Subject: [PATCH 12/15] add comments --- parser.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/parser.py b/parser.py index b6b9931..33abfad 100644 --- a/parser.py +++ b/parser.py @@ -126,7 +126,8 @@ def read_cui_name_and_semtype_from_umls(data_folder, filename) -> pd.DataFrame: # Each element is a tuple of (column_index, column_name, data_type) (0, "CUI", "string"), (1, "CONCEPT_NAME", "string"), - # (2, "SEMTYPE_FULLNAME", "string"), # we will map semantic type abbreviations to fullnames when constructing documents later, no need to read this column + # we will map semantic type abbreviations to fullnames when constructing documents later, no need to read this column for now + # (2, "SEMTYPE_FULLNAME", "string"), (3, "SEMTYPE", "string") ] column_indices = [e[0] for e in column_info] @@ -150,7 +151,7 @@ def read_cui_name_and_semtype_from_umls(data_folder, filename) -> pd.DataFrame: def read_semmed_data_frame(data_folder, filename) -> pd.DataFrame: filepath = os.path.join(data_folder, filename) - encoding = "latin1" # TODO encode in UTF-8 before outputting? Once read in strings, it's UTF (to be confirmed)? + encoding = "latin1" # file may contain chars in other languages (e.g. 
French) na_value = r"\N" column_info = [ # Each element is a tuple of (column_index, column_name, data_type) @@ -366,7 +367,13 @@ def get_cui_name_and_semtype_from_semmed(semmed_data_frame: pd.DataFrame): sub_cui_semtype_data_frame = semmed_data_frame.loc[sub_cui_flags, ["SUBJECT_CUI", "SUBJECT_NAME", "SUBJECT_SEMTYPE"]] obj_cui_semtype_data_frame = semmed_data_frame.loc[obj_cui_flags, ["OBJECT_CUI", "OBJECT_NAME", "OBJECT_SEMTYPE"]] - # TODO add notes + """ + Drop duplicates in advance in order to: + 1. reduce memory usage, and + 2. avoid the "ArrowInvalid: offset overflow while concatenating arrays" error due to a bug in Apache Arrow. + + See https://issues.apache.org/jira/browse/ARROW-10799 for the bug details + """ sub_cui_semtype_data_frame.drop_duplicates(subset=["SUBJECT_CUI", "SUBJECT_SEMTYPE"], inplace=True) obj_cui_semtype_data_frame.drop_duplicates(subset=["OBJECT_CUI", "OBJECT_SEMTYPE"], inplace=True) From b21e6415d71f0483b9d63e41feba0b8dcdb1bdfc Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 10 Jan 2023 13:51:33 -0800 Subject: [PATCH 13/15] revise function delete_equivalent_ncbigene_ids to reduce peak memory usage from 3x dataframe size to 2x --- parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/parser.py b/parser.py index 33abfad..03a9ebf 100644 --- a/parser.py +++ b/parser.py @@ -576,20 +576,20 @@ def get_row_index_of_equivalent_ncbigene_ids(pred_id_sub_gene_id_map: Dict, pred def add_document_id_column(semmed_data_frame: pd.DataFrame): - # CUIs in descending order so a true CUI precedes a NCBIGene ID + # CUIs in descending order so a true CUI always precedes a NCBIGene ID inside a predication group semmed_data_frame.sort_values(by=['PREDICATION_ID', 'SUBJECT_CUI', 'OBJECT_CUI'], ascending=[True, False, False], ignore_index=True, inplace=True) - primary_ids = semmed_data_frame["PREDICATION_ID"].astype("string[pyarrow]") + pred_groups = semmed_data_frame.loc[:, ["PREDICATION_ID"]].groupby("PREDICATION_ID") + 
groupwise_pred_nums = pred_groups.cumcount().add(1) + group_sizes = pred_groups.transform("size") - groupwise_pred_nums = semmed_data_frame.groupby("PREDICATION_ID").cumcount().add(1) + primary_ids = semmed_data_frame["PREDICATION_ID"].astype("string[pyarrow]") secondary_ids = (f"{pid}-{num}" for pid, num in zip(semmed_data_frame["PREDICATION_ID"], groupwise_pred_nums)) secondary_ids = pd.Series(data=secondary_ids, dtype="string[pyarrow]", index=semmed_data_frame.index) - group_sizes = semmed_data_frame.groupby('PREDICATION_ID').transform('size') _ids = pd.Series(data=np.where(group_sizes.eq(1), primary_ids, secondary_ids), dtype="string[pyarrow]", index=semmed_data_frame.index) - semmed_data_frame["_ID"] = _ids return semmed_data_frame From 1d04d49fe991c78b5f67abd24f2cabf945276ee9 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 10 Jan 2023 14:48:33 -0800 Subject: [PATCH 14/15] change the version number --- version.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/version.py b/version.py index 5668890..9285c19 100644 --- a/version.py +++ b/version.py @@ -1,3 +1,4 @@ def get_release(self): - # 07/2021 - return "2.0" + # 07/2022 + # See https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/SemMedDB_download.html + return "43" From 279888e8b2481f09b63a8aaad931eed9dc8a5b51 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 10 Jan 2023 18:49:02 -0800 Subject: [PATCH 15/15] change dumper file links in the manifest; update README --- README.md | 34 +++++++++++++++++++++++++++++++++- manifest.json | 17 ++++++++++------- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index afdfce3..31f7fda 100644 --- a/README.md +++ b/README.md @@ -1 +1,33 @@ -# SEMMED API parser 08/2021 +# SemMedDB Predication Parser + +## Source Data Files + +There are four files required by this parser: + +1. The _PREDICATION_ CSV file. 
+ - Version: `semmedVER43_R` + - Download page: https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/SemMedDB_download.html + - Direct download link: https://data.lhncbc.nlm.nih.gov/umls-restricted/ii/tools/SemRep_SemMedDB_SKR/semmedVER43_2022_R_PREDICATION.csv.gz + - Filename: `semmedVER43_2022_R_PREDICATION.csv.gz` +2. The _Semantic Type Mappings_ file. + - Version: `2018AB` (UMLS release version number, as shown in [UMLS Release File Archives](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsarchives04.html)) + - Download page: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/documentation/SemanticTypesAndGroups.html + - Direct download link: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt + - Filenname: `SemanticTypes_2018AB.txt` +3. The _Retired CUI Mapping_ file. + - Version: `2022AA` (UMLS release version number, as shown in [UMLS Release File Archives](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsarchives04.html)) + - Download page: this file is part of the [_2022AA UMLS Metathesaurus Full Subset_](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsarchives04.html); no direct download link is available. + - Description: https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.retired_cui_mapping_file_mrcui_rr/ + - Filename: `MRCUI.RRF` +4. The _Preferred CUI Names & Semtypes_ file. + - Github Repo: https://github.com/erikyao/UMLS_CUI_Semtype + - How to Generate: + - Source data files: + - The _Retired CUI Mapping_ file, `MRCUI.RRF` (see above). + - The _Concept Names and Sources_ file, `MRCONSO.RRF`. + - Version: `2022AA` + - Download page: https://www.nlm.nih.gov/research/umls/licensedcontent/umlsarchives04.html + - Description: https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/ + - The _Semantic Type Mappings_ file, `SemanticTypes_2018AB.txt` (see above). 
+ - Script: [parser.py](https://github.com/erikyao/UMLS_CUI_Semtype/blob/main/parser.py) + - Filename: `UMLS_CUI_Semtype.tsv`, as shown [here](https://github.com/erikyao/UMLS_CUI_Semtype/blob/main/parser.py#L188) \ No newline at end of file diff --git a/manifest.json b/manifest.json index fbe29fc..f884c92 100644 --- a/manifest.json +++ b/manifest.json @@ -1,14 +1,17 @@ { - "version": "0.1", - "requires" : "pyyaml", + "version": "0.2", "__metadata__" : { - "license_url" : "https://skr3.nlm.nih.gov/SemMedDB/", - "licence" : "CC BY 4.0", - "url" : "https://skr3.nlm.nih.gov/SemMedDB/" + "license_url" : "https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR.html", + "licence" : "UMLS Metathesaurus License", + "url" : "https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR.html" }, "dumper" : { - "data_url" : ["https://dl.dropboxusercontent.com/s/im2ru5epalfr9lt/semmed_source.tar.gz", - "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2013AA.txt"], + "data_url" : [ + "https://data.lhncbc.nlm.nih.gov/umls-restricted/ii/tools/SemRep_SemMedDB_SKR/semmedVER43_2022_R_PREDICATION.csv.gz", + "https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt", + "http://localhost:8080/dataupload/mysrc/semmeddb/MRCUI.RRF", + "http://localhost:8080/dataupload/mysrc/semmeddb/UMLS_CUI_Semtype.tsv" + ], "uncompress" : true, "release" : "version:get_release" },