diff --git a/code/kg2c/build_kg2c.py b/code/kg2c/build_kg2c.py index 6443c4098..fe77a8ecd 100644 --- a/code/kg2c/build_kg2c.py +++ b/code/kg2c/build_kg2c.py @@ -19,7 +19,7 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__))) from create_kg2c_files import create_kg2c_files -from record_kg2c_meta_info import record_meta_kg_info +from record_kg2c_meta_info import record_select_meta_info import file_manager KG2C_DIR = f"{os.path.dirname(os.path.abspath(__file__))}" @@ -118,8 +118,8 @@ def main(): # Actually build KG2c logging.info("Calling create_kg2c_files.py..") create_kg2c_files(args.kg2pre_version, args.sub_version, args.biolink_version, synonymizer_name, args.test) - logging.info("Calling record_kg2c_meta_info.py..") - record_meta_kg_info(args.biolink_version, args.test) + logging.info("Calling record_select_meta_info.py..") + record_select_meta_info(args.biolink_version, args.test) # Upload artifacts to the relevant places file_manager.make_kg2c_tarball(args.test) diff --git a/code/kg2c/file_manager.py b/code/kg2c/file_manager.py index a144487b3..c24e523e8 100644 --- a/code/kg2c/file_manager.py +++ b/code/kg2c/file_manager.py @@ -257,9 +257,6 @@ def upload_kg2c_files_to_arax_databases_server(kg2pre_version: str, sub_version: upload_file_to_arax_databases_server(local_file_path=f"{KG2C_DIR}/kg2c.sqlite{test_suffix}", remote_file_name=f"kg2c_{sub_version}_KG{kg2pre_version}.sqlite{test_suffix}", kg2pre_version=kg2pre_version) - upload_file_to_arax_databases_server(local_file_path=f"{KG2C_DIR}/meta_kg.json{test_suffix}", - remote_file_name=f"meta_kg_{sub_version}_KG{kg2pre_version}c.json{test_suffix}", - kg2pre_version=kg2pre_version) upload_file_to_arax_databases_server(local_file_path=f"{KG2C_DIR}/fda_approved_drugs.pickle{test_suffix}", remote_file_name=f"fda_approved_drugs_{sub_version}_KG{kg2pre_version}c.pickle{test_suffix}", kg2pre_version=kg2pre_version) diff --git a/code/kg2c/record_kg2c_meta_info.py b/code/kg2c/record_kg2c_meta_info.py index 68f6d602d..8954c0a82 100644 --- a/code/kg2c/record_kg2c_meta_info.py +++ b/code/kg2c/record_kg2c_meta_info.py @@ -22,87 +22,6 @@ KG2C_DIR = f"{os.path.dirname(os.path.abspath(__file__))}" -def serialize_with_sets(obj: any) -> any: - # Thank you https://stackoverflow.com/a/60544597 - if isinstance(obj, set): - return list(obj) - else: - return obj - -def get_meta_qualifier(qualified_predicate, qualified_object_direction, qualified_object_aspect): - meta_qualifier = [] - if (len(qualified_predicate) != 0): - meta_qualifier.append({"qualifier_type_id": "biolink:qualified_predicate", "applicable_values": qualified_predicate}) - if(len(qualified_object_direction) != 0): - meta_qualifier.append({"qualifier_type_id": "biolink:object_direction_qualifier", "applicable_values": qualified_object_direction}) - if(len(qualified_object_aspect) != 0): - meta_qualifier.append({"qualifier_type_id":"biolink:object_aspect_qualifier", "applicable_values": qualified_object_aspect}) - return meta_qualifier -def add_edge_to_applicable_values(qualifier_dict, key, value): #Adds the qualifier value to the corresponding applicable values of the list - if (value != ""): #given the key nad the value is not alreadt present. - if(key in qualifier_dict): - if(value not in qualifier_dict[key]): - qualifier_dict[key].append(value) - else: - qualifier_dict[key] = [value] - else: - if(key not in qualifier_dict): - qualifier_dict[key] = [] - - -def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, Dict[str, any]], - meta_kg_file_name: str, biolink_helper: BiolinkHelper, is_test: bool): - logging.info(f"Building meta KG..") - logging.info(" Gathering all meta triples..") - meta_triples = set() - qualified_predicate = {} - qualified_object_direction = {} - qualified_object_aspect = {} - for edge in edges_by_id.values(): - subject_node_id = edge["subject"] - object_node_id = edge["object"] - if not is_test or (subject_node_id in nodes_by_id and object_node_id in nodes_by_id): - subject_node = nodes_by_id[subject_node_id] - object_node = nodes_by_id[object_node_id] - subject_categories = biolink_helper.add_conflations(subject_node["all_categories"]) - object_categories = biolink_helper.add_conflations(object_node["all_categories"]) - predicate = edge["predicate"] - - for subject_category in subject_categories: - for object_category in object_categories: - add_edge_to_applicable_values(qualified_predicate, f"{subject_category}-{object_category}", edge["qualified_predicate"]) #Adding the qualified_predicate of the edge to the corresponding applicable values list for the object_category-subject_category pair - add_edge_to_applicable_values(qualified_object_direction, f"{subject_category}-{object_category}", edge["qualified_object_direction"]) #Adding the qualified_object_direction of the edge to the corresponding applicable values list for the object_category-subject_category pair - add_edge_to_applicable_values(qualified_object_aspect, f"{subject_category}-{object_category}", edge["qualified_object_aspect"]) #Adding the qualified_object_aspect of the edge to the corresponding applicable values list for the object_category-subject_category pair - meta_triples.add((subject_category, predicate, object_category)) - kg2_infores_curie = "infores:rtx-kg2" - - meta_edges = [{"subject": triple[0], - "predicate": triple[1], - "object": triple[2], - "qualifiers": get_meta_qualifier(qualified_predicate[f"{triple[0]}-{triple[2]}"], qualified_object_direction[f"{triple[0]}-{triple[2]}"], qualified_object_aspect[f"{triple[0]}-{triple[2]}"]) } if (qualified_predicate[f"{triple[0]}-{triple[2]}"] != []) else {"subject": triple[0], - "predicate": triple[1], - "object": triple[2]} - for triple in meta_triples] - logging.info(f" Created {len(meta_edges)} meta edges") - - logging.info(" Gathering all meta nodes..") - with open(f"{KG2C_DIR}/equivalent_curies.pickle", "rb") as equiv_curies_file: - equivalent_curies_dict = pickle.load(equiv_curies_file) - meta_nodes = defaultdict(lambda: defaultdict(lambda: set())) - for node_id, node in nodes_by_id.items(): - equivalent_curies = equivalent_curies_dict.get(node_id, [node_id]) - prefixes = {curie.split(":")[0] for curie in equivalent_curies} - categories = biolink_helper.add_conflations(node["category"]) - for category in categories: - meta_nodes[category]["id_prefixes"].update(prefixes) - logging.info(f" Created {len(meta_nodes)} meta nodes") - - logging.info(" Saving meta KG to JSON file..") - meta_kg = {"nodes": meta_nodes, "edges": meta_edges} - with open(f"{KG2C_DIR}/{meta_kg_file_name}", "w+") as meta_kg_file: - json.dump(meta_kg, meta_kg_file, default=serialize_with_sets, indent=2) - - def add_neighbor_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, Dict[str, any]], sqlite_file_name: str, label_property_name: str): logging.info("Counting up node neighbors by category..") @@ -191,7 +110,7 @@ def generate_fda_approved_drugs_pickle(edges_by_id: Dict[str, Dict[str, any]], f pickle.dump(fda_approved_drugs, pickle_file) -def record_meta_kg_info(biolink_version: str, is_test: bool): +def record_select_meta_info(biolink_version: str, is_test: bool): logging.info("Starting to record KG2c meta info..") bh = BiolinkHelper(biolink_version) start = time.time() @@ -209,15 +128,13 @@ def record_meta_kg_info(biolink_version: str, is_test: bool): for node in nodes_by_id.values(): node[expanded_labels_property_name] = bh.get_ancestors(node["all_categories"], include_mixins=True) - meta_kg_file_name = f"meta_kg.json{'_TEST' if is_test else ''}" sqlite_file_name = f"kg2c.sqlite{'_TEST' if is_test else ''}" fda_approved_file_name = f"fda_approved_drugs.pickle{'_TEST' if is_test else ''}" - build_meta_kg(nodes_by_id, edges_by_id, meta_kg_file_name, bh, is_test) add_neighbor_counts_to_sqlite(nodes_by_id, edges_by_id, sqlite_file_name, expanded_labels_property_name) add_category_counts_to_sqlite(nodes_by_id, sqlite_file_name, expanded_labels_property_name) generate_fda_approved_drugs_pickle(edges_by_id, fda_approved_file_name) - logging.info(f"Recording meta KG info took {round((time.time() - start) / 60, 1)} minutes.") + logging.info(f"Recording meta info took {round((time.time() - start) / 60, 1)} minutes.") def main(): @@ -230,7 +147,7 @@ def main(): help="The Biolink version that the given KG2pre version uses (e.g., 4.0.1).") arg_parser.add_argument("--test", dest="test", action='store_true', default=False) args = arg_parser.parse_args() - record_meta_kg_info(args.biolink_version, args.test) + record_select_meta_info(args.biolink_version, args.test) if __name__ == "__main__":