diff --git a/src/reports/index_wide_compendia_tests.py b/src/reports/index_wide_compendia_tests.py
new file mode 100644
index 00000000..b360193a
--- /dev/null
+++ b/src/reports/index_wide_compendia_tests.py
@@ -0,0 +1,72 @@
+"""
+There are some tests we would like to run that apply to the entire set of Babel compendia.
+
+To do this, our current strategy is to go through all of the Babel compendia and
+add the relevant information to a SQLite database. We can then query this
+database to look for duplication.
+"""
+import json
+import logging
+import sqlite3
+from pathlib import Path
+
+
+def report_on_index_wide_compendia_tests(compendia_files, sqlite_file, report_file):
+    # Touch the output files so Snakemake knows we're working.
+    Path(sqlite_file).touch()
+    Path(report_file).touch()
+
+    # Connect to the SQLite database that we will use to keep track of duplicates.
+    conn = sqlite3.connect(sqlite_file)
+    c = conn.cursor()
+
+    # Create a compendia table if it doesn't exist. Neither column can be a
+    # primary key: a preferred_curie recurs once per identifier in its clique,
+    # and a curie that recurs across cliques is exactly the duplication we
+    # want to detect.
+    c.execute('''CREATE TABLE IF NOT EXISTS compendia (
+        preferred_curie TEXT NOT NULL,
+        curie TEXT NOT NULL
+    )''')
+
+    # Go through all the compendia files in the order provided.
+    for compendia_file_index, compendia_file in enumerate(compendia_files):
+        # Go through every entry in each compendia_file.
+        logging.info(f"Reading {compendia_file} ({compendia_file_index + 1}/{len(compendia_files)})")
+
+        count_curies = 0
+        with open(compendia_file, 'r') as compendiafile:
+            for line in compendiafile:
+                entry = json.loads(line)
+                identifiers = entry['identifiers']
+
+                if len(identifiers) > 0:
+                    preferred_curie = identifiers[0]['i']
+                    for identifier in identifiers:
+                        curie = identifier['i']
+                        count_curies += 1
+                        c.execute("INSERT INTO compendia (preferred_curie, curie) VALUES (?, ?)", (preferred_curie, curie))
+
+        logging.info(f"Read {count_curies} CURIEs from {compendia_file} into SQLite database {sqlite_file}.")
+        conn.commit()
+
+    # Query the table to check that the data was inserted correctly.
+    c.execute("SELECT COUNT(*) FROM compendia")
+    record_count = c.fetchone()[0]
+    logging.info(f"SQLite database contains {record_count} records.")
+
+    # Write the report file.
+    with open(report_file, 'w') as reportfile:
+        c.execute("SELECT COUNT(curie) FROM compendia")
+        curie_count = c.fetchone()[0]
+
+        # Look for CURIEs mapped to multiple preferred_curies.
+        c.execute("SELECT curie, COUNT(DISTINCT preferred_curie), GROUP_CONCAT(DISTINCT preferred_curie) FROM compendia GROUP BY curie HAVING COUNT(DISTINCT preferred_curie) > 1 ORDER BY COUNT(DISTINCT preferred_curie) DESC;")
+        results = c.fetchall()
+        duplicates = [{'curie': row[0], 'count': row[1], 'preferred_curies': row[2].split(',')} for row in results]
+
+        json.dump({
+            'curie_count': curie_count,
+            'duplicates': duplicates
+        }, reportfile)
+
+    # Close the database connection.
+    conn.close()
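A quick way to convince yourself that the GROUP BY query above finds cross-clique duplicates: the following standalone sketch (not part of this diff; the toy CURIEs are invented) runs the same query against an in-memory SQLite database.

```python
import sqlite3

# Toy compendia table: CHEBI:100 is claimed by two different cliques.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE compendia (preferred_curie TEXT NOT NULL, curie TEXT NOT NULL)")
conn.executemany("INSERT INTO compendia VALUES (?, ?)", [
    ("MESH:D001", "MESH:D001"),
    ("MESH:D001", "CHEBI:100"),
    ("UNII:ABC123", "UNII:ABC123"),
    ("UNII:ABC123", "CHEBI:100"),  # same CURIE, second clique -> duplicate
])

# The same duplicate-detection query used by the report.
query = ("SELECT curie, COUNT(DISTINCT preferred_curie), GROUP_CONCAT(DISTINCT preferred_curie) "
         "FROM compendia GROUP BY curie HAVING COUNT(DISTINCT preferred_curie) > 1")
for curie, count, preferred_curies in conn.execute(query):
    print(curie, count, preferred_curies.split(","))
# -> CHEBI:100 2 ['MESH:D001', 'UNII:ABC123'] (concatenation order may vary)
```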
+""" +import json +import logging +import sqlite3 +from pathlib import Path + + +def report_on_index_wide_synonym_tests(synonym_files, sqlite_file, report_file): + # Start writing to the report file so Snakemake knows we're working. + Path(report_file).touch() + Path(sqlite_file).touch() + + # Open the SQLite file that we will use to keep track of duplicates. + # Connect to the SQLite database + conn = sqlite3.connect(sqlite_file + '.db') + c = conn.cursor() + + # Create a compendia table if it doesn't exist + c.execute('''CREATE TABLE IF NOT EXISTS synonyms ( + curie TEXT NOT NULL PRIMARY KEY UNIQUE, + biolink_type TEXT, + preferred_name TEXT, + preferred_name_lc TEXT + )''') + + # Go through all the compendia files in the order provided. + for synonyms_file_index, synonyms_file in enumerate(synonym_files): + # Go through every entry in each synonyms_file + logging.info(f"Reading synonyms file {synonyms_file} ({synonyms_file_index + 1}/{len(synonym_files)})") + + count_entries = 0 + with open(synonyms_file, 'r') as synonymsfile: + for line in synonymsfile: + entry = json.loads(line) + count_entries += 1 + + curie = entry['curie'] + if len(entry['types']) > 0: + biolink_type = 'biolink:' + entry['types'][0] + preferred_name = entry['preferred_name'] + preferred_name_lc = preferred_name.lower() + + # This should give us an error if we see the same CURIE in multiple files. + c.execute("INSERT INTO synonyms (curie, biolink_type, preferred_name, preferred_name_lc) VALUES (?, ?, ?, ?)", + (curie, biolink_type, preferred_name, preferred_name_lc)) + + logging.info(f"Read {count_entries} entries from {synonyms_file}.") + conn.commit() + + # Count the number of curie values in the synonyms table in SQLite. + c.execute("SELECT COUNT(curie) FROM synonyms") + curie_count = c.fetchone() + + logging.info(f"{curie_count} CURIEs loaded into {sqlite_file}") + + with open(report_file, 'w') as reportfile: + c.execute("SELECT COUNT(curie) FROM synonyms") + curie_count = c.fetchone() + + # Look for identical preferred_name_lc values. + c.execute("SELECT preferred_name_lc, COUNT(preferred_name_lc), GROUP_CONCAT(DISTINCT curie) FROM synonyms GROUP BY preferred_name_lc HAVING COUNT(preferred_name_lc) > 1 ORDER BY COUNT(preferred_name_lc) DESC;") + results = c.fetchall() + duplicates = [{'preferred_name_lc': duplicate[0], 'count': duplicate[1], 'curies': duplicate[2].split(',')} for duplicate in results] + + json.dump({ + 'curie_count': curie_count, + 'duplicates': duplicates + }, reportfile) + + # Close the database connection + conn.close() diff --git a/src/snakefiles/reports.snakefile b/src/snakefiles/reports.snakefile index 13d58438..4dc1c078 100644 --- a/src/snakefiles/reports.snakefile +++ b/src/snakefiles/reports.snakefile @@ -2,6 +2,8 @@ import os from src.reports.compendia_per_file_reports import assert_files_in_directory, \ generate_content_report_for_compendium, summarize_content_report_for_compendia +from src.reports.index_wide_synonym_tests import report_on_index_wide_synonym_tests +from src.reports.index_wide_compendia_tests import report_on_index_wide_compendia_tests # Some paths we will use at multiple times in these reports. 
diff --git a/src/snakefiles/reports.snakefile b/src/snakefiles/reports.snakefile
index 13d58438..4dc1c078 100644
--- a/src/snakefiles/reports.snakefile
+++ b/src/snakefiles/reports.snakefile
@@ -2,6 +2,8 @@ import os
 
 from src.reports.compendia_per_file_reports import assert_files_in_directory, \
     generate_content_report_for_compendium, summarize_content_report_for_compendia
+from src.reports.index_wide_synonym_tests import report_on_index_wide_synonym_tests
+from src.reports.index_wide_compendia_tests import report_on_index_wide_compendia_tests
 
 # Some paths we will use at multiple times in these reports.
 compendia_path = config['output_directory'] + '/compendia'
@@ -90,6 +92,23 @@ rule generate_summary_content_report_for_compendia:
     run:
         summarize_content_report_for_compendia(input.expected_content_reports, output.report_path)
 
+rule test_compendia_for_duplication:
+    input:
+        compendia_files = expand("{compendia_path}/{compendium_file}", compendia_path=compendia_path, compendium_file=compendia_files),
+    output:
+        sqlite_file = config['output_directory']+'/reports/duplication/compendia.sqlite3',
+        report_path = config['output_directory']+'/reports/duplication/compendia_duplication_report.json',
+    run:
+        report_on_index_wide_compendia_tests(input.compendia_files, output.sqlite_file, output.report_path)
+
+rule test_synonyms_for_duplication:
+    input:
+        synonyms_files = expand("{synonyms_path}/{synonym_file}", synonyms_path=synonyms_path, synonym_file=synonyms_files),
+    output:
+        sqlite_file = config['output_directory']+'/reports/duplication/synonyms.sqlite3',
+        report_path = config['output_directory']+'/reports/duplication/synonym_duplication_report.json',
+    run:
+        report_on_index_wide_synonym_tests(input.synonyms_files, output.sqlite_file, output.report_path)
 
 # Check that all the reports were built correctly.
 rule all_reports:
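The two rules write their outputs under reports/duplication/, so a downstream rule (or a manual check) can read the JSON reports directly. As a sketch, the synonyms test can also be exercised outside Snakemake; the input and output paths below are invented for illustration:

```python
import json
from src.reports.index_wide_synonym_tests import report_on_index_wide_synonym_tests

# Hypothetical paths for a one-off manual run.
report_on_index_wide_synonym_tests(
    ["babel_outputs/synonyms/Anatomy.txt"],
    "/tmp/synonyms.sqlite3",
    "/tmp/synonym_duplication_report.json",
)

with open("/tmp/synonym_duplication_report.json") as reportfile:
    report = json.load(reportfile)
print(report["curie_count"], "CURIEs,", len(report["duplicates"]), "duplicated preferred names")
```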