Add index wide tests #225

Closed
wants to merge 11 commits into from
72 changes: 72 additions & 0 deletions src/reports/index_wide_compendia_tests.py
@@ -0,0 +1,72 @@
"""
There are some tests we would like to do that apply to the entire Babel compendia.

To do this, our current strategy is to go through the entire Babel compendia and
add the relevant information into a SQLite database. We can then check with this
database to look for relevant duplication.
"""
import json
import logging
import sqlite3
from pathlib import Path


def report_on_index_wide_compendia_tests(compendia_files, sqlite_file, report_file):
    # Start writing to the report file so Snakemake knows we're working.
    Path(report_file).touch()

    # Connect to the SQLite database that we will use to keep track of duplicates.
    # Note that we connect to sqlite_file directly: appending a suffix here would
    # write the database to a path that Snakemake doesn't know about.
    conn = sqlite3.connect(sqlite_file)
    c = conn.cursor()

    # Create a compendia table if it doesn't exist. Neither column can be a
    # primary key: a preferred_curie appears once per identifier in its clique,
    # and a curie may appear under multiple preferred CURIEs, which is exactly
    # the duplication this report looks for.
    c.execute('''CREATE TABLE IF NOT EXISTS compendia (
        preferred_curie TEXT NOT NULL,
        curie TEXT NOT NULL
    )''')

    # Go through all the compendia files in the order provided.
    for compendia_file_index, compendia_file in enumerate(compendia_files):
        logging.info(f"Reading {compendia_file} ({compendia_file_index + 1}/{len(compendia_files)})")

        # Go through every entry in this compendia file: one JSON object per line.
        count_curies = 0
        with open(compendia_file, 'r') as compendiafile:
            for line in compendiafile:
                entry = json.loads(line)
                identifiers = entry['identifiers']

                if len(identifiers) > 0:
                    # The first identifier is the preferred CURIE for the clique.
                    preferred_curie = identifiers[0]['i']
                    for identifier in identifiers:
                        curie = identifier['i']
                        count_curies += 1
                        c.execute("INSERT INTO compendia (preferred_curie, curie) VALUES (?, ?)",
                                  (preferred_curie, curie))

        logging.info(f"Read {count_curies} CURIEs from {compendia_file} into SQLite database {sqlite_file}.")

    # Commit the inserts, then query the table to check that the data was inserted correctly.
    conn.commit()
    c.execute("SELECT COUNT(*) FROM compendia")
    record_count = c.fetchone()[0]

    logging.info(f"SQLite database contains {record_count} records.")

    # Start writing the report file.
    with open(report_file, 'w') as reportfile:
        c.execute("SELECT COUNT(curie) FROM compendia")
        curie_count = c.fetchone()[0]

        # Look for CURIEs mapped to multiple preferred CURIEs.
        c.execute("SELECT curie, COUNT(DISTINCT preferred_curie), GROUP_CONCAT(DISTINCT preferred_curie) "
                  "FROM compendia GROUP BY curie HAVING COUNT(DISTINCT preferred_curie) > 1 "
                  "ORDER BY COUNT(DISTINCT preferred_curie) DESC")
        results = c.fetchall()
        duplicates = [{'curie': curie, 'count': count, 'preferred_curies': preferred_curies.split(',')}
                      for curie, count, preferred_curies in results]

        json.dump({
            'curie_count': curie_count,
            'duplicates': duplicates
        }, reportfile)
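    # Worked example (illustrative): if the table held the rows
    # (MONDO:0001, DOID:1) and (MONDO:0002, DOID:1), the query above would
    # report {'curie': 'DOID:1', 'count': 2,
    #         'preferred_curies': ['MONDO:0001', 'MONDO:0002']},
    # i.e. one CURIE that has been merged into two different cliques.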

    # Close the database connection.
    conn.close()
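For reference, with the fetchone() fixes above, the report this writes is a single JSON object shaped roughly like this (counts and CURIEs are illustrative, not taken from a real run):

    {"curie_count": 4521337,
     "duplicates": [{"curie": "DOID:9352",
                     "count": 2,
                     "preferred_curies": ["MONDO:0005148", "MONDO:0011027"]}]}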
77 changes: 77 additions & 0 deletions src/reports/index_wide_synonym_tests.py
@@ -0,0 +1,77 @@
"""
There are some tests we would like to do that apply to the entire Babel synonyms.

To do this, our current strategy is to go through the entire Babel synonyms and
add the relevant information into a SQLite database. We can then check with this
database to look for relevant duplication.
"""
import json
import logging
import sqlite3
from pathlib import Path


def report_on_index_wide_synonym_tests(synonym_files, sqlite_file, report_file):
    # Start writing to the report file so Snakemake knows we're working.
    Path(report_file).touch()

    # Connect to the SQLite database that we will use to keep track of duplicates.
    # Note that we connect to sqlite_file directly: appending a suffix here would
    # write the database to a path that Snakemake doesn't know about.
    conn = sqlite3.connect(sqlite_file)
    c = conn.cursor()

    # Create a synonyms table if it doesn't exist. Making curie the primary key
    # means that inserting the same CURIE twice raises an sqlite3.IntegrityError.
    c.execute('''CREATE TABLE IF NOT EXISTS synonyms (
        curie TEXT NOT NULL PRIMARY KEY,
        biolink_type TEXT,
        preferred_name TEXT,
        preferred_name_lc TEXT
    )''')

    # Go through all the synonym files in the order provided.
    for synonyms_file_index, synonyms_file in enumerate(synonym_files):
        logging.info(f"Reading synonyms file {synonyms_file} ({synonyms_file_index + 1}/{len(synonym_files)})")

        # Go through every entry in this synonyms file: one JSON object per line.
        count_entries = 0
        with open(synonyms_file, 'r') as synonymsfile:
            for line in synonymsfile:
                entry = json.loads(line)
                count_entries += 1

                curie = entry['curie']
                # Default to None so that an entry without types neither fails on
                # the first line nor inherits the type of the previous entry.
                biolink_type = None
                if len(entry['types']) > 0:
                    biolink_type = 'biolink:' + entry['types'][0]
                preferred_name = entry['preferred_name']
                preferred_name_lc = preferred_name.lower()

                # The primary key on curie gives us an error if we see the same
                # CURIE in multiple files.
                c.execute("INSERT INTO synonyms (curie, biolink_type, preferred_name, preferred_name_lc) VALUES (?, ?, ?, ?)",
                          (curie, biolink_type, preferred_name, preferred_name_lc))

        logging.info(f"Read {count_entries} entries from {synonyms_file}.")
        conn.commit()

    # Count the number of curie values in the synonyms table in SQLite.
    c.execute("SELECT COUNT(curie) FROM synonyms")
    curie_count = c.fetchone()[0]

    logging.info(f"{curie_count} CURIEs loaded into {sqlite_file}")

    # Start writing the report file. (We already have curie_count; there is no
    # need to query it a second time.)
    with open(report_file, 'w') as reportfile:

        # Look for distinct CURIEs sharing an identical lower-cased preferred name.
        c.execute("SELECT preferred_name_lc, COUNT(preferred_name_lc), GROUP_CONCAT(DISTINCT curie) "
                  "FROM synonyms GROUP BY preferred_name_lc HAVING COUNT(preferred_name_lc) > 1 "
                  "ORDER BY COUNT(preferred_name_lc) DESC")
        results = c.fetchall()
        duplicates = [{'preferred_name_lc': name_lc, 'count': count, 'curies': curies.split(',')}
                      for name_lc, count, curies in results]

        json.dump({
            'curie_count': curie_count,
            'duplicates': duplicates
        }, reportfile)
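    # Worked example (illustrative): rows with preferred_name 'Aspirin'
    # (CHEBI:15365) and 'aspirin' (DRUGBANK:DB00945) share the lower-cased name
    # 'aspirin' and would be reported as
    # {'preferred_name_lc': 'aspirin', 'count': 2,
    #  'curies': ['CHEBI:15365', 'DRUGBANK:DB00945']}.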

    # Close the database connection.
    conn.close()
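As a quick sanity check outside the pipeline, either function can also be driven by hand. A minimal sketch, with illustrative paths (these are not pipeline defaults):

    import logging
    from src.reports.index_wide_synonym_tests import report_on_index_wide_synonym_tests

    logging.basicConfig(level=logging.INFO)
    report_on_index_wide_synonym_tests(
        synonym_files=['babel_outputs/synonyms/Disease.txt'],  # illustrative input path
        sqlite_file='/tmp/synonyms.sqlite3',
        report_file='/tmp/synonym_duplication_report.json',
    )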
19 changes: 19 additions & 0 deletions src/snakefiles/reports.snakefile
@@ -2,6 +2,8 @@ import os

from src.reports.compendia_per_file_reports import assert_files_in_directory, \
    generate_content_report_for_compendium, summarize_content_report_for_compendia
from src.reports.index_wide_synonym_tests import report_on_index_wide_synonym_tests
from src.reports.index_wide_compendia_tests import report_on_index_wide_compendia_tests

# Some paths we will use at multiple times in these reports.
compendia_path = config['output_directory'] + '/compendia'
@@ -90,6 +92,23 @@ rule generate_summary_content_report_for_compendia:
    run:
        summarize_content_report_for_compendia(input.expected_content_reports, output.report_path)

rule test_compendia_for_duplication:
    input:
        compendia_files = expand("{compendia_path}/{compendium_file}", compendia_path=compendia_path, compendium_file=compendia_files),
    output:
        # These outputs must be distinct from test_synonyms_for_duplication's;
        # the copy-pasted synonyms paths would make the two rules collide.
        sqlite_file = config['output_directory']+'/reports/duplication/compendia.sqlite3',
        report_path = config['output_directory']+'/reports/duplication/compendia_duplication_report.json',
    run:
        report_on_index_wide_compendia_tests(input.compendia_files, output.sqlite_file, output.report_path)

rule test_synonyms_for_duplication:
    input:
        synonyms_files = expand("{synonyms_path}/{synonym_file}", synonyms_path=synonyms_path, synonym_file=synonyms_files),
    output:
        sqlite_file = config['output_directory']+'/reports/duplication/synonyms.sqlite3',
        report_path = config['output_directory']+'/reports/duplication/synonym_duplication_report.json',
    run:
        report_on_index_wide_synonym_tests(input.synonyms_files, output.sqlite_file, output.report_path)

# Check that all the reports were built correctly.
rule all_reports:
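With the output paths disambiguated as above, both new rules can be requested by name in the usual Snakemake way, e.g. (illustrative invocation; adjust cores and config to your setup):

    snakemake --cores 1 test_compendia_for_duplication test_synonyms_for_duplication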