Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: move over all_ontology.json generator script + gha to repo from single-cell-curation #27

Merged
merged 4 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions .github/workflows/generate_all_ontology.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Regenerates all_ontology.json.gz and commits it back to the pushed branch
# whenever owl_info.yml changes on any branch other than main.
name: Updates to Ontology Files

on:
  push:
    paths:
      - '**/tools/ontology-builder/ontology-references/owl_info.yml'
    branches-ignore:
      - main

jobs:
  ontology-processing:
    runs-on: ubuntu-latest
    steps:
      # This workflow is triggered by `push`, where github.event.pull_request is
      # not populated; rely on the default ref (the pushed branch) instead of
      # passing an always-empty `ref`.
      - name: Checkout
        uses: actions/checkout@v3
      - name: ontology changes
        uses: dorny/paths-filter@v2
        id: filter
        with:
          filters: |
            owl_info:
              - 'tools/ontology-builder/ontology-references/owl_info.yml'
      - name: Set up Python 3.8
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is never read as a float (3.10 would become 3.1).
          python-version: '3.8'
      - name: Python cache
        # actions/cache v1 is a retired release; use a supported major version.
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: install requirements
        run: |
          pip install -r tools/ontology-builder/requirements.txt
      - name: setup git
        run: |
          git config user.name github-actions
          git config user.email github-actions@github.com
      - name: owl-processing
        if: ${{ steps.filter.outputs.owl_info == 'true' }}
        run: |
          python3 ./tools/ontology-builder/all_ontology_generator.py
          git add ./tools/ontology-builder/ontology-references/all_ontology.json.gz
      - name: Commit
        if: ${{ steps.filter.outputs.owl_info == 'true' }}
        run: |
          # `git commit` exits non-zero when nothing is staged; skip instead of failing the job.
          if git diff --cached --quiet; then
            echo "all_ontology.json.gz is unchanged, nothing to commit"
          else
            git commit -m "AUTO: update ontologies"
            git push
          fi
214 changes: 214 additions & 0 deletions tools/ontology-builder/all_ontology_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
import env
import gzip
import json
import os
import re
import urllib.request
from threading import Thread
from typing import List
from urllib.error import HTTPError, URLError

import owlready2
import yaml


def _download_owls(owl_info_yml: str = env.OWL_INFO_YAML, output_dir: str = env.ONTOLOGY_DIR):
    """
    Downloads the ontology owl files specified in 'owl_info_yml' into 'output_dir'

    For every ontology the URL pinned under its "latest" version is probed first, then fetched in its own
    thread. URLs ending in ".gz" are decompressed after download and the compressed copy is removed.

    :param str owl_info_yml: path to yaml file with OWL information
    :param str output_dir: path to writable directory where owl files will be downloaded to

    :raises Exception: if a pinned URL cannot be reached, or if any of the downloads fails

    :rtype None
    """

    with open(owl_info_yml, "r") as owl_info_handle:
        # An empty yaml file loads as None; treat it as "nothing to download"
        owl_info = yaml.safe_load(owl_info_handle) or {}

    # Exceptions raised inside a thread never reach the caller, so each worker records its own
    # failure here and they are re-raised once every thread has finished.
    failures = {}

    def download(_ontology, _url):
        print(f"Start Downloading {_ontology}")
        output_file = os.path.join(output_dir, _ontology + ".owl")
        try:
            # Format of owl (handles cases where they are compressed)
            if _url.split(".")[-1] == "gz":
                compressed_file = output_file + ".gz"
                try:
                    urllib.request.urlretrieve(_url, compressed_file)
                    _decompress(compressed_file, output_file)
                finally:
                    # Never leave the temporary archive behind, even if decompression failed
                    if os.path.exists(compressed_file):
                        os.remove(compressed_file)
            else:
                urllib.request.urlretrieve(_url, output_file)
        except Exception as e:
            failures[_ontology] = e
            return
        print(f"Finish Downloading {_ontology}")

    threads = []
    for ontology, info in owl_info.items():
        url = info["urls"][info["latest"]]
        try:
            # Probe only: close the response right away instead of leaking the connection
            with urllib.request.urlopen(url):
                pass
        except HTTPError as e:
            raise Exception(f"{ontology} with pinned URL {url} returns status code {e.code}") from e
        except URLError as e:
            raise Exception(f"{ontology} with pinned URL {url} fails due to {e.reason}") from e

        t = Thread(target=download, args=(ontology, url))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    if failures:
        details = "; ".join(f"{name}: {error!r}" for name, error in sorted(failures.items()))
        raise Exception(f"Failed to download ontologies: {details}") from next(iter(failures.values()))


def _decompress(infile: str, tofile: str):
    """
    Decompresses a gzipped file

    The data is streamed in chunks and copied as raw bytes, so large files never have to fit in memory
    and the output is exactly the decompressed content of the input.

    :param str infile: path to the gzipped input file
    :param str tofile: path where the decompressed output file is written

    :rtype None
    """
    import shutil

    with gzip.open(infile, "rb") as compressed, open(tofile, "wb") as decompressed:
        shutil.copyfileobj(compressed, decompressed)


def _parse_owls(
    working_dir: str = env.ONTOLOGY_DIR,
    owl_info_yml: str = env.OWL_INFO_YAML,
    output_json_file: str = env.PARSED_ONTOLOGIES_FILE,
):
    """
    Parses all owl files in working_dir. Extracts information from all classes in each owl file.
    The extracted information is written into a gzipped json file with the following structure:
    {
        "ontology_name": {
            "term_id": {
                "label": "...",
                "deprecated": true,
                "comments": ["..."],
                "term_tracker": "...",
                "replaced_by": "...",
                "consider": ["..."],
                "ancestors": [
                    "ancestor1_term_id_1",
                    "ancestor2_term_id_2"
                ]
            },
            "term_id2": {
                ...
            },
            ...
        }
    }

    "comments", "term_tracker", "replaced_by" and "consider" are only written for deprecated terms, and only
    when the owl file provides them ("consider" only when there is no "replaced_by").

    Per-ontology options read from the yaml file:
      - "only": keep just the listed term ids
      - "children_of": keep just the terms having at least one of the listed term ids as an ancestor

    :param str working_dir: path to folder with owl files
    :param str owl_info_yml: path to yaml file with owl information
    :param str output_json_file: path to output json file

    :rtype None
    """

    with open(owl_info_yml, "r") as owl_info_handle:
        # An empty yaml file loads as None; treat it as "no per-ontology options"
        owl_info = yaml.safe_load(owl_info_handle) or {}

    # os.listdir returns entries in arbitrary order; sort so the output is laid out the same on every run
    owl_files = sorted(
        os.path.join(working_dir, file_name) for file_name in os.listdir(working_dir) if file_name.endswith(".owl")
    )

    # Parse owl files
    onto_dict = {}
    for owl_file in owl_files:
        world = owlready2.World()
        onto = world.get_ontology(owl_file)
        onto.load()

        print(f"Processing {onto.name}")

        # Resolve the options of this ontology once instead of once per term
        options = owl_info.get(onto.name) or {}
        restrict_terms = "only" in options
        allowed_terms = set(options["only"]) if restrict_terms else set()
        restrict_ancestors = "children_of" in options
        required_ancestors = set(options["children_of"]) if restrict_ancestors else set()

        terms = {}
        for onto_class in onto.classes():
            term_id = onto_class.name.replace("_", ":")

            # Skip terms that are not direct children from this ontology
            if onto.name != term_id.split(":")[0]:
                continue

            # If there are specified target terms then only work with them
            if restrict_terms and term_id not in allowed_terms:
                continue

            ancestors = _get_ancestors(onto_class, onto.name)

            # If "children_of" is specified then skip the current term when it does not
            # descend from any of the indicated terms
            if restrict_ancestors and required_ancestors.isdisjoint(ancestors):
                continue

            entry = _describe_term(onto_class)
            # only add the ancestors if it's not NCBITaxon, as this saves a lot of disk space
            entry["ancestors"] = [] if onto.name == "NCBITaxon" else ancestors
            terms[term_id] = entry

        onto_dict[onto.name] = terms

    with gzip.open(output_json_file, "wt") as output_json:
        json.dump(onto_dict, output_json, indent=2)


def _describe_term(onto_class: owlready2.entity.ThingClass) -> dict:
    """
    Builds the label and deprecation fields of the json entry of a single ontology class

    :param owlready2.entity.ThingClass onto_class: the class to describe

    :rtype dict
    :return entry with "label" and "deprecated", plus replacement hints when the class is deprecated
    """

    entry = {"label": onto_class.label[0] if onto_class.label else "", "deprecated": False}

    if not (onto_class.deprecated and onto_class.deprecated.first()):
        return entry

    # if deprecated, include information to determine replacement term(s)
    entry["deprecated"] = True
    if onto_class.comment:
        entry["comments"] = [str(c) for c in onto_class.comment]

    # The properties below are read with getattr so that an ontology which does not
    # declare one of them is treated as "no value" instead of raising.
    # stores term tracking URL, such as a github issue discussing deprecated term
    term_tracker = getattr(onto_class, "IAO_0000233", None)
    if term_tracker:
        entry["term_tracker"] = str(term_tracker[0])

    # only need to record replaced_by OR considers
    replaced_by = getattr(onto_class, "IAO_0100001", None)
    if replaced_by and replaced_by.first():
        # url --> term
        ontology_term = re.findall(r"[^\W_]+", str(replaced_by[0]))
        entry["replaced_by"] = f"{ontology_term[-2]}:{ontology_term[-1]}"
    else:
        consider = getattr(onto_class, "consider", None)
        if consider:
            entry["consider"] = [str(c) for c in consider]

    return entry


def _get_ancestors(onto_class: owlready2.entity.ThingClass, ontololgy_name: str) -> List[str]:
    """
    Returns a sorted list of ancestors ids of the given onto class, only returns those belonging to
    ontololgy_name, it will format the id from the form CL_xxxx to CL:xxxx

    The class itself is never part of the result. The ids are sorted so that the result does not depend
    on the iteration order of onto_class.ancestors().

    :param owlready2.entity.ThingClass onto_class: the class for which ancestors will be retrieved
    :param str ontololgy_name: only ancestors from this ontology will be kept

    :rtype List[str]
    :return sorted list of ancestors (term ids), it could be empty
    """

    return sorted(
        ancestor.name.replace("_", ":")
        for ancestor in onto_class.ancestors()
        if ancestor.name != onto_class.name and ancestor.name.split("_")[0] == ontololgy_name
    )


# Script entry point: fetch the pinned owl files first, then build the parsed ontology file from them
if __name__ == "__main__":
    for _step in (_download_owls, _parse_owls):
        _step()
6 changes: 6 additions & 0 deletions tools/ontology-builder/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import os

# Absolute directory that contains this module (symlinks resolved)
PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
# Directory holding owl_info.yml and all_ontology.json.gz
ONTOLOGY_DIR = os.path.join(PACKAGE_ROOT, "ontology-references")
# Yaml file with the owl information, and the gzipped json file holding the parsed ontologies
OWL_INFO_YAML, PARSED_ONTOLOGIES_FILE = (
    os.path.join(ONTOLOGY_DIR, file_name) for file_name in ("owl_info.yml", "all_ontology.json.gz")
)
Binary file not shown.
38 changes: 38 additions & 0 deletions tools/ontology-builder/ontology-references/owl_info.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
CL:
latest: 2024-01-04
urls:
2024-01-04: https://github.com/obophenotype/cell-ontology/releases/download/v2024-01-04/cl.owl
EFO:
latest: 2024-01-15 EFO 3.62.0
urls:
2024-01-15 EFO 3.62.0: https://github.com/EBISPOT/efo/releases/download/v3.62.0/efo.owl
HANCESTRO:
latest: 3.0
urls:
3.0: https://github.com/EBISPOT/hancestro/raw/3.0/hancestro-base.owl
HsapDv:
latest: 2020-03-10
urls:
2020-03-10: http://aber-owl.net/media/ontologies/HSAPDV/11/hsapdv.owl
MONDO:
latest: 2024-01-03
urls:
2024-01-03: https://github.com/monarch-initiative/mondo/releases/download/v2024-01-03/mondo.owl
MmusDv:
latest: 2020-03-10
urls:
2020-03-10: http://aber-owl.net/media/ontologies/MMUSDV/9/mmusdv.owl
NCBITaxon:
latest: 2023-06-20
urls:
2023-06-20: https://github.com/obophenotype/ncbitaxon/releases/download/v2023-06-20/ncbitaxon.owl.gz
children_of:
- NCBITaxon:33208
UBERON:
latest: 2024-01-18
urls:
2024-01-18: https://github.com/obophenotype/uberon/releases/download/v2024-01-18/uberon.owl
PATO:
latest: 2023-05-18
urls:
2023-05-18: https://github.com/pato-ontology/pato/raw/v2023-05-18/pato.owl
2 changes: 2 additions & 0 deletions tools/ontology-builder/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
owlready2==0.38
PyYaml==6.0