diff --git a/.github/workflows/generate_all_ontology.yml b/.github/workflows/generate_all_ontology.yml index a8e31ba7..5808fa8d 100644 --- a/.github/workflows/generate_all_ontology.yml +++ b/.github/workflows/generate_all_ontology.yml @@ -3,8 +3,10 @@ name: Updates to Ontology Files on: push: paths: - - "**/api/python/src/cellxgene_ontology_guide/artifacts/ontology_info.json" + - "**/artifact-schemas/ontology_info_schema.json" - "**/artifact-schemas/all_ontology_schema.json" + - "**/ontology-assets/ontology_info.json" + - "**/tools/ontology-builder/src/all_ontology_generator.py" branches-ignore: - main @@ -37,7 +39,7 @@ jobs: - name: ontology-processing run: | python3 ./tools/ontology-builder/src/all_ontology_generator.py - git add ./api/python/src/cellxgene_ontology_guide/artifacts/all_ontology.json.gz + git add ./ontology-assets/*.json.gz - name: Commit run: | git commit -m "AUTO: update ontologies" diff --git a/.github/workflows/validate_json_schemas.yml b/.github/workflows/validate_json_schemas.yml index 081a20bc..aa23cf66 100644 --- a/.github/workflows/validate_json_schemas.yml +++ b/.github/workflows/validate_json_schemas.yml @@ -30,6 +30,6 @@ jobs: - name: install requirements run: | pip install -r tools/ontology-builder/requirements.txt - - name: validate curated lists + - name: validate json schemas run: | - python3 ./tools/ontology-builder/src/validate_curated_lists.py + python3 ./tools/ontology-builder/src/validate_json_schemas.py diff --git a/api/python/pyproject.toml b/api/python/pyproject.toml index 9165ee84..28f2d5dd 100644 --- a/api/python/pyproject.toml +++ b/api/python/pyproject.toml @@ -12,7 +12,7 @@ authors = [ license = { file = "LICENSE" } readme = "README.md" requires-python = "~= 3.11" -dependencies = [] +dependencies = ["semantic_version==2.8.5"] [project.optional-dependencies] test = ["pytest"] diff --git a/api/python/src/cellxgene_ontology_guide/artifact_download.py b/api/python/src/cellxgene_ontology_guide/artifact_download.py deleted file mode 100644 index e2777fc0..00000000 --- a/api/python/src/cellxgene_ontology_guide/artifact_download.py +++ /dev/null @@ -1,44 +0,0 @@ -import gzip -import json -from io import BytesIO -from typing import Any -from urllib.error import HTTPError, URLError -from urllib.request import urlopen - -from constants import ONTOLOGY_ASSET_RELEASE_URL, SCHEMA_VERSION_TO_ONTOLOGY_ASSET_TAG - - -def load_artifact_by_schema(schema_version: str, filename: str) -> Any: - """ - Load ontology files from GitHub Release Assets, based on the provided schema version. - Returns ValueError if the schema version is not supported in this package version or filename is not found for - given schema_version. - - :param schema_version: str version of the schema to load ontology assets for - :param filename: str name of the asset to load - :return: Nested dict representation of the content of the asset - """ - try: - ontology_asset_tag = SCHEMA_VERSION_TO_ONTOLOGY_ASSET_TAG[schema_version] - except KeyError as e: - raise ValueError(f"Schema version {schema_version} is not supported in this package version.") from e - - download_url = f"{ONTOLOGY_ASSET_RELEASE_URL}/{ontology_asset_tag}/{filename}" - - try: - with urlopen(download_url) as response: - if response.status == 200: - content: bytes = response.read() - if filename.endswith("json.gz"): - with gzip.open(BytesIO(content), "rt") as f: - return json.load(f) - else: - return json.loads(content) - else: - raise ValueError(f"Server responded with status code: {response.status}") - except HTTPError as e: - raise ValueError( - f"Could not get {filename} for schema version {schema_version} in GitHub Release Assets: {e}" - ) from e - except URLError as e: - raise ValueError(f"URL error occurred: {e.reason}") from e diff --git a/api/python/src/cellxgene_ontology_guide/constants.py b/api/python/src/cellxgene_ontology_guide/constants.py index 58b2c775..483f2b1c 100644 --- a/api/python/src/cellxgene_ontology_guide/constants.py +++ b/api/python/src/cellxgene_ontology_guide/constants.py @@ -2,7 +2,5 @@ PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__)) DATA_ROOT = os.path.join(PACKAGE_ROOT, "data") -ALL_ONTOLOGY_FILENAME = "all_ontology.json.gz" +ONTOLOGY_FILENAME_SUFFIX = ".json.gz" ONTOLOGY_INFO_FILENAME = "ontology_info.json" -ONTOLOGY_ASSET_RELEASE_URL = "https://github.com/chanzuckerberg/cellxgene-ontology-guide/releases/download" -SCHEMA_VERSION_TO_ONTOLOGY_ASSET_TAG = {"5.0.0": "ontology-assets-v0.0.1"} diff --git a/api/python/src/cellxgene_ontology_guide/ontology_parser.py b/api/python/src/cellxgene_ontology_guide/ontology_parser.py index 72820800..42435d89 100644 --- a/api/python/src/cellxgene_ontology_guide/ontology_parser.py +++ b/api/python/src/cellxgene_ontology_guide/ontology_parser.py @@ -1,10 +1,10 @@ import re from typing import Any, Dict, List, Union -from artifact_download import load_artifact_by_schema -from constants import ALL_ONTOLOGY_FILENAME, ONTOLOGY_INFO_FILENAME from entities import Ontology, OntologyFileType, OntologyVariant +from cellxgene_ontology_guide.supported_versions import CXGSchema + class OntologyParser: """ @@ -19,8 +19,7 @@ def __init__(self, schema_version: str): :param schema_version: str version of the schema to load ontology metadata for """ - self.ontology_dict = load_artifact_by_schema(schema_version, ALL_ONTOLOGY_FILENAME) - self.supported_ontologies = load_artifact_by_schema(schema_version, ONTOLOGY_INFO_FILENAME) + self.cxg_schema = CXGSchema(version=schema_version) def _parse_ontology_name(self, term_id: str) -> str: """ @@ -35,7 +34,7 @@ def _parse_ontology_name(self, term_id: str) -> str: raise ValueError(f"{term_id} does not conform to expected regex pattern {pattern} and cannot be queried.") ontology_name = term_id.split(":")[0] - if ontology_name not in self.supported_ontologies: + if ontology_name not in self.cxg_schema.supported_ontologies: raise ValueError(f"{term_id} is not part of a supported ontology, its metadata cannot be fetched.") return ontology_name @@ -52,7 +51,7 @@ def get_term_ancestors(self, term_id: str, include_self: bool = False) -> List[s :return: flattened List[str] of ancestor terms """ ontology_name = self._parse_ontology_name(term_id) - ancestors: List[str] = self.ontology_dict[ontology_name][term_id]["ancestors"] + ancestors: List[str] = self.cxg_schema.ontology(ontology_name)[term_id]["ancestors"] return ancestors + [term_id] if include_self else ancestors def get_term_list_ancestors(self, term_ids: str, include_self: bool = False) -> Dict[str, List[str]]: @@ -95,7 +94,7 @@ def get_terms_descendants(self, term_ids: List[str], include_self: bool = False) ontology_names.add(ontology_name) for ontology in ontology_names: - for candidate_descendant, candidate_metadata in self.ontology_dict[ontology].items(): + for candidate_descendant, candidate_metadata in self.cxg_schema.ontology(ontology).items(): for ancestor_id in descendants_dict: if ancestor_id in candidate_metadata["ancestors"]: descendants_dict[ancestor_id].append(candidate_descendant) @@ -112,7 +111,7 @@ def is_term_deprecated(self, term_id: str) -> bool: :return: boolean flag indicating whether the term is deprecated """ ontology_name = self._parse_ontology_name(term_id) - is_deprecated: bool = self.ontology_dict[ontology_name][term_id].get("deprecated") + is_deprecated: bool = self.cxg_schema.ontology(ontology_name)[term_id].get("deprecated") return is_deprecated def get_term_replacement(self, term_id: str) -> Union[str, None]: @@ -125,7 +124,7 @@ def get_term_replacement(self, term_id: str) -> Union[str, None]: :return: replacement str term ID if it exists, None otherwise """ ontology_name = self._parse_ontology_name(term_id) - replaced_by: str = self.ontology_dict[ontology_name][term_id].get("replaced_by") + replaced_by: str = self.cxg_schema.ontology(ontology_name)[term_id].get("replaced_by") return replaced_by if replaced_by else None def get_term_metadata(self, term_id: str) -> Dict[str, Any]: @@ -145,7 +144,7 @@ def get_term_metadata(self, term_id: str) -> Dict[str, Any]: """ ontology_name = self._parse_ontology_name(term_id) return { - key: self.ontology_dict[ontology_name][term_id].get(key, None) + key: self.cxg_schema.ontology(ontology_name)[term_id].get(key, None) for key in {"comments", "term_tracker", "consider"} } @@ -159,7 +158,7 @@ def get_term_label(self, term_id: str) -> str: :return: str human-readable label for the term """ ontology_name = self._parse_ontology_name(term_id) - label: str = self.ontology_dict[ontology_name][term_id]["label"] + label: str = self.cxg_schema.ontology(ontology_name)[term_id]["label"] return label def get_ontology_download_url( @@ -178,8 +177,8 @@ def get_ontology_download_url( :param ontology_variant: OntologyVariant enum of the ontology variant to fetch :return: str download URL for the requested ontology file """ - source_url = self.supported_ontologies[ontology.name]["source"] - version = self.supported_ontologies[ontology.name]["version"] + source_url = self.cxg_schema.supported_ontologies[ontology.name]["source"] + version = self.cxg_schema.supported_ontologies[ontology.name]["version"] return ( f"{source_url}/{version}/{ontology.value}-{ontology_variant.value}.{ontology_filetype.value}" if ontology_variant diff --git a/api/python/src/cellxgene_ontology_guide/supported_versions.py b/api/python/src/cellxgene_ontology_guide/supported_versions.py new file mode 100644 index 00000000..a47ba9e0 --- /dev/null +++ b/api/python/src/cellxgene_ontology_guide/supported_versions.py @@ -0,0 +1,77 @@ +import functools +import gzip +import json +import os +from typing import Any, Dict, List, Optional + +from constants import DATA_ROOT, ONTOLOGY_FILENAME_SUFFIX, ONTOLOGY_INFO_FILENAME +from semantic_version import Version + +from cellxgene_ontology_guide.entities import Ontology + + +@functools.cache +def load_ontology_file(file_name: str) -> Any: + """Load the ontology file from the data directory and return it as a dict.""" + with gzip.open(os.path.join(DATA_ROOT, file_name), "rt") as f: + return json.load(f) + + +def clear_ontology_file_cache() -> None: + """Clear the cache for the load_ontology_file function.""" + load_ontology_file.cache_clear() + + +def get_latest_schema_version(versions: List[str]) -> str: + """Given a list of schema versions, return the latest version. + + :param versions: List[str] list of schema versions. Versions can be in the format "v5.0.0" or "5.0.0" + :return: str latest version with a "v" prefix + """ + + def _coerce(v: str) -> Version: + return Version.coerce(v[1:]) if v[0] == "v" else Version.coerce(v) + + return "v" + str(sorted([_coerce(version) for version in versions])[-1]) + + +def load_supported_versions() -> Any: + """Load the ontology_info.json file and return it as a dict.""" + with open(os.path.join(DATA_ROOT, ONTOLOGY_INFO_FILENAME)) as f: + return json.load(f) + + +class CXGSchema: + """A class to represent the ontology information used by a cellxgene schema version.""" + + def __init__(self, version: Optional[str] = None): + """ + + :param version: The schema version to use. If not provided, the latest schema version will be used. + """ + ontology_info = load_supported_versions() + if version is None: + version = get_latest_schema_version(ontology_info.keys()) + elif version not in ontology_info: + raise ValueError(f"Schema version {version} is not supported in this package version.") + + self.version = version + self.supported_ontologies = ontology_info[version] + self.ontology_file_names: Dict[str, str] = {} + + def ontology(self, name: str) -> Any: + """Return the ontology terms for the given ontology name. Load from the file cache if available. + :param name: str name of the ontology to get the terms for + :return: dict representation of the ontology terms + """ + if name not in self.ontology_file_names: + if getattr(Ontology, name, None) is None: + raise ValueError(f"Ontology {name} is not supported in this package version.") + + try: + onto_version = self.supported_ontologies[name]["version"] + except KeyError as e: + raise ValueError(f"Ontology {name} is not supported for schema version {self.version}") from e + file_name = f"{name}-ontology-{onto_version}{ONTOLOGY_FILENAME_SUFFIX}" + self.ontology_file_names[name] = file_name # save to file name to access from cache + return load_ontology_file(self.ontology_file_names[name]) diff --git a/api/python/tests/conftest.py b/api/python/tests/conftest.py new file mode 100644 index 00000000..56432e59 --- /dev/null +++ b/api/python/tests/conftest.py @@ -0,0 +1,15 @@ +from unittest.mock import patch + +import pytest + + +@pytest.fixture +def mock_load_supported_versions(tmpdir): + with patch("cellxgene_ontology_guide.supported_versions.load_supported_versions") as mock: + yield mock + + +@pytest.fixture +def mock_load_ontology_file(): + with patch("cellxgene_ontology_guide.supported_versions.load_ontology_file") as mock: + yield mock diff --git a/api/python/tests/test_artifact_download.py b/api/python/tests/test_artifact_download.py deleted file mode 100644 index 9da1be1d..00000000 --- a/api/python/tests/test_artifact_download.py +++ /dev/null @@ -1,94 +0,0 @@ -import gzip -from unittest.mock import Mock, patch -from urllib.error import HTTPError, URLError - -import pytest -from cellxgene_ontology_guide.artifact_download import load_artifact_by_schema -from cellxgene_ontology_guide.constants import ALL_ONTOLOGY_FILENAME, ONTOLOGY_ASSET_RELEASE_URL, ONTOLOGY_INFO_FILENAME - - -@pytest.fixture -def mock_urlopen(): - """A fixture that mocks urlopen and simulates a successful response.""" - - def get_mock_response(url): - if url.endswith(ALL_ONTOLOGY_FILENAME): - mock_response = Mock() - mock_response.__enter__ = Mock(return_value=mock_response) - mock_response.__exit__ = Mock(return_value=None) - mock_response.read.return_value = gzip.compress(b'{"key": "value"}') - mock_response.status = 200 - return mock_response - elif url.endswith(ONTOLOGY_INFO_FILENAME): - mock_response = Mock() - mock_response.__enter__ = Mock(return_value=mock_response) - mock_response.__exit__ = Mock(return_value=None) - mock_response.read.return_value = ( - b'{"CL": {"version": "2024-01-01", "source": "http://example.com", "filetype": "owl"}}' - ) - mock_response.status = 200 - return mock_response - else: - raise HTTPError(url, 404, "Not Found", hdrs=None, fp=None) - - with patch("cellxgene_ontology_guide.artifact_download.urlopen", side_effect=get_mock_response) as mock: - yield mock - - -@pytest.fixture -def mock_urlopen_url_error(): - """A fixture that mocks urlopen and simulates a URLError.""" - with patch( - "cellxgene_ontology_guide.artifact_download.urlopen", side_effect=URLError(reason="Network Unreachable") - ) as mock: - yield mock - - -def test_load_artifact_by_schema__success_gzip(mock_urlopen): - schema_version = "5.0.0" - expected_tag = "ontology-assets-v0.0.1" - expected_resp_content = {"key": "value"} - - result = load_artifact_by_schema(schema_version, ALL_ONTOLOGY_FILENAME) - expected_download_url = f"{ONTOLOGY_ASSET_RELEASE_URL}/{expected_tag}/{ALL_ONTOLOGY_FILENAME}" - - mock_urlopen.assert_called_once_with(expected_download_url) - assert result == expected_resp_content - - -def test_load_artifact_by_schema__success_json(mock_urlopen): - schema_version = "5.0.0" - expected_tag = "ontology-assets-v0.0.1" - expected_resp_content = {"CL": {"version": "2024-01-01", "source": "http://example.com", "filetype": "owl"}} - - result = load_artifact_by_schema(schema_version, ONTOLOGY_INFO_FILENAME) - expected_download_url = f"{ONTOLOGY_ASSET_RELEASE_URL}/{expected_tag}/{ONTOLOGY_INFO_FILENAME}" - - mock_urlopen.assert_called_once_with(expected_download_url) - assert result == expected_resp_content - - -def test_load_artifact_by_schema__unsupported_schema_version(mock_urlopen): - schema_version = "v0.0.0" - with pytest.raises(ValueError) as exc_info: - load_artifact_by_schema(schema_version, ALL_ONTOLOGY_FILENAME) - assert "Schema version v0.0.0 is not supported in this package version." in str(exc_info.value) - mock_urlopen.assert_not_called() - - -def test_load_artifact_by_schema__http_error(mock_urlopen): - schema_version = "5.0.0" - filename = "missing.json" - with pytest.raises(ValueError) as exc_info: - load_artifact_by_schema(schema_version, filename) - assert "Could not get missing.json for schema version 5.0.0 in GitHub Release Assets" in str(exc_info.value) - mock_urlopen.assert_called_once() - - -def test_load_artifact_by_schema__url_error(mock_urlopen_url_error): - schema_version = "5.0.0" - filename = "all_ontology.json.gz" - with pytest.raises(ValueError) as exc_info: - load_artifact_by_schema(schema_version, filename) - assert "URL error occurred: Network Unreachable" in str(exc_info.value) - mock_urlopen_url_error.assert_called_once() diff --git a/api/python/tests/test_ontology_parser.py b/api/python/tests/test_ontology_parser.py index 8ca34ce0..1b0da5a0 100644 --- a/api/python/tests/test_ontology_parser.py +++ b/api/python/tests/test_ontology_parser.py @@ -1,57 +1,49 @@ from unittest.mock import patch import pytest -from cellxgene_ontology_guide.constants import ALL_ONTOLOGY_FILENAME, ONTOLOGY_INFO_FILENAME from cellxgene_ontology_guide.entities import Ontology, OntologyFileType, OntologyVariant from cellxgene_ontology_guide.ontology_parser import OntologyParser +from cellxgene_ontology_guide.supported_versions import CXGSchema -@pytest.fixture(scope="module") +@pytest.fixture def ontology_dict(): return { - "CL": { - "CL:0000000": {"ancestors": [], "label": "cell A", "deprecated": False}, - "CL:0000001": { - "ancestors": ["CL:0000000"], - "label": "cell B", - "deprecated": False, - "consider": ["CL:0000004"], - }, - "CL:0000002": {"ancestors": ["CL:0000000"], "label": "cell C", "deprecated": False}, - "CL:0000003": { - "ancestors": ["CL:0000000"], - "label": "obsolete cell", - "deprecated": True, - "replaced_by": "CL:0000004", - "comments": ["this term was deprecated in favor of a descendant term of CL:0000001"], - "term_tracker": "http://example.com/issue/1234", - }, - "CL:0000004": {"ancestors": ["CL:0000001", "CL:0000000"], "label": "cell B2", "deprecated": False}, - } + "CL:0000000": {"ancestors": [], "label": "cell A", "deprecated": False}, + "CL:0000001": { + "ancestors": ["CL:0000000"], + "label": "cell B", + "deprecated": False, + "consider": ["CL:0000004"], + }, + "CL:0000002": {"ancestors": ["CL:0000000"], "label": "cell C", "deprecated": False}, + "CL:0000003": { + "ancestors": ["CL:0000000"], + "label": "obsolete cell", + "deprecated": True, + "replaced_by": "CL:0000004", + "comments": ["this term was deprecated in favor of a descendant term of CL:0000001"], + "term_tracker": "http://example.com/issue/1234", + }, + "CL:0000004": {"ancestors": ["CL:0000001", "CL:0000000"], "label": "cell B2", "deprecated": False}, } -@pytest.fixture(scope="module") -def supported_ontologies(): - return {"CL": {"version": "2024-01-01", "source": "http://example.com", "filetype": "owl"}} - - -@pytest.fixture(scope="module") -def mock_load_artifact_by_schema(ontology_dict, supported_ontologies): - def get_mock_artifact_by_schema(schema_version, filename): - if filename == ALL_ONTOLOGY_FILENAME: - return ontology_dict - elif filename == ONTOLOGY_INFO_FILENAME: - return supported_ontologies +@pytest.fixture +def mock_CXGSchema(ontology_dict, mock_load_supported_versions, mock_load_ontology_file): + mock_load_supported_versions.return_value = { + "v5.0.0": {"CL": {"version": "2024-01-01", "source": "http://example.com", "filetype": "owl"}} + } + cxg_schema = CXGSchema() + cxg_schema.ontology_file_names = {"CL": "CL-ontology-2024-01-01.json.gz"} + mock_load_ontology_file.return_value = ontology_dict - with patch( - "cellxgene_ontology_guide.ontology_parser.load_artifact_by_schema", side_effect=get_mock_artifact_by_schema - ) as mock: + with patch("cellxgene_ontology_guide.ontology_parser.CXGSchema", return_value=cxg_schema) as mock: yield mock -@pytest.fixture(scope="module") -def ontology_parser(mock_load_artifact_by_schema): +@pytest.fixture +def ontology_parser(mock_CXGSchema): return OntologyParser(schema_version="5.0.0") diff --git a/api/python/tests/test_supported_versions.py b/api/python/tests/test_supported_versions.py new file mode 100644 index 00000000..c88532c0 --- /dev/null +++ b/api/python/tests/test_supported_versions.py @@ -0,0 +1,102 @@ +import gzip +import json +from unittest.mock import patch + +import pytest +from cellxgene_ontology_guide.supported_versions import ( + CXGSchema, + get_latest_schema_version, + load_ontology_file, + load_supported_versions, +) + +MODULE_PATH = "cellxgene_ontology_guide.supported_versions" + + +@pytest.fixture +def initialized_CXGSchemaInfo(mock_load_supported_versions): + mock_load_supported_versions.return_value = {"v5.0.0": {"CL": {"version": "v2024-01-04"}}} + return CXGSchema() + + +@pytest.mark.parametrize("versions, expected", [(["v5.0.0", "v0.0.1"], "v5.0.0"), (["5.0.0", "0.0.1"], "v5.0.0")]) +def test__get_latest_schema_version__OK(versions, expected): + assert get_latest_schema_version(versions) == "v5.0.0" + + +@pytest.fixture +def mock_ontology_file(tmpdir): + with patch(f"{MODULE_PATH}.DATA_ROOT", tmpdir): + # Create a temporary ontology file + test_file_name = "test_ontology.json.gz" + onto_file = tmpdir.join(test_file_name) + file_contents = {"test": "file contents"} + with gzip.open(str(onto_file), "wt") as onto_file: + json.dump(file_contents, onto_file) + yield test_file_name, file_contents + + +def test__load_ontology_file__OK(mock_ontology_file): + test_file_name, file_contents = mock_ontology_file + assert load_ontology_file(test_file_name) == file_contents + assert load_ontology_file.cache_info().hits == 0 + assert load_ontology_file.cache_info().misses == 1 + load_ontology_file(test_file_name) + assert load_ontology_file.cache_info().hits == 1 + assert load_ontology_file.cache_info().misses == 1 + + +def test__clear_ontology_file_cache__OK(mock_ontology_file): + test_file_name, _ = mock_ontology_file + load_ontology_file(test_file_name) + assert load_ontology_file.cache_info().misses == 1 + load_ontology_file.cache_clear() + assert load_ontology_file.cache_info().misses == 0 + load_ontology_file(test_file_name) + assert load_ontology_file.cache_info().misses == 1 + + +def test__load_supported_versions__OK(tmpdir): + with patch(f"{MODULE_PATH}.DATA_ROOT", tmpdir): + # Create a temporary ontology_info.json file + test_file_name = tmpdir.join("ontology_info.json") + file_contents = {"test": "file contents"} + with test_file_name.open("w") as f: + json.dump(file_contents, f) + assert load_supported_versions() == file_contents + + +class TestCXGSchema: + def test__init__defaults(self, mock_load_supported_versions): + support_versions = {"v5.0.0": "current version", "v0.0.1": "old version"} + mock_load_supported_versions.return_value = support_versions + cxgs = CXGSchema() + assert cxgs.version == "v5.0.0" + assert cxgs.supported_ontologies == support_versions["v5.0.0"] + + def test__init__specific_version(self, mock_load_supported_versions): + support_versions = {"v5.0.0": "current version", "v0.0.1": "old version"} + mock_load_supported_versions.return_value = support_versions + cxgs = CXGSchema(version="v0.0.1") + assert cxgs.version == "v0.0.1" + assert cxgs.supported_ontologies == support_versions["v0.0.1"] + + def test__init__unsupported_version(self, mock_load_supported_versions): + mock_load_supported_versions.return_value = {} + with pytest.raises(ValueError): + CXGSchema(version="v5.0.1") + + def test__ontology__unsupported_ontology_by_package(self, initialized_CXGSchemaInfo, mock_load_ontology_file): + with pytest.raises(ValueError): + initialized_CXGSchemaInfo.ontology("GO") + mock_load_ontology_file.assert_not_called() + + def test__ontology__unsupported_ontology_by_schema(self, initialized_CXGSchemaInfo, mock_load_ontology_file): + with pytest.raises(ValueError): + initialized_CXGSchemaInfo.ontology("EFO") + mock_load_ontology_file.assert_not_called() + + def test__ontology__OK(self, initialized_CXGSchemaInfo, mock_load_ontology_file): + ontology_file_contents = {"CL:1234": "efgh"} + mock_load_ontology_file.return_value = ontology_file_contents + assert initialized_CXGSchemaInfo.ontology("CL") == {"CL:1234": "efgh"} diff --git a/artifact-schemas/all_ontology_schema.json b/artifact-schemas/all_ontology_schema.json index 3dc5dd10..65e74e4b 100644 --- a/artifact-schemas/all_ontology_schema.json +++ b/artifact-schemas/all_ontology_schema.json @@ -3,74 +3,61 @@ "title": "Valid Ontology Term JSON Schema", "description": "Schema for file containing metadata for Ontology Terms accepted in dataset submissions to CZ CellXGene Data Portal.", "type": "object", - "properties": { - "EFO": { "$ref": "#/definitions/ontologyCategory" }, - "UBERON": { "$ref": "#/definitions/ontologyCategory" }, - "CL": { "$ref": "#/definitions/ontologyCategory" }, - "HANCESTRO": { "$ref": "#/definitions/ontologyCategory" }, - "HsapDv": { "$ref": "#/definitions/ontologyCategory" }, - "MmusDv": { "$ref": "#/definitions/ontologyCategory" }, - "PATO": { "$ref": "#/definitions/ontologyCategory" }, - "NCBITaxon": { "$ref": "#/definitions/ontologyCategory" }, - "MONDO": { "$ref": "#/definitions/ontologyCategory" } - }, - "additionalProperties": false, - "definitions": { - "ontologyCategory": { + "patternProperties": { + "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$": { "type": "object", - "patternProperties": { - "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$": { - "type": "object", - "properties": { - "label": { - "type": "string", - "description": "human-readable name for the ontology entry." - }, - "deprecated": { - "type": "boolean", - "description": "Indicates whether the ontology entry is deprecated." - }, - "ancestors": { - "type": "array", - "items": { - "type": "string", - "pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$", - "description": "List of ancestor IDs for the ontology entry." - }, - "description": "An array of ancestor ontology terms that this term is a subclass of." - }, - "comments": { - "type": "array", - "items": { - "type": "string" - }, - "minItems": 0, - "description": "Optional comments regarding the ontology entry from ontology curators." - }, - "term_tracker": { - "type": "string", - "format": "uri", - "description": "Optional URL to track discussion around the term's history and changes." - }, - "consider": { - "type": "array", - "items": { - "type": "string" - }, - "minItems": 0, - "description": "Suggests alternative IDs to consider in place of this ontology entry." - }, - "replaced_by": { - "type": "string", - "pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$", - "description": "If deprecated, the ID of the ontology entry that should canonically replace this one." - } + "properties": { + "label": { + "type": "string", + "description": "human-readable name for the ontology entry." + }, + "deprecated": { + "type": "boolean", + "description": "Indicates whether the ontology entry is deprecated." + }, + "ancestors": { + "type": "array", + "items": { + "type": "string", + "pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$", + "description": "List of ancestor IDs for the ontology entry." + }, + "description": "An array of ancestor ontology terms that this term is a subclass of." + }, + "comments": { + "type": "array", + "items": { + "type": "string" }, - "required": ["label", "deprecated", "ancestors"], - "additionalProperties": false, + "minItems": 0, + "description": "Optional comments regarding the ontology entry from ontology curators." + }, + "term_tracker": { + "type": "string", + "format": "uri", + "description": "Optional URL to track discussion around the term's history and changes." + }, + "consider": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 0, + "description": "Suggests alternative IDs to consider in place of this ontology entry." + }, + "replaced_by": { + "type": "string", + "pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$", + "description": "If deprecated, the ID of the ontology entry that should canonically replace this one." } }, + "required": [ + "label", + "deprecated", + "ancestors" + ], "additionalProperties": false } - } -} \ No newline at end of file + }, + "additionalProperties": false +} diff --git a/artifact-schemas/ontology_info_schema.json b/artifact-schemas/ontology_info_schema.json index 437a6aa3..529b34bd 100644 --- a/artifact-schemas/ontology_info_schema.json +++ b/artifact-schemas/ontology_info_schema.json @@ -1,18 +1,42 @@ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "Ontology Version and Source Schema", - "description": "A schema for the set of valid ontology reference files mapping to a CZ CellXGene Dataset Schema Version", + "description": "A schema for the set of valid ontology reference files mapping to a CZ CellXGene Dataset Schema Versions", "type": "object", - "properties": { - "CL": { "$ref": "#/definitions/ontologyEntry" }, - "EFO": { "$ref": "#/definitions/ontologyEntry" }, - "HANCESTRO": { "$ref": "#/definitions/ontologyEntry" }, - "HsapDv": { "$ref": "#/definitions/ontologyEntry" }, - "MONDO": { "$ref": "#/definitions/ontologyEntry" }, - "MmusDv": { "$ref": "#/definitions/ontologyEntry" }, - "NCBITaxon": { "$ref": "#/definitions/ontologyEntry" }, - "UBERON": { "$ref": "#/definitions/ontologyEntry" }, - "PATO": { "$ref": "#/definitions/ontologyEntry" } + "patternProperties": { + "^v[0-9]+\\.[0-9]+\\.[0-9]+$": { + "description": "The version of CellxGene schema that maps to this set of ontology versions", + "type": "object", + "properties": { + "CL": { + "$ref": "#/definitions/ontologyEntry" + }, + "EFO": { + "$ref": "#/definitions/ontologyEntry" + }, + "HANCESTRO": { + "$ref": "#/definitions/ontologyEntry" + }, + "HsapDv": { + "$ref": "#/definitions/ontologyEntry" + }, + "MONDO": { + "$ref": "#/definitions/ontologyEntry" + }, + "MmusDv": { + "$ref": "#/definitions/ontologyEntry" + }, + "NCBITaxon": { + "$ref": "#/definitions/ontologyEntry" + }, + "UBERON": { + "$ref": "#/definitions/ontologyEntry" + }, + "PATO": { + "$ref": "#/definitions/ontologyEntry" + } + } + } }, "additionalProperties": false, "definitions": { @@ -33,8 +57,13 @@ "description": "filetype used to build generated artifacts for this ontology data release" } }, - "required": ["version", "source", "filetype"], + "required": [ + "version", + "source", + "filetype" + ], "additionalProperties": false } } -} \ No newline at end of file +} + diff --git a/ontology-assets/CL-ontology-v2024-01-04.json.gz b/ontology-assets/CL-ontology-v2024-01-04.json.gz new file mode 100644 index 00000000..321bfe79 Binary files /dev/null and b/ontology-assets/CL-ontology-v2024-01-04.json.gz differ diff --git a/ontology-assets/EFO-ontology-v3.62.0.json.gz b/ontology-assets/EFO-ontology-v3.62.0.json.gz new file mode 100644 index 00000000..4e634008 Binary files /dev/null and b/ontology-assets/EFO-ontology-v3.62.0.json.gz differ diff --git a/ontology-assets/HANCESTRO-ontology-3.0.json.gz b/ontology-assets/HANCESTRO-ontology-3.0.json.gz new file mode 100644 index 00000000..ad2ffa09 Binary files /dev/null and b/ontology-assets/HANCESTRO-ontology-3.0.json.gz differ diff --git a/ontology-assets/HsapDv-ontology-11.json.gz b/ontology-assets/HsapDv-ontology-11.json.gz new file mode 100644 index 00000000..3c222f51 Binary files /dev/null and b/ontology-assets/HsapDv-ontology-11.json.gz differ diff --git a/ontology-assets/MONDO-ontology-v2024-01-03.json.gz b/ontology-assets/MONDO-ontology-v2024-01-03.json.gz new file mode 100644 index 00000000..1ccd44f3 Binary files /dev/null and b/ontology-assets/MONDO-ontology-v2024-01-03.json.gz differ diff --git a/ontology-assets/MmusDv-ontology-9.json.gz b/ontology-assets/MmusDv-ontology-9.json.gz new file mode 100644 index 00000000..c95b2ba5 Binary files /dev/null and b/ontology-assets/MmusDv-ontology-9.json.gz differ diff --git a/ontology-assets/NCBITaxon-ontology-v2023-06-20.json.gz b/ontology-assets/NCBITaxon-ontology-v2023-06-20.json.gz new file mode 100644 index 00000000..deb73068 Binary files /dev/null and b/ontology-assets/NCBITaxon-ontology-v2023-06-20.json.gz differ diff --git a/ontology-assets/PATO-ontology-v2023-05-18.json.gz b/ontology-assets/PATO-ontology-v2023-05-18.json.gz new file mode 100644 index 00000000..397bb188 Binary files /dev/null and b/ontology-assets/PATO-ontology-v2023-05-18.json.gz differ diff --git a/ontology-assets/UBERON-ontology-v2024-01-18.json.gz b/ontology-assets/UBERON-ontology-v2024-01-18.json.gz new file mode 100644 index 00000000..0f72d056 Binary files /dev/null and b/ontology-assets/UBERON-ontology-v2024-01-18.json.gz differ diff --git a/ontology-assets/ontology_info.json b/ontology-assets/ontology_info.json index 4762cc07..9a186060 100644 --- a/ontology-assets/ontology_info.json +++ b/ontology-assets/ontology_info.json @@ -1,47 +1,49 @@ { - "CL": { - "version": "v2024-01-04", - "source": "https://github.com/obophenotype/cell-ontology/releases/download", - "filetype": "owl" - }, - "EFO": { - "version": "v3.62.0", - "source": "https://github.com/EBISPOT/efo/releases/download", - "filetype": "owl" - }, - "HANCESTRO": { - "version": "3.0", - "source": "https://github.com/EBISPOT/hancestro/raw", - "filetype": "owl" - }, - "HsapDv": { - "version": "11", - "source": "http://aber-owl.net/media/ontologies/HSAPDV", - "filetype": "owl" - }, - "MONDO": { - "version": "v2024-01-03", - "source": "https://github.com/monarch-initiative/mondo/releases/download", - "filetype": "owl" - }, - "MmusDv": { - "version": "9", - "source": "http://aber-owl.net/media/ontologies/MMUSDV", - "filetype": "owl" - }, - "NCBITaxon": { - "version": "v2023-06-20", - "source": "https://github.com/obophenotype/ncbitaxon/releases/download", - "filetype": "owl.gz" - }, - "UBERON": { - "version": "v2024-01-18", - "source": "https://github.com/obophenotype/uberon/releases/download", - "filetype": "owl" - }, - "PATO": { - "version": "v2023-05-18", - "source": "https://github.com/pato-ontology/pato/raw", - "filetype": "owl" + "v5.0.0": { + "CL": { + "version": "v2024-01-04", + "source": "https://github.com/obophenotype/cell-ontology/releases/download", + "filetype": "owl" + }, + "EFO": { + "version": "v3.62.0", + "source": "https://github.com/EBISPOT/efo/releases/download", + "filetype": "owl" + }, + "HANCESTRO": { + "version": "3.0", + "source": "https://github.com/EBISPOT/hancestro/raw", + "filetype": "owl" + }, + "HsapDv": { + "version": "11", + "source": "http://aber-owl.net/media/ontologies/HSAPDV", + "filetype": "owl" + }, + "MONDO": { + "version": "v2024-01-03", + "source": "https://github.com/monarch-initiative/mondo/releases/download", + "filetype": "owl" + }, + "MmusDv": { + "version": "9", + "source": "http://aber-owl.net/media/ontologies/MMUSDV", + "filetype": "owl" + }, + "NCBITaxon": { + "version": "v2023-06-20", + "source": "https://github.com/obophenotype/ncbitaxon/releases/download", + "filetype": "owl.gz" + }, + "UBERON": { + "version": "v2024-01-18", + "source": "https://github.com/obophenotype/uberon/releases/download", + "filetype": "owl" + }, + "PATO": { + "version": "v2023-05-18", + "source": "https://github.com/pato-ontology/pato/raw", + "filetype": "owl" + } } -} \ No newline at end of file +} diff --git a/tools/ontology-builder/requirements.txt b/tools/ontology-builder/requirements.txt index f11bffc3..90416e6d 100644 --- a/tools/ontology-builder/requirements.txt +++ b/tools/ontology-builder/requirements.txt @@ -1,3 +1,4 @@ owlready2==0.45 PyYaml==6.0.1 jsonschema==4.21.1 +semantic-version==2.8.5 diff --git a/tools/ontology-builder/src/all_ontology_generator.py b/tools/ontology-builder/src/all_ontology_generator.py index 5369e3d2..2c40e5ec 100755 --- a/tools/ontology-builder/src/all_ontology_generator.py +++ b/tools/ontology-builder/src/all_ontology_generator.py @@ -4,24 +4,47 @@ import re import urllib.request from threading import Thread -from typing import Any, Dict, Set +from typing import Any, Dict, List, Optional, Set from urllib.error import HTTPError, URLError import env import owlready2 +import semantic_version -def _download_ontologies(onto_info_file: str = env.ONTO_INFO_FILE, output_dir: str = env.RAW_ONTOLOGY_DIR) -> None: +def _get_latest_version(versions: List[str]) -> str: + return "v" + str(sorted([semantic_version.Version.coerce(version[1:]) for version in versions])[-1]) + + +def _get_ontology_info_file( + ontology_info_file: str = env.ONTOLOGY_INFO_FILE, cellxgene_schema_version: Optional[str] = None +) -> Any: + """ + Read ontology information from file + + :param str ontology_info_file: path to file with ontology information + + :rtype Any + :return ontology information + """ + with open(ontology_info_file, "r") as f: + ontology_info = json.load(f) + if cellxgene_schema_version: + ontology_info_version = ontology_info[cellxgene_schema_version] + else: + ontology_info_version = ontology_info[_get_latest_version(ontology_info.keys())] + return ontology_info_version + + +def _download_ontologies(ontology_info: Dict[str, Any], output_dir: str = env.RAW_ONTOLOGY_DIR) -> None: """ Downloads the ontology files specified in 'ontology_info.json' into 'output_dir' - :param str onto_info_file: path to file with ontology information + :param str ontology_info: a dictionary with ontology names as keys and their respective URLs and versions :param str output_dir: path to writable directory where ontology files will be downloaded to :rtype None """ - with open(onto_info_file, "r") as f: - ontology_info = json.load(f) def download(_ontology: str, _url: str) -> None: print(f"Start Downloading {_ontology}") @@ -177,12 +200,14 @@ def _extract_ontology_term_metadata(onto: owlready2.entity.ThingClass) -> Dict[s def _parse_ontologies( + ontology_info: Any, working_dir: str = env.RAW_ONTOLOGY_DIR, - output_json_file: str = env.PARSED_ONTOLOGIES_FILE, + output_path: str = env.ONTOLOGY_ASSETS_DIR, ) -> None: """ Parse all ontology files in working_dir. Extracts information from all classes in the ontology file. - The extracted information is written into a gzipped a json file with the following structure: + The extracted information is written into a gzipped a json file with the following [schema]( + artifact-schemas/all_ontology_schema.json): { "ontology_name": { @@ -203,23 +228,29 @@ def _parse_ontologies( ... } } - + :param ANY ontology_info: the ontology references used to download the ontology files. It follows this [schema]( + ./artifact-schemas/ontology_info_schema.json) :param str working_dir: path to folder with ontology files - :param str output_json_file: path to output json file + :param str output_path: path to output json files :rtype None """ - onto_dict: Dict[str, Any] = dict() for onto_file in os.listdir(working_dir): + if onto_file.startswith("."): + continue onto = _load_ontology_object(os.path.join(working_dir, onto_file)) - print(f"Processing {onto.name}") - onto_dict[onto.name] = _extract_ontology_term_metadata(onto) + version = ontology_info[onto.name]["version"] + output_file = f"{onto.name}-ontology-{version}.json.gz" + print(f"Processing {output_file}") + + onto_dict = _extract_ontology_term_metadata(onto) - with gzip.open(output_json_file, "wt") as output_json: - json.dump(onto_dict, output_json, indent=2) + with gzip.open(os.path.join(output_path, output_file), "wt") as output_json: + json.dump(onto_dict, output_json, indent=2) # Download and parse ontology files upon execution if __name__ == "__main__": - _download_ontologies() - _parse_ontologies() + ontology_info = _get_ontology_info_file() + _download_ontologies(ontology_info) + _parse_ontologies(ontology_info) diff --git a/tools/ontology-builder/src/env.py b/tools/ontology-builder/src/env.py index 4e19722e..1acf2f1f 100644 --- a/tools/ontology-builder/src/env.py +++ b/tools/ontology-builder/src/env.py @@ -5,5 +5,4 @@ PROJECT_ROOT_DIR = os.path.realpath(__file__).rsplit("/", maxsplit=4)[0] SCHEMA_DIR = os.path.join(PROJECT_ROOT_DIR, "artifact-schemas") ONTOLOGY_ASSETS_DIR = os.path.join(PROJECT_ROOT_DIR, "ontology-assets") -ONTO_INFO_FILE = os.path.join(ONTOLOGY_ASSETS_DIR, "ontology_info.json") -PARSED_ONTOLOGIES_FILE = os.path.join(ONTOLOGY_ASSETS_DIR, "all_ontology.json.gz") +ONTOLOGY_INFO_FILE = os.path.join(ONTOLOGY_ASSETS_DIR, "ontology_info.json") diff --git a/tools/ontology-builder/tests/test_all_ontology_generator.py b/tools/ontology-builder/tests/test_all_ontology_generator.py index 261da674..be026ed1 100644 --- a/tools/ontology-builder/tests/test_all_ontology_generator.py +++ b/tools/ontology-builder/tests/test_all_ontology_generator.py @@ -1,31 +1,66 @@ +import json import os import urllib.request from unittest.mock import MagicMock, patch import pytest -from all_ontology_generator import _download_ontologies, _parse_ontologies +from all_ontology_generator import _download_ontologies, _get_latest_version, _get_ontology_info_file, _parse_ontologies @pytest.fixture -def mock_ontology_info(tmpdir): +def mock_ontology_info(): + return { + "ontology_name": { + "source": "http://example.com", + "version": "v1", + "filetype": "owl", + } + } + + +@pytest.fixture +def mock_ontology_info_file(tmpdir, mock_ontology_info): # Create a temporary ontology info file - onto_info_file = tmpdir.join("ontology_info.json") - onto_info_file.write('{"ontology_name": {"source": "http://example.com", "version": "v1", "filetype": "owl"}}') - return str(onto_info_file) + ontology_info_file = tmpdir.join("ontology_info.json") + ontology_info = {"v1.0.0": True, "v2.0.0": mock_ontology_info} + ontology_info_file.write(json.dumps(ontology_info)) + return str(ontology_info_file) @pytest.fixture def mock_raw_ontology_dir(tmpdir): - # Create a temporary directory for raw ontology files - raw_ontology_dir = tmpdir.mkdir("raw_ontology") - return str(raw_ontology_dir) + # Create a temporary ontology file + sub_dir_name = "raw_ontology" + sub_dir = tmpdir.mkdir(sub_dir_name) + onto_owl_file = tmpdir.join(sub_dir_name, "ontology_name.owl") + onto_owl_file.write("") + return str(sub_dir) -@pytest.fixture -def mock_parsed_ontology_file(tmpdir): - # Create a temporary gzipped json file for parsed ontology data - parsed_ontology_file = tmpdir.join("parsed_ontologies.json.gz") - return str(parsed_ontology_file) +def test_get_latest_version(): + # Call the function + latest_version = _get_latest_version(versions=["v1", "v2.0", "v3.0.0", "v3.0.1", "v3.1.0"]) + + # Assertion + assert latest_version == "v3.1.0" + + +def test_get_ontology_info_file_default(mock_ontology_info_file): + # Call the function + ontology_info = _get_ontology_info_file(ontology_info_file=mock_ontology_info_file) + + # Assertion + assert ontology_info == {"ontology_name": {"source": "http://example.com", "version": "v1", "filetype": "owl"}} + + +def test_get_ontology_info_file_version(mock_ontology_info_file): + # Call the function + ontology_info = _get_ontology_info_file( + ontology_info_file=mock_ontology_info_file, cellxgene_schema_version="v1.0.0" + ) + + # Assertion + assert ontology_info is True def test_download_ontologies(mock_ontology_info, mock_raw_ontology_dir): @@ -37,22 +72,32 @@ def test_download_ontologies(mock_ontology_info, mock_raw_ontology_dir): mock_urlopen.return_value = mock_response # Call the function - _download_ontologies(onto_info_file=mock_ontology_info, output_dir=mock_raw_ontology_dir) + _download_ontologies(ontology_info=mock_ontology_info, output_dir=mock_raw_ontology_dir) mock_urlretrieve.assert_called_once() -def test_parse_ontologies(mock_raw_ontology_dir, mock_parsed_ontology_file): +def test_parse_ontologies(mock_ontology_info, mock_raw_ontology_dir, tmpdir): # Mocking _load_ontology_object and _extract_ontology_term_metadata with patch("all_ontology_generator._load_ontology_object") as mock_load_ontology, patch( "all_ontology_generator._extract_ontology_term_metadata" ) as mock_extract_metadata: # Mock return values - mock_load_ontology.return_value = MagicMock(name="ontology_object") + MockOntologyObject = MagicMock() + MockOntologyObject.name = "ontology_name" # Must match the name of the ontology file + mock_load_ontology.return_value = MockOntologyObject mock_extract_metadata.return_value = {"term_id": {"label": "Term Label", "deprecated": False, "ancestors": []}} + # Mock output path + output_path = tmpdir.mkdir("output") # Call the function - _parse_ontologies(working_dir=mock_raw_ontology_dir, output_json_file=mock_parsed_ontology_file) + _parse_ontologies(ontology_info=mock_ontology_info, working_dir=mock_raw_ontology_dir, output_path=output_path) + + # Assert the output file is created + assert os.path.exists(os.path.join(output_path, "ontology_name-ontology-v1.json.gz")) + + # Assert output_path has the same number of files as mock_raw_ontology_dir. + assert len(os.listdir(output_path)) == len(os.listdir(mock_raw_ontology_dir)) # Assert _load_ontology_object is called for each ontology file assert mock_load_ontology.call_count == len(os.listdir(mock_raw_ontology_dir)) @@ -70,7 +115,7 @@ def test_download_ontologies_http_error(mock_ontology_info, mock_raw_ontology_di # Assertion with pytest.raises(Exception) as exc_info: - _download_ontologies(onto_info_file=mock_ontology_info, output_dir=mock_raw_ontology_dir) + _download_ontologies(ontology_info=mock_ontology_info, output_dir=mock_raw_ontology_dir) assert "returns status code 404" in str(exc_info.value) @@ -81,5 +126,5 @@ def test_download_ontologies_url_error(mock_ontology_info, mock_raw_ontology_dir # Assertion with pytest.raises(Exception) as exc_info: - _download_ontologies(onto_info_file=mock_ontology_info, output_dir=mock_raw_ontology_dir) + _download_ontologies(ontology_info=mock_ontology_info, output_dir=mock_raw_ontology_dir) assert "fails due to Connection refused" in str(exc_info.value)