Skip to content

Commit

Permalink
feat: load GH Release Assets for schema version in memory (#72)
Browse files Browse the repository at this point in the history
  • Loading branch information
nayib-jose-gloria authored Feb 28, 2024
1 parent 10797fc commit 58bad0a
Show file tree
Hide file tree
Showing 17 changed files with 219 additions and 138 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/generate_all_ontology.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Updates to Ontology Files
on:
push:
paths:
- "**/api/python/src/cellxgene_ontology_guide/artifacts/ontology_info.yml"
- "**/api/python/src/cellxgene_ontology_guide/artifacts/ontology_info.json"
- "**/artifact-schemas/all_ontology_schema.json"
branches-ignore:
- main
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
strategy:
matrix:
include:
- file_name: "ontology_info.yml"
# Each asset is uploaded with the media type that matches its real format:
# JSON for the ontology info file, gzip (RFC 6713) for the compressed archive.
- file_name: "ontology_info.json"
  content_type: "application/json"
- file_name: "all_ontology.json.gz"
  content_type: "application/gzip"
Expand Down
4 changes: 1 addition & 3 deletions api/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@ authors = [
license = { text = "MIT" }
readme = "README.md"
requires-python = "~= 3.11"
dependencies = [
"PyYAML"
]
dependencies = []

[project.optional-dependencies]
test = ["pytest"]
Expand Down
41 changes: 29 additions & 12 deletions api/python/src/cellxgene_ontology_guide/artifact_download.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,36 @@
import os
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

from constants import ARTIFACT_DIR, CURRENT_SCHEMA_VERSION
from constants import ONTOLOGY_ASSET_RELEASE_URL, SCHEMA_VERSION_TO_ONTOLOGY_ASSET_TAG


def load_artifact_by_schema(schema_version: str, filename: str) -> str:
def load_artifact_by_schema(schema_version: str, filename: str) -> bytes:
"""
Load ontology files from GitHub Release Assets, based on the provided schema version.
Returns ValueError if the schema version is not supported in this package version.
Raises ValueError if the schema version is not supported in this package version or filename is not found for
the given schema_version.
:param schema_version: str version of the schema to load ontology files for
:param filename: str name of the file to load
:return: str path to the ontology file
:param schema_version: str version of the schema to load ontology assets for
:param filename: str name of the asset to load
:return: bytes content of the asset
"""
if schema_version == CURRENT_SCHEMA_VERSION:
return os.path.join(ARTIFACT_DIR, filename)
else:
# TODO: Add support for loading ontology files from different schema versions
raise ValueError(f"Schema version {schema_version} is not supported in this package version.")
try:
ontology_asset_tag = SCHEMA_VERSION_TO_ONTOLOGY_ASSET_TAG[schema_version]
except KeyError as e:
raise ValueError(f"Schema version {schema_version} is not supported in this package version.") from e

download_url = f"{ONTOLOGY_ASSET_RELEASE_URL}/{ontology_asset_tag}/{filename}"

try:
with urlopen(download_url) as response:
if response.status == 200:
content: bytes = response.read()
return content
else:
raise ValueError(f"Server responded with status code: {response.status}")
except HTTPError as e:
raise ValueError(
f"Could not get {filename} for schema version {schema_version} in GitHub Release Assets: {e}"
) from e
except URLError as e:
raise ValueError(f"URL error occurred: {e.reason}") from e
Binary file not shown.

This file was deleted.

6 changes: 3 additions & 3 deletions api/python/src/cellxgene_ontology_guide/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os

PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
ARTIFACT_DIR = os.path.join(PACKAGE_ROOT, "artifacts")
ALL_ONTOLOGY_FILENAME = "all_ontology.json.gz"
ONTOLOGY_INFO_FILENAME = "ontology_info.yml"
CURRENT_SCHEMA_VERSION = "5.0.0"
ONTOLOGY_INFO_FILENAME = "ontology_info.json"
ONTOLOGY_ASSET_RELEASE_URL = "https://github.com/chanzuckerberg/cellxgene-ontology-guide/releases/download"
SCHEMA_VERSION_TO_ONTOLOGY_ASSET_TAG = {"5.0.0": "ontology-assets-v0.0.1"}
38 changes: 26 additions & 12 deletions api/python/src/cellxgene_ontology_guide/ontology_parser.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,49 @@
import gzip
import json
import re
from io import BytesIO
from typing import Any, Dict, List, Union

import yaml
from artifact_download import load_artifact_by_schema
from constants import ALL_ONTOLOGY_FILENAME, CURRENT_SCHEMA_VERSION, ONTOLOGY_INFO_FILENAME
from constants import ALL_ONTOLOGY_FILENAME, ONTOLOGY_INFO_FILENAME


class OntologyParser:
"""
An object to parse ontology term metadata from ontologies corresponding to a given CellxGene Schema Version.
"""

def __init__(self, schema_version: str = CURRENT_SCHEMA_VERSION):
# Private attribute to keep track of instances
_instances: Dict[str, Any] = {}

def __new__(cls, schema_version: str) -> Any:
"""
Ensure that only one instance per schema_version exists.
"""
if schema_version not in cls._instances:
instance = super(OntologyParser, cls).__new__(cls)
cls._instances[schema_version] = instance
return instance
return cls._instances[schema_version]

def __init__(self, schema_version: str):
"""
Initialize an OntologyParser object with the ontology metadata corresponding to the given CellxGene schema
version. By default, loads the ontology metadata for the latest compatible schema version from disk. If a
different schema version is set, the corresponding ontology metadata will be loaded instead. If not available
from disk, it will make a network call to GitHub Release Assets.
version. If not cached, it will make a network call to GitHub Release Assets to load in memory and
parse the corresponding ontology metadata.
:param schema_version: str version of the schema to load ontology metadata for
"""
all_ontology_filepath = load_artifact_by_schema(schema_version, ALL_ONTOLOGY_FILENAME)
ontology_info_filepath = load_artifact_by_schema(schema_version, ONTOLOGY_INFO_FILENAME)
if not hasattr(self, "initialized"): # Prevents reinitialization
all_ontology = load_artifact_by_schema(schema_version, ALL_ONTOLOGY_FILENAME)
ontology_info = load_artifact_by_schema(schema_version, ONTOLOGY_INFO_FILENAME)

with gzip.open(BytesIO(all_ontology), "rt") as f:
self.ontology_dict = json.load(f)

with gzip.open(all_ontology_filepath, "rt") as f:
self.ontology_dict = json.load(f)
self.supported_ontologies = json.loads(ontology_info)

with open(ontology_info_filepath, "rt") as f:
self.supported_ontologies = yaml.safe_load(f)
self.initialized = True

def _parse_ontology_name(self, term_id: str) -> str:
"""
Expand Down
77 changes: 65 additions & 12 deletions api/python/tests/test_artifact_download.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,72 @@
import os
from unittest.mock import Mock, patch
from urllib.error import HTTPError, URLError

import pytest
from cellxgene_ontology_guide.artifact_download import load_artifact_by_schema
from cellxgene_ontology_guide.constants import ARTIFACT_DIR, CURRENT_SCHEMA_VERSION
from cellxgene_ontology_guide.constants import ALL_ONTOLOGY_FILENAME, ONTOLOGY_ASSET_RELEASE_URL


def test_load_artifact_by_schema():
assert load_artifact_by_schema(CURRENT_SCHEMA_VERSION, "ontology_info.yml") == os.path.join(
ARTIFACT_DIR, "ontology_info.yml"
)
assert load_artifact_by_schema(CURRENT_SCHEMA_VERSION, "all_ontology.json.gz") == os.path.join(
ARTIFACT_DIR, "all_ontology.json.gz"
)
@pytest.fixture
def mock_urlopen():
"""A fixture that mocks urlopen and simulates a successful response."""

def get_mock_response(url):
if url.endswith(ALL_ONTOLOGY_FILENAME):
mock_response = Mock()
mock_response.__enter__ = Mock(return_value=mock_response)
mock_response.__exit__ = Mock(return_value=None)
mock_response.read.return_value = b'{"key": "value"}'
mock_response.status = 200
return mock_response
else:
raise HTTPError(url, 404, "Not Found", hdrs=None, fp=None)

def test_load_artifact_by_schema_raises_value_error():
with pytest.raises(ValueError):
load_artifact_by_schema("0.0.0", "ontology_info.yml")
with patch("cellxgene_ontology_guide.artifact_download.urlopen", side_effect=get_mock_response) as mock:
yield mock


@pytest.fixture
def mock_urlopen_url_error():
    """Patch ``urlopen`` in ``artifact_download`` so that every call raises a ``URLError``.

    Yields the mock that stands in for ``urlopen`` so tests can inspect how it was called.
    """
    network_failure = URLError(reason="Network Unreachable")
    patcher = patch("cellxgene_ontology_guide.artifact_download.urlopen", side_effect=network_failure)
    with patcher as mocked_urlopen:
        yield mocked_urlopen


def test_load_artifact_by_schema__success(mock_urlopen):
    """A supported schema version downloads the asset from its release tag and returns the raw bytes."""
    release_tag = "ontology-assets-v0.0.1"
    asset_url = f"{ONTOLOGY_ASSET_RELEASE_URL}/{release_tag}/{ALL_ONTOLOGY_FILENAME}"

    payload = load_artifact_by_schema("5.0.0", ALL_ONTOLOGY_FILENAME)

    # Exactly one request, aimed at the URL built from the mapped release tag.
    mock_urlopen.assert_called_once_with(asset_url)
    assert payload == b'{"key": "value"}'


def test_load_artifact_by_schema__unsupported_schema_version(mock_urlopen):
    """An unmapped schema version raises ValueError without issuing any request."""
    unsupported_version = "v0.0.0"

    with pytest.raises(ValueError) as raised:
        load_artifact_by_schema(unsupported_version, ALL_ONTOLOGY_FILENAME)

    expected_message = "Schema version v0.0.0 is not supported in this package version."
    assert expected_message in str(raised.value)
    mock_urlopen.assert_not_called()


def test_load_artifact_by_schema__http_error(mock_urlopen):
    """An HTTPError raised by urlopen for the requested asset is surfaced as a ValueError."""
    missing_asset = "missing.json"

    with pytest.raises(ValueError) as raised:
        load_artifact_by_schema("5.0.0", missing_asset)

    expected_fragment = "Could not get missing.json for schema version 5.0.0 in GitHub Release Assets"
    assert expected_fragment in str(raised.value)
    mock_urlopen.assert_called_once()


def test_load_artifact_by_schema__url_error(mock_urlopen_url_error):
    """A URLError raised by urlopen is surfaced as a ValueError carrying the error's reason."""
    asset_name = "all_ontology.json.gz"

    with pytest.raises(ValueError) as raised:
        load_artifact_by_schema("5.0.0", asset_name)

    expected_fragment = "URL error occurred: Network Unreachable"
    assert expected_fragment in str(raised.value)
    mock_urlopen_url_error.assert_called_once()
39 changes: 32 additions & 7 deletions api/python/tests/test_ontology_parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import gzip
import json
from unittest.mock import patch

import pytest
from cellxgene_ontology_guide.constants import ALL_ONTOLOGY_FILENAME, ONTOLOGY_INFO_FILENAME
from cellxgene_ontology_guide.ontology_parser import OntologyParser


@pytest.fixture(scope="module")
def ontology_dict():
return {
ontology_dict = {
"CL": {
"CL:0000000": {"ancestors": [], "label": "cell A", "deprecated": False},
"CL:0000001": {
Expand All @@ -25,19 +30,31 @@ def ontology_dict():
"CL:0000004": {"ancestors": ["CL:0000001", "CL:0000000"], "label": "cell B2", "deprecated": False},
}
}
return gzip.compress(json.dumps(ontology_dict).encode("utf-8"))


@pytest.fixture(scope="module")
def supported_ontologies():
return {"CL": {"version": "2024-01-01", "source": "http://example.com", "filetype": "owl"}}
return b'{"CL": {"version": "2024-01-01", "source": "http://example.com", "filetype": "owl"}}'


@pytest.fixture(scope="module")
def mock_load_artifact_by_schema(ontology_dict, supported_ontologies):
    """Patch ``load_artifact_by_schema`` as imported by ``ontology_parser`` to serve canned payloads.

    The stand-in ignores the schema version and answers by filename only; any
    filename other than the two known assets yields ``None``.
    """
    canned_assets = {
        ALL_ONTOLOGY_FILENAME: ontology_dict,
        ONTOLOGY_INFO_FILENAME: supported_ontologies,
    }

    def get_mock_artifact_by_schema(schema_version, filename):
        return canned_assets.get(filename)

    target = "cellxgene_ontology_guide.ontology_parser.load_artifact_by_schema"
    with patch(target, side_effect=get_mock_artifact_by_schema) as mocked_loader:
        yield mocked_loader


@pytest.fixture(scope="module")
def ontology_parser(ontology_dict, supported_ontologies):
parser = OntologyParser()
parser.ontology_dict = ontology_dict
parser.supported_ontologies = supported_ontologies
return parser
def ontology_parser(mock_load_artifact_by_schema):
return OntologyParser(schema_version="5.0.0")


def test_parse_ontology_name(ontology_parser):
Expand Down Expand Up @@ -110,3 +127,11 @@ def test_get_term_metadata(ontology_parser):

def test_get_term_label(ontology_parser):
    """The label for a known term ID is returned as stored in the fixture ontology."""
    label = ontology_parser.get_term_label("CL:0000004")
    assert label == "cell B2"


def test__init__multiple_ontology_parsers(mock_load_artifact_by_schema, ontology_parser):
    """Constructing a parser twice for one schema version yields the same object; another version does not."""
    same_version_parser = OntologyParser(schema_version="5.0.0")
    other_version_parser = OntologyParser(schema_version="4.0.0")

    assert same_version_parser is ontology_parser
    assert other_version_parser is not ontology_parser
Binary file modified ontology-assets/all_ontology.json.gz
Binary file not shown.
47 changes: 47 additions & 0 deletions ontology-assets/ontology_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"CL": {
"version": "v2024-01-04",
"source": "https://github.com/obophenotype/cell-ontology/releases/download",
"filetype": "owl"
},
"EFO": {
"version": "v3.62.0",
"source": "https://github.com/EBISPOT/efo/releases/download",
"filetype": "owl"
},
"HANCESTRO": {
"version": "3.0",
"source": "https://github.com/EBISPOT/hancestro/raw",
"filetype": "owl"
},
"HsapDv": {
"version": "11",
"source": "http://aber-owl.net/media/ontologies/HSAPDV",
"filetype": "owl"
},
"MONDO": {
"version": "v2024-01-03",
"source": "https://github.com/monarch-initiative/mondo/releases/download",
"filetype": "owl"
},
"MmusDv": {
"version": "9",
"source": "http://aber-owl.net/media/ontologies/MMUSDV",
"filetype": "owl"
},
"NCBITaxon": {
"version": "v2023-06-20",
"source": "https://github.com/obophenotype/ncbitaxon/releases/download",
"filetype": "owl.gz"
},
"UBERON": {
"version": "v2024-01-18",
"source": "https://github.com/obophenotype/uberon/releases/download",
"filetype": "owl"
},
"PATO": {
"version": "v2023-05-18",
"source": "https://github.com/pato-ontology/pato/raw",
"filetype": "owl"
}
}
Loading

0 comments on commit 58bad0a

Please sign in to comment.