Skip to content

Commit

Permalink
feature/bump mex-common to 0.43.0 (#300)
Browse files Browse the repository at this point in the history
### PR Context

- upgrade to robert-koch-institut/mex-common#347
- move away from the "extracted data" naming in favor of "extracted items",
so that naming is consistent

### Changes

- BREAKING: rename artificial provider function `extracted_data` to
`extracted_items`
- prefer concrete unions over base classes for merged and extracted item
typing
  • Loading branch information
cutoffthetop authored Dec 11, 2024
1 parent a717b4d commit 3211cfe
Show file tree
Hide file tree
Showing 14 changed files with 64 additions and 54 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changes
- extractors now use wikidata helper function

- BREAKING: rename artificial provider function `extracted_data` to `extracted_items`
- prefer concrete unions over base classes for merged and extracted item typing

### Deprecated

### Removed
Expand All @@ -22,6 +25,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [0.22.0] - 2024-12-10

### Changes

- wrap up ifsg model v3 update
- wrap up seq-repo model v3 update

## [0.21.0] - 2024-11-19

### Added
Expand Down
2 changes: 1 addition & 1 deletion mex/extractors/artificial/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def factories(faker: Faker, identities: IdentityMap) -> Faker:
def artificial_data(factories: Faker, identities: IdentityMap) -> None:
"""Create artificial data and load the models to the sinks."""
restore_identities(identities) # restore state of memory identity provider
load(m for c in EXTRACTED_MODEL_CLASSES for m in factories.extracted_data(c))
load(m for c in EXTRACTED_MODEL_CLASSES for m in factories.extracted_items(c))


@entrypoint(Settings)
Expand Down
10 changes: 6 additions & 4 deletions mex/extractors/artificial/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pydantic.fields import FieldInfo

from mex.common.identity import Identity
from mex.common.models import ExtractedData
from mex.common.models import AnyExtractedModel
from mex.common.types import (
TEMPORAL_ENTITY_FORMATS_BY_PRECISION,
UTC,
Expand Down Expand Up @@ -102,8 +102,10 @@ def field_value(
raise RuntimeError(msg)
return [factory() for _ in range(self.pyint(*self.min_max_for_field(field)))]

def extracted_data(self, model: type[ExtractedData]) -> list[ExtractedData]:
"""Get a list of extracted data instances for the given model class."""
def extracted_items(
self, model: type[AnyExtractedModel]
) -> list[AnyExtractedModel]:
"""Get a list of extracted items for the given model class."""
models = []
for identity in cast(list[Identity], self.generator.identities(model)):
# manually set identity related fields
Expand All @@ -130,7 +132,7 @@ def __init__(self, factory: Any, identities: IdentityMap) -> None:
super().__init__(factory)
self._identities = identities

def identities(self, model: type[ExtractedData]) -> list[Identity]:
def identities(self, model: type[AnyExtractedModel]) -> list[Identity]:
"""Return a list of identities for the given model class."""
return self._identities[model.__name__.removeprefix("Extracted")]

Expand Down
2 changes: 1 addition & 1 deletion mex/extractors/confluence_vvt/parse_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def get_clean_current_row_all_cols_data(
def get_interne_vorgangsnummer_from_all_rows_data(
intnmr_dict: Any | None | list[str],
) -> list[str] | Any:
"""Get Interne Vorgangsnummer from the table extracted data.
"""Get Interne Vorgangsnummer from the extracted table.
Args:
intnmr_dict: Extracted dict or list of Interne Vorgangsnummer
Expand Down
4 changes: 2 additions & 2 deletions mex/extractors/publisher/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

from mex.common.backend_api.connector import BackendApiConnector
from mex.common.logging import logger
from mex.common.models import MergedItem
from mex.common.models import AnyMergedModel


def get_merged_items() -> Generator[MergedItem, None, None]:
def get_merged_items() -> Generator[AnyMergedModel, None, None]:
"""Read merged items from backend."""
connector = BackendApiConnector.get()

Expand Down
6 changes: 3 additions & 3 deletions mex/extractors/publisher/filter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from collections.abc import Generator, Iterable

from mex.common.logging import logger
from mex.common.models import MergedItem
from mex.common.models import AnyMergedModel
from mex.extractors.settings import Settings


def filter_merged_items(
items: Iterable[MergedItem],
) -> Generator[MergedItem, None, None]:
items: Iterable[AnyMergedModel],
) -> Generator[AnyMergedModel, None, None]:
"""Filter to be published items by allow list."""
settings = Settings.get()

Expand Down
4 changes: 2 additions & 2 deletions mex/extractors/publisher/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
from collections.abc import Iterable

from mex.common.logging import logger
from mex.common.models import MergedItem
from mex.common.models import AnyMergedModel
from mex.common.settings import BaseSettings
from mex.common.transform import MExEncoder


def write_merged_items(items: Iterable[MergedItem]) -> None:
def write_merged_items(items: Iterable[AnyMergedModel]) -> None:
"""Write the incoming items into a new-line delimited JSON file."""
settings = BaseSettings.get()
ndjson_path = settings.work_dir / "publisher.ndjson"
Expand Down
14 changes: 7 additions & 7 deletions mex/extractors/publisher/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from mex.common.backend_api.models import MergedItemsResponse
from mex.common.cli import entrypoint
from mex.common.models import MergedItem
from mex.extractors.pipeline import asset, run_job_in_process
from mex.extractors.publisher.extract import get_merged_items
from mex.extractors.publisher.filter import filter_merged_items
Expand All @@ -8,17 +8,17 @@


@asset(group_name="publisher")
def extract_and_filter_merged_items() -> list[MergedItem]:
def extract_and_filter_merged_items() -> MergedItemsResponse:
"""Get merged items from mex-backend and filter them by allow-list."""
items = get_merged_items()

return list(filter_merged_items(items))
filtered = list(filter_merged_items(items))
return MergedItemsResponse(items=filtered, total=len(filtered))


@asset(group_name="publisher")
def publish_merged_items(extract_and_filter_merged_items: list[MergedItem]) -> None:
"""Write recieved merged items to ndjson file."""
write_merged_items(extract_and_filter_merged_items)
def publish_merged_items(extract_and_filter_merged_items: MergedItemsResponse) -> None:
"""Write received merged items to ndjson file."""
write_merged_items(extract_and_filter_merged_items.items)


@entrypoint(Settings)
Expand Down
6 changes: 3 additions & 3 deletions mex/extractors/sinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
from itertools import tee

from mex.common.exceptions import MExError
from mex.common.models import ExtractedData
from mex.common.models import AnyExtractedModel
from mex.common.settings import BaseSettings
from mex.common.sinks.backend_api import post_to_backend_api
from mex.common.sinks.ndjson import write_ndjson
from mex.common.types import Identifier, Sink


def load(models: Iterable[ExtractedData]) -> None:
def load(models: Iterable[AnyExtractedModel]) -> None:
"""Load models to the backend API or write to NDJSON files.
Args:
Expand All @@ -19,7 +19,7 @@ def load(models: Iterable[ExtractedData]) -> None:
sink: Where to load the provided models
"""
settings = BaseSettings.get()
func: Callable[[Iterable[ExtractedData]], Iterable[Identifier]]
func: Callable[[Iterable[AnyExtractedModel]], Iterable[Identifier]]

for sink, model_gen in zip(
settings.sink, tee(models, len(settings.sink)), strict=False
Expand Down
8 changes: 4 additions & 4 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ dependencies = [
"dagster-webserver>=1,<2",
"dagster>=1,<2",
"faker>=33,<34",
"mex-common @ git+https://github.com/robert-koch-institut/mex-common.git@0.42.0",
"mex-common @ git+https://github.com/robert-koch-institut/mex-common.git@0.43.0",
"numpy>=2,<3",
"openpyxl>=3,<4",
"pandas>=2,<3",
Expand Down
4 changes: 2 additions & 2 deletions tests/artificial/test_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,8 @@ def test_builder_provider_field_value_error(faker: Faker) -> None:
faker.field_value(field, identity)


def test_builder_provider_extracted_data(faker: Faker) -> None:
models = faker.extracted_data(ExtractedContactPoint)
def test_builder_provider_extracted_items(faker: Faker) -> None:
models = faker.extracted_items(ExtractedContactPoint)
assert models[0].model_dump(exclude_defaults=True) == {
"email": [
"salazarmaria@example.com",
Expand Down
36 changes: 18 additions & 18 deletions tests/sumo/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,19 @@ def test_extract_cc1_data_model_nokeda() -> None:
element_label="Name des EDIS",
element_label_en="Name of EDIS",
)
extracted_data = list(extract_cc1_data_model_nokeda())
assert len(extracted_data) == 3
assert extracted_data[0] == expected
extracted = list(extract_cc1_data_model_nokeda())
assert len(extracted) == 3
assert extracted[0] == expected


def test_extract_cc1_data_valuesets() -> None:
expected = Cc1DataValuesets(
category_label_de="Herzstillstand (nicht traumatisch)",
sheet_name="nokeda_cedis",
)
extracted_data = list(extract_cc1_data_valuesets())
assert len(extracted_data) == 6
assert extracted_data[0] == expected
extracted = list(extract_cc1_data_valuesets())
assert len(extracted) == 6
assert extracted[0] == expected


def test_extract_cc2_aux_mapping(
Expand All @@ -54,9 +54,9 @@ def test_extract_cc2_aux_mapping(
expected = Cc2AuxMapping(
variable_name_column=["0", "1", "2"], sheet_name="nokeda_age21"
)
extracted_data = list(extract_cc2_aux_mapping(cc2_aux_model))
assert len(extracted_data) == 2
assert extracted_data[0] == expected
extracted = list(extract_cc2_aux_mapping(cc2_aux_model))
assert len(extracted) == 2
assert extracted[0] == expected


def test_extract_cc2_aux_model() -> None:
Expand All @@ -67,16 +67,16 @@ def test_extract_cc2_aux_model() -> None:
in_database_static=True,
variable_name="aux_age21_min",
)
extracted_data = list(extract_cc2_aux_model())
assert len(extracted_data) == 2
assert extracted_data[0] == expected
extracted = list(extract_cc2_aux_model())
assert len(extracted) == 2
assert extracted[0] == expected


def test_extract_cc2_aux_valuesets() -> None:
expected = Cc2AuxValuesets(label_de="Kardiovaskulär", label_en="Cardiovascular")
extracted_data = list(extract_cc2_aux_valuesets())
assert len(extracted_data) == 3
assert extracted_data[0] == expected
extracted = list(extract_cc2_aux_valuesets())
assert len(extracted) == 3
assert extracted[0] == expected


def test_extract_cc2_feat_projection() -> None:
Expand All @@ -88,9 +88,9 @@ def test_extract_cc2_feat_projection() -> None:
feature_name_de="Respiratorisches Syncytial-Virus, spezifisch",
feature_description="specific RSV-ICD-10 codes",
)
extracted_data = list(extract_cc2_feat_projection())
assert len(extracted_data) == 3
assert extracted_data[0] == expected
extracted = list(extract_cc2_feat_projection())
assert len(extracted) == 3
assert extracted[0] == expected


@pytest.mark.usefixtures("mocked_ldap")
Expand Down
12 changes: 6 additions & 6 deletions tests/sumo/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
def test_filter_and_log_variables(
extracted_primary_sources: dict[str, ExtractedPrimarySource],
) -> None:
extracted_data = extract_cc2_aux_model()
extracted_data_gens = tee(extracted_data, 2)
assert len(list(extracted_data_gens[0])) == 2
extracted_data = filter_and_log_cc2_aux_model(
extracted_data_gens[1], extracted_primary_sources["nokeda"]
extracted_models = extract_cc2_aux_model()
extracted_model_gens = tee(extracted_models, 2)
assert len(list(extracted_model_gens[0])) == 2
extracted_models = filter_and_log_cc2_aux_model(
extracted_model_gens[1], extracted_primary_sources["nokeda"]
)
assert len(list(extracted_data)) == 1
assert len(list(extracted_models)) == 1

0 comments on commit 3211cfe

Please sign in to comment.