Skip to content

Commit

Permalink
feature/mx-1352 Revamp primary source loading (#19)
Browse files Browse the repository at this point in the history
- fix typo in `extract_oranigram_units` (now:
`extract_organigram_units`)
- remove dependency on `sqlalchemy`
- remove `BaseSettings.sqlite_path`
- hardcode the stableTargetId of the `mex` primary source to resolve
cyclic dependency
- remove sqlite-based logic
  - `insert_primary_source_into_db` function
  - `seed-mex-db` script
  - `insert_test_primary_sources_into_db` helper function
  - `seed_primary_sources_into_db` fixture
- because the intermediate step of loading the primary sources into sqlite is
now cut out
- they are read directly from the JSON file in assets where they are
maintained
- add `BaseSettings.primary_sources_path` to point to the
`primary-sources.json` file
- replace `transform_mex_db_primary_source_to_extracted_primary_source`
with `transform_seed_primary_sources_to_extracted_primary_sources` which
operates on iterables
- add `extracted_primary_sources` fixture that loads all primary sources
from `primary_sources_path` and returns a mapping of human-readable
identifiers to `ExtractedPrimarySource` objects
- this removes the need for creating individual
`extracted_foo_primary_source` fixtures
  • Loading branch information
cutoffthetop authored Aug 29, 2023
1 parent 0cd242b commit 16cf77f
Show file tree
Hide file tree
Showing 38 changed files with 274 additions and 1,081 deletions.
1 change: 0 additions & 1 deletion .mypy.ini
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[mypy]
python_version = 3.11
follow_imports = silent
plugins = sqlalchemy.ext.mypy.plugin
show_error_codes = True
strict = True

Expand Down
11 changes: 5 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,20 @@ repos:
additional_dependencies: [".[toml]"]
exclude: "test_"
- repo: https://github.com/python-poetry/poetry
rev: 1.5.1
rev: 1.6.0
hooks:
- id: poetry-check
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.4.1
rev: v1.5.1
hooks:
- id: mypy
name: mypy
files: ^mex/
additional_dependencies:
- "backoff>=2.2.1,<3"
- "click>=8.1.6,<9"
- "pydantic[dotenv,email]>=1.10.11,<2"
- "click>=8.1.7,<9"
- "pydantic[dotenv,email]>=1.10.12,<2"
- "pytest>=7.4.0,<8"
- "sqlalchemy[mypy]>=1.4.49,<2"
- "types-pytz>=2023.3.0.0,<2024"
- "types-requests>=2.31.0.2,<3"
- "types-setuptools>=68.0.0.3,<69"
- "types-setuptools>=68.1.0.0,<69"
Empty file.
11 changes: 0 additions & 11 deletions assets/mappings/dummy-primary-source/primary-source.csv

This file was deleted.

58 changes: 0 additions & 58 deletions assets/raw-data/platform/platforms.json

This file was deleted.

38 changes: 38 additions & 0 deletions assets/raw-data/primary-sources/primary-sources.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
[
{
"identifier": "mex",
"title": [
{
"language": "en",
"value": "Metadata Exchange"
}
]
},
{
"identifier": "organigram",
"title": [
{
"language": "en",
"value": "Organizational Units"
}
]
},
{
"identifier": "ldap",
"title": [
{
"language": "en",
"value": "Active Directory"
}
]
},
{
"identifier": "wikidata",
"title": [
{
"language": "en",
"value": "Wikidata APIs"
}
]
}
]
1 change: 0 additions & 1 deletion mex/common/backend_api/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from mex.common.backend_api.models import BulkInsertResponse
from mex.common.connector import HTTPConnector
from mex.common.models import MExModel
from mex.common.models.base import MExModel
from mex.common.settings import BaseSettings
from mex.common.types import Identifier

Expand Down
2 changes: 1 addition & 1 deletion mex/common/ldap/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,5 +212,5 @@ def analyse_person_string(string: str) -> list[PersonName]:
if len(split) == 2:
return [PersonName(surname=split[1], given_name=split[0], full_name=full_name)]

# found noone
# found no one
return []
9 changes: 8 additions & 1 deletion mex/common/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
ExtractedDistribution,
MergedDistribution,
)
from mex.common.models.extracted_data import BaseExtractedData, ExtractedData
from mex.common.models.extracted_data import (
MEX_PRIMARY_SOURCE_IDENTIFIER_IN_PRIMARY_SOURCE,
MEX_PRIMARY_SOURCE_STABLE_TARGET_ID,
BaseExtractedData,
ExtractedData,
)
from mex.common.models.merged_item import MergedItem
from mex.common.models.organization import (
BaseOrganization,
Expand Down Expand Up @@ -82,6 +87,8 @@
"MergedResource",
"MergedVariable",
"MergedVariableGroup",
"MEX_PRIMARY_SOURCE_IDENTIFIER_IN_PRIMARY_SOURCE",
"MEX_PRIMARY_SOURCE_STABLE_TARGET_ID",
"MExModel",
)

Expand Down
36 changes: 21 additions & 15 deletions mex/common/models/extracted_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from mex.common.models.base import MExModel
from mex.common.types import Identifier, PrimarySourceID

MEX_PRIMARY_SOURCE_STABLE_TARGET_ID = PrimarySourceID("00000000000000")
MEX_PRIMARY_SOURCE_IDENTIFIER_IN_PRIMARY_SOURCE = "mex"


class BaseExtractedData(MExModel):
"""Base model class definition for all extracted data instances."""
Expand Down Expand Up @@ -39,20 +42,22 @@ def set_identifiers(cls, values: dict[str, Any]) -> dict[str, Any]:
"""Ensure identifier and provenance attributes are set for this instance.
A lookup is performed to determine whether this extracted data instance already
has an identifier
or merged ID. If not, new ones are generated and the association remembered.
has an `identifier` or `stableTargetId`.
If not, new ones are generated and the association remembered.
If the `identifier` field has been set manually, e.g. passed to the constructor,
we check that it is already present in the database and is assigned to the same
extracted data instance: it must be the same combination of
we check that it is already present in the identity provider and is assigned to
the same extracted data instance: it must be the same combination of
`identifier`, `hadPrimarySource` and `identifierInPrimarySource`.
If the identity is not found or the `fragment_id` differs, an error is thrown.
If the identity is not found or the `identifier` differs, an error is thrown.
An exception is made for the MEx primary source which serves as the root node
for all relations: it may set the `stableTargetId` manually.
Args:
values: Raw values to validate
Raises:
ValueError: If `identifier` was supplied but does not match the database
ValueError: If `identifier` was supplied but does not match the id provider
ValueError: If `identifierInPrimarySource` was missing
ValueError: If `hadPrimarySource` was missing
Expand Down Expand Up @@ -81,19 +86,20 @@ def set_identifiers(cls, values: dict[str, Any]) -> dict[str, Any]:
# validate extracted data ID
if identifier := values.get("identifier"):
if identity is None:
raise ValueError("Identifier not found in connected identity map.")
if identity.fragment_id != str(identifier):
raise ValueError("Identifier not found by identity provider.")
if identity.identifier != str(identifier):
raise ValueError("Identifier cannot be set manually to new value.")

# validate stable target ID
if stable_target_id := values.get("stableTargetId"):
if identity is None:
raise ValueError(
"Stable target ID not found in connected identity map."
)
if (
identity is None
and stable_target_id != MEX_PRIMARY_SOURCE_STABLE_TARGET_ID
):
raise ValueError("Stable target ID not found by identity provider.")
stable_target_id = Identifier(stable_target_id)
elif identity:
stable_target_id = Identifier(identity.merged_id)
stable_target_id = Identifier(identity.stableTargetId)
else:
stable_target_id = Identifier.generate()

Expand All @@ -106,6 +112,6 @@ def set_identifiers(cls, values: dict[str, Any]) -> dict[str, Any]:
)

# update instance values
values["identifier"] = Identifier(identity.fragment_id)
values["stableTargetId"] = Identifier(identity.merged_id)
values["identifier"] = Identifier(identity.identifier)
values["stableTargetId"] = Identifier(identity.stableTargetId)
return values
2 changes: 1 addition & 1 deletion mex/common/organigram/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


@watch
def extract_oranigram_units() -> Generator[OrganigramUnit, None, None]:
def extract_organigram_units() -> Generator[OrganigramUnit, None, None]:
"""Extract organizational units from the organigram JSON file.
Settings:
Expand Down
17 changes: 6 additions & 11 deletions mex/common/organigram/transform.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,20 @@
from typing import Generator, Iterable

from mex.common.logging import watch
from mex.common.models import ExtractedOrganizationalUnit
from mex.common.models import ExtractedOrganizationalUnit, ExtractedPrimarySource
from mex.common.organigram.models import OrganigramUnit
from mex.common.types import (
Email,
OrganizationalUnitID,
PrimarySourceID,
Text,
TextLanguage,
)
from mex.common.types import Email, OrganizationalUnitID, Text, TextLanguage


@watch
def transform_organigram_units_to_organizational_units(
units: Iterable[OrganigramUnit],
units: Iterable[OrganigramUnit], primary_source: ExtractedPrimarySource
) -> Generator[ExtractedOrganizationalUnit, None, None]:
"""Transform organigram units into ExtractedOrganizationalUnit .
"""Transform organigram units into ExtractedOrganizationalUnits.
Args:
units: Iterable of organigram units coming from the JSON file
primary_source: Primary source for organigram
Returns:
Generator for ExtractedOrganizationalUnit
Expand All @@ -30,7 +25,7 @@ def transform_organigram_units_to_organizational_units(
for unit in units:
extracted_unit = ExtractedOrganizationalUnit( # type: ignore[call-arg]
identifierInPrimarySource=unit.identifier,
hadPrimarySource=PrimarySourceID.generate(seed=0), # TODO stopgap mx-603
hadPrimarySource=primary_source.stableTargetId,
alternativeName=[Text(value=name) for name in unit.alternativeName],
email=[Email(email) for email in unit.email],
name=[
Expand Down
Loading

0 comments on commit 16cf77f

Please sign in to comment.