From 1e62cc3bf65af69b8c4e318614023ff45ed86e9f Mon Sep 17 00:00:00 2001 From: esinsj <72222648+esinsj@users.noreply.github.com> Date: Tue, 12 Nov 2024 13:12:19 +0100 Subject: [PATCH] Feature/mx 1332 organigram check for duplicates (#324) # Added - organigram extraction checks for duplicate emails/labels in different organigram units --- CHANGELOG.md | 1 + mex/common/backend_api/connector.py | 2 +- mex/common/organigram/extract.py | 46 ++++++++++++++++++++++------- tests/organigram/test_extract.py | 40 +++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d642664..8316291d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- organigram extraction checks for duplicate emails/labels in different organigram units ### Changes diff --git a/mex/common/backend_api/connector.py b/mex/common/backend_api/connector.py index 2ee350c1..2912d0ae 100644 --- a/mex/common/backend_api/connector.py +++ b/mex/common/backend_api/connector.py @@ -140,7 +140,7 @@ def get_merged_item( identifier: The merged item's identifier Raises: - MExError: If no merged item was found + HTTPError: If no merged item was found Returns: A single merged item diff --git a/mex/common/organigram/extract.py b/mex/common/organigram/extract.py index 173d8728..726302a1 100644 --- a/mex/common/organigram/extract.py +++ b/mex/common/organigram/extract.py @@ -1,6 +1,7 @@ import json from collections.abc import Generator, Iterable +from mex.common.exceptions import MExError from mex.common.logging import watch from mex.common.models import ExtractedOrganizationalUnit from mex.common.organigram.models import OrganigramUnit @@ -55,14 +56,26 @@ def get_unit_merged_ids_by_synonyms( Args: extracted_units: Iterable of extracted units + Raises: + MExError: If the same entry maps to different merged IDs + Returns: Mapping from unit synonyms to stableTargetIds """ - return { - synonym: MergedOrganizationalUnitIdentifier(extracted_unit.stableTargetId) - for extracted_unit in extracted_units - for synonym in _get_synonyms(extracted_unit) - } + synonym_dict: dict[str, MergedOrganizationalUnitIdentifier] = {} + for extracted_unit in extracted_units: + for synonym in _get_synonyms(extracted_unit): + if ( + synonym in synonym_dict + and synonym_dict[synonym] != extracted_unit.stableTargetId + ): + msg = ( + f"Conflict: label '{synonym}' is associated with merged unit IDs " + f"{synonym_dict[synonym]} and {extracted_unit.stableTargetId}." + ) + raise MExError(msg) + synonym_dict[synonym] = extracted_unit.stableTargetId + return synonym_dict def get_unit_merged_ids_by_emails( @@ -75,11 +88,24 @@ def get_unit_merged_ids_by_emails( Args: extracted_units: Iterable of extracted units + Raises: + MExError: If the same entry maps to different merged IDs + Returns: Mapping from lowercased `email` to stableTargetIds """ - return { - email.lower(): MergedOrganizationalUnitIdentifier(extracted_unit.stableTargetId) - for extracted_unit in extracted_units - for email in extracted_unit.email - } + email_dict: dict[str, MergedOrganizationalUnitIdentifier] = {} + for extracted_unit in extracted_units: + for email in extracted_unit.email: + lower_email = email.lower() + if ( + lower_email in email_dict + and email_dict[lower_email] != extracted_unit.stableTargetId + ): + msg = ( + f"Conflict: email '{email}' is associated with merged unit IDs " + f"{email_dict[lower_email]} and {extracted_unit.stableTargetId}." + ) + raise MExError(msg) + email_dict[lower_email] = extracted_unit.stableTargetId + return email_dict diff --git a/tests/organigram/test_extract.py b/tests/organigram/test_extract.py index 44c71d8b..c939cc8f 100644 --- a/tests/organigram/test_extract.py +++ b/tests/organigram/test_extract.py @@ -1,3 +1,6 @@ +import pytest + +from mex.common.exceptions import MExError from mex.common.models import ExtractedOrganizationalUnit from mex.common.organigram.extract import ( extract_organigram_units, @@ -5,6 +8,7 @@ get_unit_merged_ids_by_synonyms, ) from mex.common.organigram.models import OrganigramUnit +from mex.common.types import Text def test_extract_organigram_units( @@ -40,6 +44,24 @@ def test_get_unit_merged_ids_by_synonyms( } +def test_get_unit_merged_ids_by_synonyms_error( + extracted_child_unit: ExtractedOrganizationalUnit, + extracted_parent_unit: ExtractedOrganizationalUnit, +) -> None: + erroneus_extracted_child_unit = extracted_child_unit + erroneus_extracted_child_unit.name.append(Text(value="PARENT Dept.")) + + msg = ( + f"MExError: Conflict: label 'PARENT Dept.' is associated with " + f"merged unit IDs {erroneus_extracted_child_unit.stableTargetId} and " + f"{extracted_parent_unit.stableTargetId}." + ) + with pytest.raises(MExError, match=msg): + get_unit_merged_ids_by_synonyms( + [erroneus_extracted_child_unit, extracted_parent_unit] + ) + + def test_get_unit_merged_ids_by_emails( extracted_child_unit: ExtractedOrganizationalUnit, extracted_parent_unit: ExtractedOrganizationalUnit, @@ -52,3 +74,21 @@ def test_get_unit_merged_ids_by_emails( "pu@example.com": extracted_parent_unit.stableTargetId, # child unit has no emails } + + +def test_get_unit_merged_ids_by_emails_error( + extracted_child_unit: ExtractedOrganizationalUnit, + extracted_parent_unit: ExtractedOrganizationalUnit, +) -> None: + erroneus_extracted_child_unit = extracted_child_unit + erroneus_extracted_child_unit.email.append("PARENT@example.com") + + msg = ( + f"MExError: Conflict: email 'PARENT@example.com' is associated with " + f"merged unit IDs {erroneus_extracted_child_unit.stableTargetId} and " + f"{extracted_parent_unit.stableTargetId}." + ) + with pytest.raises(MExError, match=msg): + get_unit_merged_ids_by_emails( + [erroneus_extracted_child_unit, extracted_parent_unit] + )