From 5bbe5683e39c888d0d49c6e8796045c9d8e2d372 Mon Sep 17 00:00:00 2001 From: Janina Esins Date: Thu, 7 Nov 2024 18:15:05 +0100 Subject: [PATCH 1/8] throw MExError for label and email conflicts. --- mex/common/organigram/extract.py | 42 ++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/mex/common/organigram/extract.py b/mex/common/organigram/extract.py index 173d8728..4ab44514 100644 --- a/mex/common/organigram/extract.py +++ b/mex/common/organigram/extract.py @@ -1,6 +1,7 @@ import json from collections.abc import Generator, Iterable +from mex.common.exceptions import MExError from mex.common.logging import watch from mex.common.models import ExtractedOrganizationalUnit from mex.common.organigram.models import OrganigramUnit @@ -51,6 +52,7 @@ def get_unit_merged_ids_by_synonyms( """Return a mapping from unit alt_label and label to their merged IDs. There will be multiple entries per unit mapping to the same merged ID. + But if the same entry mapps to differnt merged unit IDs, an error is thrown. Args: extracted_units: Iterable of extracted units @@ -58,11 +60,20 @@ def get_unit_merged_ids_by_synonyms( Returns: Mapping from unit synonyms to stableTargetIds """ - return { - synonym: MergedOrganizationalUnitIdentifier(extracted_unit.stableTargetId) - for extracted_unit in extracted_units - for synonym in _get_synonyms(extracted_unit) - } + synonym_dict: dict[str, MergedOrganizationalUnitIdentifier] = {} + for extracted_unit in extracted_units: + for synonym in _get_synonyms(extracted_unit): + if ( + synonym in synonym_dict + and synonym_dict[synonym] != extracted_unit.stableTargetId + ): + msg = ( + f"Conflict: label '{synonym}' is associated with merged unit ID " + f"{synonym_dict[synonym]} and {extracted_unit.stableTargetId}." + ) + raise MExError(msg) + synonym_dict[synonym] = extracted_unit.stableTargetId + return synonym_dict def get_unit_merged_ids_by_emails( @@ -71,6 +82,7 @@ def get_unit_merged_ids_by_emails( """Return a mapping from unit emails to their merged IDs. There may be multiple emails per unit mapping to the same merged ID. + But if the same email mapps to differnt merged unit IDs, an error is thrown. Args: extracted_units: Iterable of extracted units @@ -78,8 +90,18 @@ def get_unit_merged_ids_by_emails( Returns: Mapping from lowercased `email` to stableTargetIds """ - return { - email.lower(): MergedOrganizationalUnitIdentifier(extracted_unit.stableTargetId) - for extracted_unit in extracted_units - for email in extracted_unit.email - } + email_dict: dict[str, MergedOrganizationalUnitIdentifier] = {} + for extracted_unit in extracted_units: + for email in extracted_unit.email: + lower_email = email.lower() + if ( + lower_email in email_dict + and email_dict[lower_email] != extracted_unit.stableTargetId + ): + msg = ( + f"Conflict: email {email} is associated with merged unit ID " + f"{email_dict[lower_email]} and {extracted_unit.stableTargetId}." + ) + raise MExError(msg) + email_dict[lower_email] = extracted_unit.stableTargetId + return email_dict From acd7335619af39313cc0c7eb738dc4bd0e41a507 Mon Sep 17 00:00:00 2001 From: Janina Esins Date: Thu, 7 Nov 2024 18:16:00 +0100 Subject: [PATCH 2/8] correct description of method. --- mex/common/backend_api/connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mex/common/backend_api/connector.py b/mex/common/backend_api/connector.py index 2ee350c1..2912d0ae 100644 --- a/mex/common/backend_api/connector.py +++ b/mex/common/backend_api/connector.py @@ -140,7 +140,7 @@ def get_merged_item( identifier: The merged item's identifier Raises: - MExError: If no merged item was found + HTTPError: If no merged item was found Returns: A single merged item From b4eaf598e30b099fafb8348c6db74ee54a5ab4e6 Mon Sep 17 00:00:00 2001 From: Janina Esins Date: Thu, 7 Nov 2024 18:18:12 +0100 Subject: [PATCH 3/8] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d642664..8316291d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- organigram extraction checks for duplicate emails/labels in different organigram units ### Changes From 6240a4a158df91426776724662bd07ff0b10f401 Mon Sep 17 00:00:00 2001 From: Janina Esins Date: Thu, 7 Nov 2024 18:31:29 +0100 Subject: [PATCH 4/8] fix typos --- mex/common/organigram/extract.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mex/common/organigram/extract.py b/mex/common/organigram/extract.py index 4ab44514..5b535ede 100644 --- a/mex/common/organigram/extract.py +++ b/mex/common/organigram/extract.py @@ -52,7 +52,7 @@ def get_unit_merged_ids_by_synonyms( """Return a mapping from unit alt_label and label to their merged IDs. There will be multiple entries per unit mapping to the same merged ID. - But if the same entry mapps to differnt merged unit IDs, an error is thrown. + But if the same entry mapps to different merged IDs, an error is thrown. Args: extracted_units: Iterable of extracted units @@ -68,7 +68,7 @@ def get_unit_merged_ids_by_synonyms( and synonym_dict[synonym] != extracted_unit.stableTargetId ): msg = ( - f"Conflict: label '{synonym}' is associated with merged unit ID " + f"Conflict: label '{synonym}' is associated with merged unit IDs " f"{synonym_dict[synonym]} and {extracted_unit.stableTargetId}." ) raise MExError(msg) @@ -82,7 +82,7 @@ def get_unit_merged_ids_by_emails( """Return a mapping from unit emails to their merged IDs. There may be multiple emails per unit mapping to the same merged ID. - But if the same email mapps to differnt merged unit IDs, an error is thrown. + But if the same email mapps to different merged IDs, an error is thrown. Args: extracted_units: Iterable of extracted units @@ -99,7 +99,7 @@ def get_unit_merged_ids_by_emails( and email_dict[lower_email] != extracted_unit.stableTargetId ): msg = ( - f"Conflict: email {email} is associated with merged unit ID " + f"Conflict: email {email} is associated with merged unit IDs " f"{email_dict[lower_email]} and {extracted_unit.stableTargetId}." ) raise MExError(msg) From 703e5a8277ac6951d8de12054dd6571b1ed54837 Mon Sep 17 00:00:00 2001 From: Janina Esins Date: Thu, 7 Nov 2024 19:06:29 +0100 Subject: [PATCH 5/8] test for same email in different units --- tests/organigram/test_extract.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/organigram/test_extract.py b/tests/organigram/test_extract.py index 44c71d8b..d50af5b7 100644 --- a/tests/organigram/test_extract.py +++ b/tests/organigram/test_extract.py @@ -1,3 +1,6 @@ +import pytest + +from mex.common.exceptions import MExError from mex.common.models import ExtractedOrganizationalUnit from mex.common.organigram.extract import ( extract_organigram_units, @@ -52,3 +55,20 @@ def test_get_unit_merged_ids_by_emails( "pu@example.com": extracted_parent_unit.stableTargetId, # child unit has no emails } + + +def test_get_unit_merged_ids_by_emails_error( + extracted_child_unit: ExtractedOrganizationalUnit, + extracted_parent_unit: ExtractedOrganizationalUnit, +) -> None: + erroneus_extracted_child_unit = extracted_child_unit + erroneus_extracted_child_unit.email.append("PARENT@example.com") + + msg = ( + "MExError: Conflict: email PARENT@example.com is associated with " + "merged unit IDs 6rqNvZSApUHlz8GkkVP48 and hIiJpZXVppHvoyeP0QtAoS." + ) + with pytest.raises(MExError, match=msg): + get_unit_merged_ids_by_emails( + [erroneus_extracted_child_unit, extracted_parent_unit] + ) From 53852ada85b0aa88d2401b4ebc489dd8e397cc4f Mon Sep 17 00:00:00 2001 From: Janina Esins Date: Thu, 7 Nov 2024 19:11:40 +0100 Subject: [PATCH 6/8] improve test for same email in different units. --- mex/common/organigram/extract.py | 2 +- tests/organigram/test_extract.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mex/common/organigram/extract.py b/mex/common/organigram/extract.py index 5b535ede..2ed1ede5 100644 --- a/mex/common/organigram/extract.py +++ b/mex/common/organigram/extract.py @@ -99,7 +99,7 @@ def get_unit_merged_ids_by_emails( and email_dict[lower_email] != extracted_unit.stableTargetId ): msg = ( - f"Conflict: email {email} is associated with merged unit IDs " + f"Conflict: email '{email}' is associated with merged unit IDs " f"{email_dict[lower_email]} and {extracted_unit.stableTargetId}." ) raise MExError(msg) diff --git a/tests/organigram/test_extract.py b/tests/organigram/test_extract.py index d50af5b7..b0b03805 100644 --- a/tests/organigram/test_extract.py +++ b/tests/organigram/test_extract.py @@ -65,8 +65,9 @@ def test_get_unit_merged_ids_by_emails_error( erroneus_extracted_child_unit.email.append("PARENT@example.com") msg = ( - "MExError: Conflict: email PARENT@example.com is associated with " - "merged unit IDs 6rqNvZSApUHlz8GkkVP48 and hIiJpZXVppHvoyeP0QtAoS." + f"MExError: Conflict: email 'PARENT@example.com' is associated with " + f"merged unit IDs {erroneus_extracted_child_unit.stableTargetId} and " + f"{extracted_parent_unit.stableTargetId}." ) with pytest.raises(MExError, match=msg): get_unit_merged_ids_by_emails( From 2f95c38ea55827b90e02f143e7d77252fa31df08 Mon Sep 17 00:00:00 2001 From: Janina Esins Date: Tue, 12 Nov 2024 10:55:42 +0100 Subject: [PATCH 7/8] improve docstring and add test for MExError --- mex/common/organigram/extract.py | 9 +++++++-- tests/organigram/test_extract.py | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/mex/common/organigram/extract.py b/mex/common/organigram/extract.py index 2ed1ede5..99ab2519 100644 --- a/mex/common/organigram/extract.py +++ b/mex/common/organigram/extract.py @@ -52,11 +52,13 @@ def get_unit_merged_ids_by_synonyms( """Return a mapping from unit alt_label and label to their merged IDs. There will be multiple entries per unit mapping to the same merged ID. - But if the same entry mapps to different merged IDs, an error is thrown. Args: extracted_units: Iterable of extracted units + Raises: + MExError: If the same entry maps to different merged IDs + Returns: Mapping from unit synonyms to stableTargetIds """ @@ -82,11 +84,14 @@ def get_unit_merged_ids_by_emails( """Return a mapping from unit emails to their merged IDs. There may be multiple emails per unit mapping to the same merged ID. - But if the same email mapps to different merged IDs, an error is thrown. + But if the same email maps to different merged IDs, an error is thrown. Args: extracted_units: Iterable of extracted units + Raises: + MExError: If the same entry maps to different merged IDs + Returns: Mapping from lowercased `email` to stableTargetIds """ diff --git a/tests/organigram/test_extract.py b/tests/organigram/test_extract.py index b0b03805..c939cc8f 100644 --- a/tests/organigram/test_extract.py +++ b/tests/organigram/test_extract.py @@ -8,6 +8,7 @@ get_unit_merged_ids_by_synonyms, ) from mex.common.organigram.models import OrganigramUnit +from mex.common.types import Text def test_extract_organigram_units( @@ -43,6 +44,24 @@ def test_get_unit_merged_ids_by_synonyms( } +def test_get_unit_merged_ids_by_synonyms_error( + extracted_child_unit: ExtractedOrganizationalUnit, + extracted_parent_unit: ExtractedOrganizationalUnit, +) -> None: + erroneus_extracted_child_unit = extracted_child_unit + erroneus_extracted_child_unit.name.append(Text(value="PARENT Dept.")) + + msg = ( + f"MExError: Conflict: label 'PARENT Dept.' is associated with " + f"merged unit IDs {erroneus_extracted_child_unit.stableTargetId} and " + f"{extracted_parent_unit.stableTargetId}." + ) + with pytest.raises(MExError, match=msg): + get_unit_merged_ids_by_synonyms( + [erroneus_extracted_child_unit, extracted_parent_unit] + ) + + def test_get_unit_merged_ids_by_emails( extracted_child_unit: ExtractedOrganizationalUnit, extracted_parent_unit: ExtractedOrganizationalUnit, From e1523bc9ccd661d76d7572fa93c2efb2ae54c82a Mon Sep 17 00:00:00 2001 From: Janina Esins Date: Tue, 12 Nov 2024 11:02:10 +0100 Subject: [PATCH 8/8] fixed redundance in doc string --- mex/common/organigram/extract.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mex/common/organigram/extract.py b/mex/common/organigram/extract.py index 99ab2519..726302a1 100644 --- a/mex/common/organigram/extract.py +++ b/mex/common/organigram/extract.py @@ -84,7 +84,6 @@ def get_unit_merged_ids_by_emails( """Return a mapping from unit emails to their merged IDs. There may be multiple emails per unit mapping to the same merged ID. - But if the same email maps to different merged IDs, an error is thrown. Args: extracted_units: Iterable of extracted units