diff --git a/.github/workflows/check_smoke_tests.yml b/.github/workflows/check_smoke_tests.yml index d1d33b9c..c22941b3 100644 --- a/.github/workflows/check_smoke_tests.yml +++ b/.github/workflows/check_smoke_tests.yml @@ -53,8 +53,8 @@ jobs: done # Run smoke tests and print the response - JSON_BODY_1='{"record": {"birth_date": "2053-11-07", "sex": "M", "mrn": "1234567890", "name":[{"family":"Shepard", "given":["John"]}]}}' - JSON_BODY_2='{"algorithm": "dibbs-enhanced", "record": {"birth_date": "2000-12-06", "sex": "M", "mrn": "9876543210", "name":[{"family":"Smith", "given":["William"]}]}}' + JSON_BODY_1='{"record": {"birth_date": "2053-11-07", "sex": "M", "identifiers":[{"value": "123456789", "type": "MR"}], "name":[{"family":"Shepard", "given":["John"]}]}}' + JSON_BODY_2='{"algorithm": "dibbs-enhanced", "record": {"birth_date": "2000-12-06", "sex": "M", "identifiers":[{"value": "9876543210", "type": "MR"}], "name":[{"family":"Smith", "given":["William"]}]}}' #basic tests RESPONSE_1=$(curl -s -X POST http://localhost:8080/link \ diff --git a/docs/site/reference.md b/docs/site/reference.md index af321af4..df92377b 100644 --- a/docs/site/reference.md +++ b/docs/site/reference.md @@ -15,14 +15,6 @@ linkage evaluation phase. The following features are supported: : The patient's birthdate (normalized to `YYYY-MM-DD`). -`MRN` - -: The patient's medical record number. - -`SSN` - -: The patient's social security number. - `SEX` : The patient's sex (normalized to `M`, `F`, or `U` for unknown). @@ -83,9 +75,13 @@ linkage evaluation phase. The following features are supported: : The patient's email address. -`DRIVERS_LICENSE` +`IDENTIFIER` + +: An identifier for the patient. Matching on this will check if any identifier type/authority/value combination matches. -: The patient's driver's license number. +`IDENTIFIER:` + +: The patient's specific identifier type. For example, `IDENTIFIER:MR` would be the patient's medical record number. 
Unlike `IDENTIFIER`, this will ONLY compare values of a specific type. Valid type codes can be found here http://hl7.org/fhir/R4/v2/0203/index.html. ### Blocking Key Types @@ -97,10 +93,6 @@ patient data and used during query retrieval. The following blocking key types a : The patients birthdate in the format `YYYY-MM-DD`. -`MRN` (ID: **2**) - -: The last 4 characters of a patient's medical record number. - `SEX` (ID: **3**) : The patient's sex in the format of `M`, `F`, or `U` for unknown. @@ -129,6 +121,10 @@ patient data and used during query retrieval. The following blocking key types a : The first 4 characters of the patient's email address. +`IDENTIFIER` (ID: **10**) + +: The identifier triplet in the format `type:authority:value`, containing the type, the first 2 characters of the authority, and the last 4 characters of the value. + ### Evaluation Functions diff --git a/pyproject.toml b/pyproject.toml index 74e4b85a..b2c5fb8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dev = [ "ruff", "mypy", "types-python-dateutil", + "faker", # Testing "pytest>=8.3", "pytest-cov", diff --git a/scripts/gen_seed_test_data.py b/scripts/gen_seed_test_data.py new file mode 100755 index 00000000..c4928948 --- /dev/null +++ b/scripts/gen_seed_test_data.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +""" +scripts/gen_seed_test_data.py +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Script to generate test data for the /seed endpoint in the RecordLinker project. + +The script will emit a JSON object to STDOUT containing a list of clusters, each with a +list of PII records. By default, 100 clusters will be generated, each with a random number +of PII records up to 25. Those values can be adjusted; see --help for more information. 
def _generate_random_identifiers(count, faker):
    """
    Yield `count` random identifiers, cycling through the supported kinds.

    The kind is picked by position: index 0 is a medical record number (MR),
    index 1 a social security number (SS), index 2 a driver's license (DL),
    after which the cycle repeats.

    :param count: The number of identifiers to generate.
    :param faker: The Faker instance used to produce the random values.
    :return: A generator of Identifier objects.
    """
    for idx in range(count):
        # Exactly one kind applies per index, so use a single if/elif chain
        # rather than three independent tests.
        kind = idx % 3
        if kind == 0:
            # make mrn
            yield Identifier(type="MR", value=faker.bothify(text="MRN-#######"))
        elif kind == 1:
            # make ssn
            yield Identifier(type="SS", value=faker.ssn())
        else:
            # make drivers_license; the issuing state is used as the authority
            yield Identifier(
                type="DL", value=faker.bothify(text="DL-######"), authority=faker.state_abbr()
            )


def _generate_random_pii_record(faker):
    """
    Build a single PIIRecord populated with random values.

    The record gets one address, one name, one phone telecom and between one
    and three identifiers.

    :param faker: The Faker instance used to produce the random values.
    :return: A schemas.PIIRecord object.
    """
    return schemas.PIIRecord(
        external_id=faker.uuid4(),
        birth_date=faker.date_of_birth(minimum_age=0, maximum_age=100),
        sex=random.choice(list(Sex)),
        address=[
            Address(
                line=[faker.street_address()],
                city=faker.city(),
                state=faker.state_abbr(),
                postal_code=faker.zipcode(),
                county=faker.city(),
                country=faker.country_code(),
                latitude=faker.latitude(),
                longitude=faker.longitude(),
            )
        ],
        name=[
            Name(
                family=faker.last_name(),
                given=[faker.first_name()],
                use=random.choice(["official", "usual", "nickname"]),
            )
        ],
        telecom=[
            Telecom(
                value=faker.phone_number(),
                system="phone",
                use=random.choice(["home", "work", "mobile"]),
            )
        ],
        race=random.choice(list(Race)),
        gender=random.choice(list(Gender)),
        identifiers=list(_generate_random_identifiers(random.randint(1, 3), faker)),
    )


def main() -> None:
    """
    Main entry point for the script.

    Parses the command line arguments, generates the requested number of
    clusters and prints them to STDOUT as a JSON encoded ClusterGroup.
    """
    parser = argparse.ArgumentParser(description="Generate test data for the /seed endpoint")
    parser.add_argument("--count", type=int, default=100, help="The number of clusters to generate")
    parser.add_argument(
        "--max-per-cluster", type=int, default=25, help="The maximum number of records per cluster"
    )

    args = parser.parse_args()
    # Reject values that cannot produce a sensible data set. Without this check
    # random.randint(1, n) raises an "empty range" ValueError for n < 1, and a
    # negative count silently generates nothing.
    if args.count < 0:
        parser.error("--count must be zero or greater")
    if args.max_per_cluster < 1:
        parser.error("--max-per-cluster must be at least 1")

    faker = Faker()
    clusters = []
    for _ in range(args.count):
        cluster = schemas.Cluster(
            external_person_id=f"EP:{faker.uuid4()}",
            records=[
                _generate_random_pii_record(faker)
                for _ in range(random.randint(1, args.max_per_cluster))
            ],
        )
        clusters.append(cluster)
    print(schemas.ClusterGroup(clusters=clusters).model_dump_json(indent=2))


if __name__ == "__main__":
    main()
"birthDate": fhir_record.get("birthDate"), "sex": fhir_record.get("gender"), "address": fhir_record.get("address", []), - "mrn": None, - "ssn": None, "race": None, "gender": None, "telecom": fhir_record.get("telecom", []), - "drivers_license": None, + "identifiers": [], } for identifier in fhir_record.get("identifier", []): - for coding in identifier.get("type", {}).get("coding", []): - if coding.get("code") == "MR": - val["mrn"] = identifier.get("value") - elif coding.get("code") == "SS": - val["ssn"] = identifier.get("value") - elif coding.get("code") == "DL": - license_number = identifier.get("value") - authority = identifier.get("assigner", {}).get("identifier", {}).get("value", "") # Assuming `issuer` contains authority info - val["drivers_license"] = { - "value": license_number, - "authority": authority - } + for code in identifier.get("type", {}).get("coding", []): + val["identifiers"].append({ + "value": identifier.get("value"), + "type": code.get("code"), + "authority": identifier.get("assigner", {}).get("identifier", {}).get("value", ""), + }) for address in val["address"]: address["county"] = address.get("district", "") for extension in address.get("extension", []): diff --git a/src/recordlinker/linking/link.py b/src/recordlinker/linking/link.py index de47e558..9d50d970 100644 --- a/src/recordlinker/linking/link.py +++ b/src/recordlinker/linking/link.py @@ -51,7 +51,7 @@ def compare( details: dict[str, typing.Any] = {"patient.reference_id": str(patient.reference_id)} for e in evals: # TODO: can we do this check earlier? 
- feature = getattr(schemas.Feature, e.feature, None) + feature = schemas.Feature.parse(e.feature) if feature is None: raise ValueError(f"Invalid comparison field: {e.feature}") # Evaluate the comparison function and append the result to the list diff --git a/src/recordlinker/linking/matchers.py b/src/recordlinker/linking/matchers.py index 013beb63..1fc6b945 100644 --- a/src/recordlinker/linking/matchers.py +++ b/src/recordlinker/linking/matchers.py @@ -179,7 +179,7 @@ def compare_fuzzy_match( beyond which to classify the strings as a partial match. :return: A float indicating whether the features are a fuzzy match. """ - similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs) + similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs) comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity for x in record.feature_iter(key): for y in patient.record.feature_iter(key): @@ -203,11 +203,11 @@ def compare_probabilistic_fuzzy_match( beyond which to classify the strings as a partial match. :return: A float of the score the feature comparison earned. 
""" - log_odds = kwargs.get("log_odds", {}).get(str(key)) + log_odds = kwargs.get("log_odds", {}).get(str(key.attribute)) if log_odds is None: raise ValueError(f"Log odds not found for feature {key}") - similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs) + similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs) comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity max_score = 0.0 for x in patient.record.feature_iter(key): diff --git a/src/recordlinker/models/mpi.py b/src/recordlinker/models/mpi.py index f5cd646f..826e263d 100644 --- a/src/recordlinker/models/mpi.py +++ b/src/recordlinker/models/mpi.py @@ -121,7 +121,6 @@ class BlockingKey(enum.Enum): """ BIRTHDATE = ("BIRTHDATE", 1, "Date of birth as YYYY-MM-DD") - MRN = ("MRN", 2, "Last 4 characters of Medical record number") SEX = ("SEX", 3, "Sex at birth; M, F or U") ZIP = ("ZIP", 4, "5 digital US Postal Code") FIRST_NAME = ("FIRST_NAME", 5, "First 4 characters of the first name") @@ -129,6 +128,7 @@ class BlockingKey(enum.Enum): ADDRESS = ("ADDRESS", 7, "First 4 characters of the address") PHONE = ("PHONE", 8, "Last 4 characters of the phone number") EMAIL = ("EMAIL", 9, "First 4 characters of the email address") + IDENTIFIER = ("IDENTIFIER", 10, "Identifier triplet with only last 4 character of the value. 
Format \"type:authority:value\"") def __init__(self, value: str, _id: int, description: str): self._value = value diff --git a/src/recordlinker/schemas/__init__.py b/src/recordlinker/schemas/__init__.py index 3d9b5234..9eede76f 100644 --- a/src/recordlinker/schemas/__init__.py +++ b/src/recordlinker/schemas/__init__.py @@ -13,6 +13,7 @@ from .mpi import PatientRef from .mpi import PersonRef from .pii import Feature +from .pii import FeatureAttribute from .pii import PIIRecord from .seed import Cluster from .seed import ClusterGroup @@ -24,6 +25,7 @@ "AlgorithmPass", "AlgorithmSummary", "Feature", + "FeatureAttribute", "PIIRecord", "Prediction", "LinkInput", diff --git a/src/recordlinker/schemas/algorithm.py b/src/recordlinker/schemas/algorithm.py index 1455716e..442970f2 100644 --- a/src/recordlinker/schemas/algorithm.py +++ b/src/recordlinker/schemas/algorithm.py @@ -23,9 +23,19 @@ class Evaluator(pydantic.BaseModel): model_config = pydantic.ConfigDict(from_attributes=True, use_enum_values=True) - feature: Feature + feature: str = pydantic.Field(json_schema_extra={"enum": Feature.all_options()}) func: matchers.FeatureFunc + @pydantic.field_validator("feature", mode="before") + def validate_feature(cls, value): + """ + Validate the feature is a valid PII feature. + """ + try: + Feature.parse(value) + except ValueError as e: + raise ValueError(f"Invalid feature: '{value}'. {e}") + return value class AlgorithmPass(pydantic.BaseModel): """ diff --git a/src/recordlinker/schemas/identifier.py b/src/recordlinker/schemas/identifier.py new file mode 100644 index 00000000..88d783b4 --- /dev/null +++ b/src/recordlinker/schemas/identifier.py @@ -0,0 +1,197 @@ +import enum +import re +import typing + +import pydantic + + +class IdentifierType(enum.Enum): + """ + Enum for various identifier types. 
+ """ + AC = "AC" + ACSN = "ACSN" + AIN = "AIN" + AM = "AM" + AMA = "AMA" + AN = "AN" + ANC = "ANC" + AND = "AND" + ANON = "ANON" + ANT = "ANT" + APRN = "APRN" + ASID = "ASID" + BA = "BA" + BC = "BC" + BCFN = "BCFN" + BCT = "BCT" + BR = "BR" + BRN = "BRN" + BSNR = "BSNR" + CAII = "CAII" + CC = "CC" + CONM = "CONM" + CY = "CY" + CZ = "CZ" + DC = "DC" + DCFN = "DCFN" + DDS = "DDS" + DEA = "DEA" + DFN = "DFN" + DI = "DI" + DL = "DL" + DN = "DN" + DO = "DO" + DP = "DP" + DPM = "DPM" + DR = "DR" + DS = "DS" + DSG = "DSG" + EI = "EI" + EN = "EN" + ESN = "ESN" + FDR = "FDR" + FDRFN = "FDRFN" + FGN = "FGN" + FI = "FI" + FILL = "FILL" + GI = "GI" + GIN = "GIN" + GL = "GL" + GN = "GN" + HC = "HC" + IND = "IND" + IRISTEM = "IRISTEM" + JHN = "JHN" + LACSN = "LACSN" + LANR = "LANR" + LI = "LI" + LN = "LN" + LR = "LR" + MA = "MA" + MB = "MB" + MC = "MC" + MCD = "MCD" + MCN = "MCN" + MCR = "MCR" + MCT = "MCT" + MD = "MD" + MI = "MI" + MR = "MR" + MRT = "MRT" + MS = "MS" + NBSNR = "NBSNR" + NCT = "NCT" + NE = "NE" + NH = "NH" + NI = "NI" + NII = "NII" + NIIP = "NIIP" + NP = "NP" + NPI = "NPI" + OBI = "OBI" + OD = "OD" + PA = "PA" + PC = "PC" + PCN = "PCN" + PE = "PE" + PEN = "PEN" + PGN = "PGN" + PHC = "PHC" + PHE = "PHE" + PHO = "PHO" + PI = "PI" + PIN = "PIN" + PLAC = "PLAC" + PN = "PN" + PNT = "PNT" + PPIN = "PPIN" + PPN = "PPN" + PRC = "PRC" + PRN = "PRN" + PT = "PT" + QA = "QA" + RI = "RI" + RN = "RN" + RPH = "RPH" + RR = "RR" + RRI = "RRI" + RRP = "RRP" + SAMN = "SAMN" + SB = "SB" + SID = "SID" + SL = "SL" + SN = "SN" + SNBSN = "SNBSN" + SNO = "SNO" + SP = "SP" + SR = "SR" + SRX = "SRX" + SS = "SS" + STN = "STN" + TAX = "TAX" + TN = "TN" + TPR = "TPR" + TRL = "TRL" + U = "U" + UDI = "UDI" + UPIN = "UPIN" + USID = "USID" + VN = "VN" + VP = "VP" + VS = "VS" + WC = "WC" + WCN = "WCN" + WP = "WP" + XV = "XV" + XX = "XX" + + def __str__(self): + """ + Return the value of the enum as a string. 
class Identifier(pydantic.BaseModel):
    """
    The schema for an Identifier record.

    An identifier is made up of a type code, a value and an optional issuing
    authority. Social security numbers (type ``SS``) are normalized to the
    ``XXX-XX-XXXX`` format; values that cannot be normalized become an empty
    string.
    """

    model_config = pydantic.ConfigDict(extra="allow")

    # NOTE: "type" must be declared before "value" so that it has already been
    # processed by the time the "value" validator runs.
    type: IdentifierType
    value: str
    authority: typing.Optional[str] = None

    @classmethod
    def model_construct(
        cls, _fields_set: set[str] | None = None, **values: typing.Any
    ) -> typing.Self:
        """
        Construct a new instance of the Identifier model without validation.

        The raw type code is still converted to an IdentifierType so that
        comparisons against the enum work on constructed instances.
        """
        # model_construct skips validation, so tolerate a missing type rather
        # than failing with a KeyError.
        if "type" in values:
            values["type"] = IdentifierType(values["type"])
        return super().model_construct(_fields_set=_fields_set, **values)

    @pydantic.field_validator("value", mode="before")
    @classmethod
    def parse_value(cls, value: str, info: pydantic.ValidationInfo):
        """
        Parse the value string, normalizing social security numbers.

        :param value: The raw identifier value.
        :param info: The validation info carrying the previously validated fields.
        :return: The normalized value; an empty string for an SSN that is
            neither 9 digits nor already in the XXX-XX-XXXX format.
        """
        # "type" is absent from info.data when it failed its own validation;
        # use .get() so that failure is reported as a validation error instead
        # of an unrelated KeyError raised from here.
        identifier_type = info.data.get("type")
        if identifier_type != IdentifierType.SS:
            return value

        val = str(value).strip()

        # Already in the standard format
        if re.match(r"^\d{3}-\d{2}-\d{4}$", val):
            return val

        if len(val) != 9 or not val.isdigit():
            return ""

        # Format back to the standard SSN format (XXX-XX-XXXX)
        return f"{val[:3]}-{val[3:5]}-{val[5:]}"
""" BIRTHDATE = "BIRTHDATE" - MRN = "MRN" SEX = "SEX" GIVEN_NAME = "GIVEN_NAME" FIRST_NAME = "FIRST_NAME" @@ -25,7 +26,6 @@ class Feature(enum.Enum): CITY = "CITY" STATE = "STATE" ZIP = "ZIP" - SSN = "SSN" RACE = "RACE" GENDER = "GENDER" TELECOM = "TELECOM" @@ -33,14 +33,61 @@ class Feature(enum.Enum): EMAIL = "EMAIL" SUFFIX = "SUFFIX" COUNTY = "COUNTY" - DRIVERS_LICENSE = "DRIVERS_LICENSE" + IDENTIFIER = "IDENTIFIER" def __str__(self): """ Return the value of the enum as a string. """ - return self.value + return self.value + +class Feature(pydantic.BaseModel): + """ + The schema for a feature. + """ + + model_config = pydantic.ConfigDict(extra="allow") + + suffix: typing.Optional[IdentifierType] = None + attribute: FeatureAttribute + + @classmethod + def parse(cls, feature_string: str) -> typing.Self: + """ + Parse a feature string in the format 'FEATURE_ATTRIBUTE:SUFFIX' into a Feature object. + Args: + feature_string (str): The string to parse. + + Returns: + Feature: A Feature object with attribute and suffix populated. + """ + # Split the feature string on ":" + parts = feature_string.split(":", 1) + feature_attribute = FeatureAttribute(parts[0]) + + if len(parts) == 1: + return cls(attribute=feature_attribute) + + # If suffix is provided, ensure the attribute is IDENTIFIER and validate the suffix + if feature_attribute != FeatureAttribute.IDENTIFIER: + raise ValueError(f"Suffix is not allowed for attribute '{feature_attribute}'") + + feature_suffix = IdentifierType(parts[1]) + return cls(attribute=feature_attribute, suffix=feature_suffix) + + @classmethod + def all_options(cls) -> list[typing.Any]: + """ + Return a list of all possible Feature string values that can be used for comparison. 
+ """ + options = [] + for feature in FeatureAttribute: + options.append(str(feature)) + if feature == FeatureAttribute.IDENTIFIER: + for identifier in IdentifierType: + options.append(f"{feature}:{identifier}") + return options class Sex(enum.Enum): """ @@ -161,18 +208,6 @@ def email(self) -> str | None: return None return self.value - -class DriversLicense(pydantic.BaseModel): - """ - The schema for a Drivers License record - """ - - model_config = pydantic.ConfigDict(extra="allow") - - value: str - authority: str - - class PIIRecord(pydantic.BaseModel): """ The schema for a PII record. @@ -185,14 +220,12 @@ class PIIRecord(pydantic.BaseModel): default=None, validation_alias=pydantic.AliasChoices("birth_date", "birthdate", "birthDate") ) sex: typing.Optional[Sex] = None - mrn: typing.Optional[str] = None address: typing.List[Address] = [] name: typing.List[Name] = [] telecom: typing.List[Telecom] = [] - ssn: typing.Optional[str] = None race: typing.Optional[Race] = None gender: typing.Optional[Gender] = None - drivers_license: typing.Optional[DriversLicense] = None + identifiers: typing.List[Identifier] = [] @classmethod def model_construct( @@ -209,7 +242,8 @@ def model_construct( obj.address = [Address.model_construct(**a) for a in values.get("address", [])] obj.name = [Name.model_construct(**n) for n in values.get("name", [])] obj.telecom = [Telecom.model_construct(**t) for t in values.get("telecom", [])] - obj.drivers_license = DriversLicense.model_construct(**values.get("drivers_license", {})) + obj.identifiers = [Identifier.model_construct(**i) for i in values.get("identifiers", [])] + return obj @pydantic.field_validator("external_id", mode="before") @@ -241,24 +275,6 @@ def parse_sex(cls, value): return Sex.FEMALE return Sex.UNKNOWN - @pydantic.field_validator("ssn", mode="before") - def parse_ssn(cls, value): - """ - Parse the ssn string - """ - if value: - val = str(value).strip() - - if re.match(r"^\d{3}-\d{2}-\d{4}$", val): - return val - - if 
len(val) != 9 or not val.isdigit(): - return None - - # Format back to the standard SSN format (XXX-XX-XXXX) - formatted_ssn = f"{val[:3]}-{val[3:5]}-{val[5:]}" - return formatted_ssn - @pydantic.field_validator("race", mode="before") def parse_race(cls, value): """ @@ -322,88 +338,86 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]: Given a field name, return an iterator of all string values for that field. Empty strings are not included in the iterator. """ + if not isinstance(feature, Feature): raise ValueError(f"Invalid feature: {feature}") + + attribute = feature.attribute + identifier_suffix = feature.suffix - if feature == Feature.BIRTHDATE: + if attribute == FeatureAttribute.BIRTHDATE: if self.birth_date: yield str(self.birth_date) - elif feature == Feature.MRN: - if self.mrn: - yield self.mrn - elif feature == Feature.SEX: + elif attribute == FeatureAttribute.SEX: if self.sex: yield str(self.sex) - elif feature == Feature.ADDRESS: + elif attribute == FeatureAttribute.ADDRESS: for address in self.address: # The 2nd, 3rd, etc lines of an address are not as important as # the first line, so we only include the first line in the comparison. 
if address.line and address.line[0]: yield address.line[0] - elif feature == Feature.CITY: + elif attribute == FeatureAttribute.CITY: for address in self.address: if address.city: yield address.city - elif feature == Feature.STATE: + elif attribute == FeatureAttribute.STATE: for address in self.address: if address.state: yield address.state - elif feature == Feature.ZIP: + elif attribute == FeatureAttribute.ZIP: for address in self.address: if address.postal_code: # only use the first 5 digits for comparison yield address.postal_code[:5] - elif feature == Feature.GIVEN_NAME: + elif attribute == FeatureAttribute.GIVEN_NAME: for name in self.name: for given in name.given: if given: yield given - elif feature == Feature.FIRST_NAME: + elif attribute == FeatureAttribute.FIRST_NAME: for name in self.name: # We only want the first given name for comparison for given in name.given[0:1]: if given: yield given - elif feature == Feature.LAST_NAME: + elif attribute == FeatureAttribute.LAST_NAME: for name in self.name: if name.family: yield name.family - elif feature == Feature.SSN: - if self.ssn: - yield self.ssn - elif feature == Feature.RACE: + elif attribute == FeatureAttribute.RACE: if self.race: yield str(self.race) - elif feature == Feature.GENDER: + elif attribute == FeatureAttribute.GENDER: if self.gender: yield str(self.gender) - elif feature == Feature.TELECOM: + elif attribute == FeatureAttribute.TELECOM: for telecom in self.telecom: if telecom.value: yield telecom.value - elif feature == Feature.PHONE: + elif attribute == FeatureAttribute.PHONE: for telecom in self.telecom: number = telecom.phone_number() if number: yield number - elif feature == Feature.EMAIL: + elif attribute == FeatureAttribute.EMAIL: for telecom in self.telecom: email = telecom.email() if email: yield email - elif feature == Feature.SUFFIX: + elif attribute == FeatureAttribute.SUFFIX: for name in self.name: for suffix in name.suffix: if suffix: yield suffix - elif feature == Feature.COUNTY: + 
elif attribute == FeatureAttribute.COUNTY: for address in self.address: if address.county: yield address.county - elif feature == Feature.DRIVERS_LICENSE: - if self.drivers_license: - if self.drivers_license.value and self.drivers_license.authority: - yield f"{self.drivers_license.value}|{self.drivers_license.authority}" + elif attribute == FeatureAttribute.IDENTIFIER: + for identifier in self.identifiers: + if identifier_suffix is None or identifier_suffix == identifier.type: + yield f"{identifier.type}:{identifier.authority or ''}:{identifier.value}" def blocking_keys(self, key: models.BlockingKey) -> set[str]: """ @@ -418,23 +432,27 @@ def blocking_keys(self, key: models.BlockingKey) -> set[str]: if key == models.BlockingKey.BIRTHDATE: # NOTE: we could optimize here and remove the dashes from the date - vals.update(self.feature_iter(Feature.BIRTHDATE)) - elif key == models.BlockingKey.MRN: - vals.update({x[-4:] for x in self.feature_iter(Feature.MRN)}) + vals.update(self.feature_iter(Feature(attribute=FeatureAttribute.BIRTHDATE))) + elif key == models.BlockingKey.IDENTIFIER: + vals.update({ + f"{type_part}:{authority_part[:2]}:{value_part[-4:]}" + for x in self.feature_iter(Feature(attribute=FeatureAttribute.IDENTIFIER)) + for type_part, authority_part, value_part in [x.split(":", 2)] + }) elif key == models.BlockingKey.SEX: - vals.update(self.feature_iter(Feature.SEX)) + vals.update(self.feature_iter(Feature(attribute=FeatureAttribute.SEX))) elif key == models.BlockingKey.ZIP: - vals.update(self.feature_iter(Feature.ZIP)) + vals.update(self.feature_iter(Feature(attribute=FeatureAttribute.ZIP))) elif key == models.BlockingKey.FIRST_NAME: - vals.update({x[:4] for x in self.feature_iter(Feature.FIRST_NAME)}) + vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.FIRST_NAME))}) elif key == models.BlockingKey.LAST_NAME: - vals.update({x[:4] for x in self.feature_iter(Feature.LAST_NAME)}) + vals.update({x[:4] for x in 
self.feature_iter(Feature(attribute=FeatureAttribute.LAST_NAME))}) elif key == models.BlockingKey.ADDRESS: - vals.update({x[:4] for x in self.feature_iter(Feature.ADDRESS)}) + vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.ADDRESS))}) elif key == models.BlockingKey.PHONE: - vals.update({x[-4:] for x in self.feature_iter(Feature.PHONE)}) + vals.update({x[-4:] for x in self.feature_iter(Feature(attribute=FeatureAttribute.PHONE))}) elif key == models.BlockingKey.EMAIL: - vals.update({x[:4] for x in self.feature_iter(Feature.EMAIL)}) + vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.EMAIL))}) # if any vals are longer than the BLOCKING_KEY_MAX_LENGTH, raise an error if any(len(x) > models.BLOCKING_VALUE_MAX_LENGTH for x in vals): diff --git a/tests/unit/assets/seed_test.json.gz b/tests/unit/assets/seed_test.json.gz index 19d63bab..b15b2ea1 100644 Binary files a/tests/unit/assets/seed_test.json.gz and b/tests/unit/assets/seed_test.json.gz differ diff --git a/tests/unit/assets/simple_patient_bundle_to_link_with_mpi.json b/tests/unit/assets/simple_patient_bundle_to_link_with_mpi.json index dbdce205..0bb22a5c 100644 --- a/tests/unit/assets/simple_patient_bundle_to_link_with_mpi.json +++ b/tests/unit/assets/simple_patient_bundle_to_link_with_mpi.json @@ -118,7 +118,7 @@ "type": { "coding": [ { - "code": "SSN", + "code": "SS", "system": "http://terminology.hl7.org/CodeSystem/v2-0203", "display": "Social security number" } diff --git a/tests/unit/hl7/test_fhir.py b/tests/unit/hl7/test_fhir.py index 6407feb0..034a1867 100644 --- a/tests/unit/hl7/test_fhir.py +++ b/tests/unit/hl7/test_fhir.py @@ -119,14 +119,23 @@ def test_fhir_record_to_pii_record(): assert pii_record.address[0].state == "Massachusetts" assert pii_record.address[0].postal_code == "99999" assert pii_record.address[0].county == "county" - assert pii_record.mrn == "1234567890" - assert pii_record.ssn == "111-22-3333" assert 
pii_record.telecom[0].value == "123-456-7890" assert pii_record.telecom[0].system == "phone" assert str(pii_record.race) == "WHITE" assert str(pii_record.gender) == "FEMALE" - assert pii_record.drivers_license.authority == "CA" - assert pii_record.drivers_license.value == "D1234567" + + # identifiers + assert pii_record.identifiers[0].value == "1234567890" + assert str(pii_record.identifiers[0].type) == "MR" + assert pii_record.identifiers[0].authority == "" + + assert pii_record.identifiers[1].value == "111-22-3333" + assert str(pii_record.identifiers[1].type) == "SS" + assert pii_record.identifiers[1].authority == "" + + assert pii_record.identifiers[2].value == "D1234567" + assert str(pii_record.identifiers[2].type) == "DL" + assert pii_record.identifiers[2].authority == "CA" def test_add_person_resource(): diff --git a/tests/unit/linking/test_link.py b/tests/unit/linking/test_link.py index 51ffa2e4..9433353a 100644 --- a/tests/unit/linking/test_link.py +++ b/tests/unit/linking/test_link.py @@ -98,6 +98,145 @@ def test_compare_no_match(self): assert link.compare(rec, pat, algorithm_pass) is False + def test_compare_identifier_match(self): + rec = schemas.PIIRecord( + **{ + "identifiers": [ + { + "type": "MR", + "authority": "CA", + "value": "123456789" + }, + { + "type": "SS", + "authority": "VA", + "value": "987-65-4321" + } + ] + } + ) + pat = models.Patient( + data={ + "identifiers": [ + { + "type": "MR", + "authority": "CA", + "value": "123456789" + }, + { + "type": "SS", + "authority": "VA", + "value": "987-65-4321" + } + ] + } + ) + + algorithm_pass = models.AlgorithmPass( + id=1, + algorithm_id=1, + blocking_keys=[1], + evaluators=[ + {"feature": "IDENTIFIER", "func": "func:recordlinker.linking.matchers.compare_match_all"}, + ], + rule="func:recordlinker.linking.matchers.rule_match", + kwargs={}, + ) + + assert link.compare(rec, pat, algorithm_pass) is True + + def test_compare_identifier_with_suffix(self): + rec = schemas.PIIRecord( + **{ + 
"identifiers": [ + { + "type": "MR", + "authority": "CA", + "value": "123456789" + }, + { + "type": "SS", + "authority": "VA", + "value": "111-11-1111" + } + ] + } + ) + pat = models.Patient( + data={ + "identifiers": [ + { + "type": "MR", + "authority": "CA", + "value": "123456789" + }, + { + "type": "SS", + "authority": "VA", + "value": "987-65-4321" + } + ] + } + ) + + algorithm_pass = models.AlgorithmPass( + id=1, + algorithm_id=1, + blocking_keys=[1], + evaluators=[ + {"feature": "IDENTIFIER:MR", "func": "func:recordlinker.linking.matchers.compare_match_all"}, + ], + rule="func:recordlinker.linking.matchers.rule_match", + kwargs={}, + ) + + #should pass as MR is the same for both + assert link.compare(rec, pat, algorithm_pass) is True + + algorithm_pass.evaluators = [{"feature": "IDENTIFIER:SS", "func": "func:recordlinker.linking.matchers.compare_match_all"}] + #should fail as SS is different for both + assert link.compare(rec, pat, algorithm_pass) is False + + def test_compare_invalid_feature(self): + rec = schemas.PIIRecord( + **{ + "name": [ + { + "given": [ + "John", + ], + "family": "Doe", + } + ] + } + ) + pat = models.Patient( + data={ + "name": [ + { + "given": [ + "John", + ], + "family": "Doey", + } + ] + } + ) + + algorithm_pass = models.AlgorithmPass( + id=1, + algorithm_id=1, + blocking_keys=[1], + evaluators=[ + {"feature": "FIRST_NAME:DL", "func": "func:recordlinker.linking.matchers.compare_match_all"}, + ], + rule="func:recordlinker.linking.matchers.rule_match", + kwargs={}, + ) + + with pytest.raises(ValueError): + link.compare(rec, pat, algorithm_pass) + class TestLinkRecordAgainstMpi: # TODO: Add test case for last name O'Neil diff --git a/tests/unit/linking/test_matchers.py b/tests/unit/linking/test_matchers.py index 2e8a5afe..d421f391 100644 --- a/tests/unit/linking/test_matchers.py +++ b/tests/unit/linking/test_matchers.py @@ -66,22 +66,22 @@ def test_compare_match_any(): pat2 = models.Patient(data={"name": [{"given": ["Michael"], 
"family": "Smith"}], "sex": "male"}) pat3 = models.Patient(data={"name": [{"family": "Smith"}, {"family": "Williams"}]}) - assert matchers.compare_match_any(record, pat1, schemas.Feature.GIVEN_NAME) - assert matchers.compare_match_any(record, pat1, schemas.Feature.FIRST_NAME) - assert not matchers.compare_match_any(record, pat1, schemas.Feature.LAST_NAME) - assert matchers.compare_match_any(record, pat1, schemas.Feature.BIRTHDATE) - assert not matchers.compare_match_any(record, pat1, schemas.Feature.ZIP) - - assert matchers.compare_match_any(record, pat2, schemas.Feature.GIVEN_NAME) - assert not matchers.compare_match_any(record, pat2, schemas.Feature.FIRST_NAME) - assert matchers.compare_match_any(record, pat2, schemas.Feature.LAST_NAME) - assert not matchers.compare_match_any(record, pat2, schemas.Feature.SEX) - assert not matchers.compare_match_any(record, pat1, schemas.Feature.ZIP) - - assert not matchers.compare_match_any(record, pat3, schemas.Feature.GIVEN_NAME) - assert not matchers.compare_match_any(record, pat3, schemas.Feature.FIRST_NAME) - assert matchers.compare_match_any(record, pat3, schemas.Feature.LAST_NAME) - assert not matchers.compare_match_any(record, pat3, schemas.Feature.BIRTHDATE) + assert matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) + assert matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) + assert not matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) + assert matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE)) + assert not matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.ZIP)) + + assert matchers.compare_match_any(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) + assert not matchers.compare_match_any(record, pat2, 
schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) + assert matchers.compare_match_any(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) + assert not matchers.compare_match_any(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.SEX)) + assert not matchers.compare_match_any(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.ZIP)) + + assert not matchers.compare_match_any(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) + assert not matchers.compare_match_any(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) + assert matchers.compare_match_any(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) + assert not matchers.compare_match_any(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE)) with pytest.raises(ValueError): matchers.compare_match_any(record, pat1, "unknown") @@ -104,21 +104,21 @@ def test_compare_match_all(): ) pat3 = models.Patient(data={"name": [{"family": "Smith"}, {"family": "Harrison"}]}) - assert not matchers.compare_match_all(record, pat1, schemas.Feature.GIVEN_NAME) - assert matchers.compare_match_all(record, pat1, schemas.Feature.FIRST_NAME) - assert not matchers.compare_match_all(record, pat1, schemas.Feature.LAST_NAME) - assert matchers.compare_match_all(record, pat1, schemas.Feature.BIRTHDATE) - assert not matchers.compare_match_all(record, pat1, schemas.Feature.ZIP) + assert not matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) + assert matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) + assert not matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) + assert matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE)) + assert not matchers.compare_match_all(record, pat1, 
schemas.Feature(attribute=schemas.FeatureAttribute.ZIP)) - assert matchers.compare_match_all(record, pat2, schemas.Feature.GIVEN_NAME) - assert matchers.compare_match_all(record, pat2, schemas.Feature.FIRST_NAME) - assert not matchers.compare_match_all(record, pat2, schemas.Feature.LAST_NAME) - assert not matchers.compare_match_all(record, pat2, schemas.Feature.SEX) - assert not matchers.compare_match_all(record, pat2, schemas.Feature.ZIP) + assert matchers.compare_match_all(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) + assert matchers.compare_match_all(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) + assert not matchers.compare_match_all(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) + assert not matchers.compare_match_all(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.SEX)) + assert not matchers.compare_match_all(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.ZIP)) - assert not matchers.compare_match_all(record, pat3, schemas.Feature.FIRST_NAME) - assert matchers.compare_match_all(record, pat3, schemas.Feature.LAST_NAME) - assert not matchers.compare_match_all(record, pat3, schemas.Feature.BIRTHDATE) + assert not matchers.compare_match_all(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) + assert matchers.compare_match_all(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) + assert not matchers.compare_match_all(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE)) with pytest.raises(ValueError): matchers.compare_match_all(record, pat1, "unknown") @@ -135,17 +135,17 @@ def test_compare_fuzzy_match(): pat2 = models.Patient(data={"name": [{"given": ["Michael"], "family": "Smtih"}], "sex": "male"}) pat3 = models.Patient(data={"name": [{"family": "Smyth"}, {"family": "Williams"}]}) - assert matchers.compare_fuzzy_match(record, pat1, schemas.Feature.FIRST_NAME) - 
assert not matchers.compare_fuzzy_match(record, pat1, schemas.Feature.LAST_NAME) + assert matchers.compare_fuzzy_match(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) + assert not matchers.compare_fuzzy_match(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - assert not matchers.compare_fuzzy_match(record, pat2, schemas.Feature.FIRST_NAME) - assert matchers.compare_fuzzy_match(record, pat2, schemas.Feature.LAST_NAME) + assert not matchers.compare_fuzzy_match(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) + assert matchers.compare_fuzzy_match(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - assert not matchers.compare_fuzzy_match(record, pat3, schemas.Feature.FIRST_NAME) - assert matchers.compare_fuzzy_match(record, pat3, schemas.Feature.LAST_NAME) + assert not matchers.compare_fuzzy_match(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) + assert matchers.compare_fuzzy_match(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) with pytest.raises(ValueError): - matchers.compare_fuzzy_match(record, pat1, "first_name") + matchers.compare_fuzzy_match(record, pat1, schemas.Feature(attribute="first_name")) def test_compare_probabilistic_fuzzy_match(): @@ -153,7 +153,7 @@ def test_compare_probabilistic_fuzzy_match(): matchers.compare_probabilistic_fuzzy_match( schemas.PIIRecord(), models.Patient(), - schemas.Feature.MRN, + schemas.Feature(attribute=schemas.FeatureAttribute.IDENTIFIER), ) rec = schemas.PIIRecord( @@ -169,15 +169,15 @@ def test_compare_probabilistic_fuzzy_match(): } ) log_odds = { - schemas.Feature.FIRST_NAME.value: 4.0, - schemas.Feature.LAST_NAME.value: 6.5, - schemas.Feature.BIRTHDATE.value: 9.8, - schemas.Feature.ADDRESS.value: 3.7, + schemas.FeatureAttribute.FIRST_NAME.value: 4.0, + schemas.FeatureAttribute.LAST_NAME.value: 6.5, + schemas.FeatureAttribute.BIRTHDATE.value: 9.8, + 
schemas.FeatureAttribute.ADDRESS.value: 3.7, } assert ( matchers.compare_probabilistic_fuzzy_match( - rec, pat, schemas.Feature.FIRST_NAME, log_odds=log_odds + rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME), log_odds=log_odds ) == 4.0 ) @@ -185,7 +185,7 @@ def test_compare_probabilistic_fuzzy_match(): assert ( round( matchers.compare_probabilistic_fuzzy_match( - rec, pat, schemas.Feature.LAST_NAME, log_odds=log_odds + rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME), log_odds=log_odds ), 3, ) @@ -195,7 +195,7 @@ def test_compare_probabilistic_fuzzy_match(): assert ( round( matchers.compare_probabilistic_fuzzy_match( - rec, pat, schemas.Feature.BIRTHDATE, log_odds=log_odds + rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE), log_odds=log_odds ), 3, ) @@ -205,7 +205,7 @@ def test_compare_probabilistic_fuzzy_match(): assert ( round( matchers.compare_probabilistic_fuzzy_match( - rec, pat, schemas.Feature.ADDRESS, log_odds=log_odds + rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.ADDRESS), log_odds=log_odds ), 3, ) diff --git a/tests/unit/routes/test_seed_router.py b/tests/unit/routes/test_seed_router.py index d70752b0..e22d196a 100644 --- a/tests/unit/routes/test_seed_router.py +++ b/tests/unit/routes/test_seed_router.py @@ -28,17 +28,19 @@ def test_too_many_clusters(self, client): ) def test_large_batch(self, client): + # NOTE: The seed_test.json file was generated with scripts/gen_seed_test_data.py + # rerun that script and adjust these values if the data format needs to change. 
data = load_test_json_asset("seed_test.json.gz") response = client.post("/seed", json=data) assert response.status_code == 201 persons = response.json()["persons"] assert len(persons) == 100 - assert len(persons[0]["patients"]) == 13 - assert len(persons[99]["patients"]) == 7 - assert sum(len(p["patients"]) for p in persons) == 1285 + assert len(persons[0]["patients"]) == 5 + assert len(persons[99]["patients"]) == 14 + assert sum(len(p["patients"]) for p in persons) == 1397 assert client.session.query(models.Person).count() == 100 - assert client.session.query(models.Patient).count() == 1285 - assert client.session.query(models.BlockingValue).count() == 10280 + assert client.session.query(models.Patient).count() == 1397 + assert client.session.query(models.BlockingValue).count() == 12603 @mock.patch("recordlinker.database.algorithm_service.default_algorithm") def test_seed_and_link(self, mock_algorithm, basic_algorithm, client): diff --git a/tests/unit/schemas/test_algorithm.py b/tests/unit/schemas/test_algorithm.py index ea188a19..721d0494 100644 --- a/tests/unit/schemas/test_algorithm.py +++ b/tests/unit/schemas/test_algorithm.py @@ -70,6 +70,16 @@ def test_validate_evaluators(self): rule="func:recordlinker.linking.matchers.rule_match", ) + evaluators = [ + {"feature": "FIRST_NAME:DL", "func": "func:recordlinker.linking.matchers.compare_match_any"} + ] + with pytest.raises(pydantic.ValidationError): + AlgorithmPass( + blocking_keys=[], + evaluators=evaluators, + rule="func:recordlinker.linking.matchers.rule_match", + ) + def test_validate_rule(self): rule = "invalid.func" with pytest.raises(pydantic.ValidationError): diff --git a/tests/unit/schemas/test_pii.py b/tests/unit/schemas/test_pii.py index f334ad3c..7049321f 100644 --- a/tests/unit/schemas/test_pii.py +++ b/tests/unit/schemas/test_pii.py @@ -19,7 +19,6 @@ class TestPIIRecord: def test_model_construct(self): data = { - "mrn": "99", "birth_date": "1980-2-1", "name": [ {"family": "Doe", "given": ["John", 
"L"]}, @@ -44,10 +43,19 @@ def test_model_construct(self): }, ], "telecom": [{"value": "555-123-4567"}, {"value": "555-987-6543"}], - "drivers_license": {"authority": "VA", "value": "D1234567"}, + "identifiers": [ + { + "type": "MR", + "value": "99", + }, + { + "type": "DL", + "value": "D1234567", + "authority": "VA", + } + ] } record = pii.PIIRecord.model_construct(**data) - assert record.mrn == "99" assert record.birth_date == "1980-2-1" assert record.name[0].family == "Doe" assert record.name[0].given == ["John", "L"] @@ -63,8 +71,14 @@ def test_model_construct(self): assert record.address[1].state == "CA" assert record.address[1].postal_code == "98765-4321" assert record.address[1].county == "county2" - assert record.drivers_license.value == "D1234567" - assert record.drivers_license.authority == "VA" + + #identifiers + assert str(record.identifiers[0].type) == "MR" + assert record.identifiers[0].value == "99" + + assert str(record.identifiers[1].type) == "DL" + assert record.identifiers[1].value == "D1234567" + assert record.identifiers[1].authority == "VA" def test_parse_external_id(self): record = pii.PIIRecord(external_id=uuid.UUID("7ca699d9-1986-4c0c-a0fd-ac4ae0dfa297")) @@ -113,14 +127,18 @@ def test_parse_sex(self): assert record.sex is None def test_parse_ssn(self): - record = pii.PIIRecord(ssn="123-45-6789") - assert record.ssn == "123-45-6789" - record = pii.PIIRecord(ssn=" 123-45-6789 ") - assert record.ssn == "123-45-6789" - record = pii.PIIRecord(ssn="1-2-3") - assert record.ssn is None + record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value="123-45-6789")]) + assert record.identifiers[0].value == "123-45-6789" + #testing extra spaces + record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value=" 123-45-6789 ")]) + assert record.identifiers[0].value == "123-45-6789" + #testing no dashes + record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value="123456789")]) + assert record.identifiers[0].value == "123-45-6789" + 
record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value="1-2-3")]) + assert record.identifiers[0].value == '' record = pii.PIIRecord() - assert record.ssn is None + assert record.identifiers == [] def test_parse_race(self): # testing verbose races @@ -206,8 +224,6 @@ def test_feature_iter(self): external_id="99", birth_date="1980-2-1", sex="male", - mrn="123456", - ssn="123-45-6789", race="unknown", gender="unknown", address=[ @@ -236,35 +252,52 @@ def test_feature_iter(self): pii.Telecom(value="(555) 987-6543", system="phone"), pii.Telecom(value="test@email.com", system="email"), ], - drivers_license=pii.DriversLicense(value="D1234567", authority="VA"), + identifiers=[ + { + "type": "MR", + "value": "123456", + }, + { + "type": "SS", + "value": "123-45-6789", + }, + { + "type": "DL", + "value": "D1234567", + "authority": "VA", + } + ], ) with pytest.raises(ValueError): list(record.feature_iter("external_id")) - assert list(record.feature_iter(pii.Feature.BIRTHDATE)) == ["1980-02-01"] - assert list(record.feature_iter(pii.Feature.MRN)) == ["123456"] - assert list(record.feature_iter(pii.Feature.SEX)) == ["M"] - assert list(record.feature_iter(pii.Feature.ADDRESS)) == ["123 Main St", "456 Elm St"] - assert list(record.feature_iter(pii.Feature.CITY)) == ["Anytown", "Somecity"] - assert list(record.feature_iter(pii.Feature.STATE)) == ["NY", "CA"] - assert list(record.feature_iter(pii.Feature.ZIP)) == ["12345", "98765"] - assert list(record.feature_iter(pii.Feature.GIVEN_NAME)) == ["John", "L", "Jane"] - assert list(record.feature_iter(pii.Feature.FIRST_NAME)) == ["John", "Jane"] - assert list(record.feature_iter(pii.Feature.LAST_NAME)) == ["Doe", "Smith"] - assert list(record.feature_iter(pii.Feature.SSN)) == ["123-45-6789"] - assert list(record.feature_iter(pii.Feature.RACE)) == ["UNKNOWN"] - assert list(record.feature_iter(pii.Feature.GENDER)) == ["UNKNOWN"] - assert list(record.feature_iter(pii.Feature.TELECOM)) == [ + assert 
list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.BIRTHDATE))) == ["1980-02-01"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.SEX))) == ["M"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ADDRESS))) == ["123 Main St", "456 Elm St"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.CITY))) == ["Anytown", "Somecity"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.STATE))) == ["NY", "CA"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ZIP))) == ["12345", "98765"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GIVEN_NAME))) == ["John", "L", "Jane"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.FIRST_NAME))) == ["John", "Jane"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.LAST_NAME))) == ["Doe", "Smith"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["UNKNOWN"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GENDER))) == ["UNKNOWN"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.TELECOM))) == [ "555-123-4567", "(555) 987-6543", "test@email.com", ] - assert list(record.feature_iter(pii.Feature.PHONE)) == ["5559876543"] - assert list(record.feature_iter(pii.Feature.EMAIL)) == ["test@email.com"] - assert list(record.feature_iter(pii.Feature.SUFFIX)) == ["suffix", "suffix2"] - assert list(record.feature_iter(pii.Feature.COUNTY)) == ["county"] - assert list(record.feature_iter(pii.Feature.DRIVERS_LICENSE)) == ["D1234567|VA"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.PHONE))) == ["5559876543"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.EMAIL))) == ["test@email.com"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.SUFFIX))) 
== ["suffix", "suffix2"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.COUNTY))) == ["county"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER))) == ["MR::123456", "SS::123-45-6789", "DL:VA:D1234567"] + + # IDENTIFIER with suffix + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="MR"))) == ["MR::123456"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="SS"))) == ["SS::123-45-6789"] + def test_blocking_keys_invalid(self): rec = pii.PIIRecord() @@ -273,9 +306,9 @@ def test_blocking_keys_invalid(self): @unittest.mock.patch("recordlinker.models.BLOCKING_VALUE_MAX_LENGTH", 1) def test_blocking_keys_value_too_long(self): - rec = pii.PIIRecord(**{"mrn": "123456789"}) + rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "123456789"}]}) with pytest.raises(RuntimeError): - rec.blocking_keys(BlockingKey.MRN) + rec.blocking_keys(BlockingKey.IDENTIFIER) def test_blocking_keys_birthdate(self): rec = pii.PIIRecord(**{"dob": "01/01/1980"}) @@ -290,14 +323,21 @@ def test_blocking_keys_birthdate(self): assert rec.blocking_keys(BlockingKey.BIRTHDATE) == set() def test_blocking_keys_mrn_last_four(self): - rec = pii.PIIRecord(**{"ssn": "123456789"}) - assert rec.blocking_keys(BlockingKey.MRN) == set() - rec = pii.PIIRecord(**{"mrn": None}) - assert rec.blocking_keys(BlockingKey.MRN) == set() - rec = pii.PIIRecord(**{"mrn": "123456789"}) - assert rec.blocking_keys(BlockingKey.MRN) == {"6789"} - rec = pii.PIIRecord(**{"mrn": "89"}) - assert rec.blocking_keys(BlockingKey.MRN) == {"89"} + rec = pii.PIIRecord() + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == set() + rec = pii.PIIRecord(**{"identifiers": []}) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == set() + rec = pii.PIIRecord(**{"identifiers": [{ "type": "MR", "value": "123456789" }]}) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == 
{"MR::6789"} + rec = pii.PIIRecord(**{"identifiers": [{ "type": "MR", "value": "89" }]}) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR::89"} + + #test multiple identifiers return correctly + rec = pii.PIIRecord(identifiers=[ + pii.Identifier(type="MR", value="123456789"), + pii.Identifier(type="SS", value="123456789"), + ]) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR::6789", "SS::6789"} def test_blocking_keys_sex(self): rec = pii.PIIRecord(**{"gender": "M"}) @@ -388,11 +428,20 @@ def test_blocking_keys_email_first_four(self): rec = pii.PIIRecord(**{"telecom": [{"value": "t@gmail.com", "system": "email"}, {"value": "bob@gmail.com", "system": "other"}]}) assert rec.blocking_keys(BlockingKey.EMAIL) == {"t@gm"} + def test_blocking_keys_identifier(self): + rec = pii.PIIRecord(**{"identifiers": []}) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == set() + rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "123456789", "authority": "NY"}]}) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR:NY:6789"} + + #test only get first 2 characters of authority for blocking + rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "123456789", "authority": "DMV"}]}) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR:DM:6789"} def test_blocking_values(self): rec = pii.PIIRecord( **{ - "mrn": "123456", + "identifiers": [{"type": "MR", "value": "3456"}], "birth_date": "1980-01-01", "name": [{"given": ["John", "William"], "family": "Doe"}], } @@ -401,8 +450,8 @@ def test_blocking_values(self): for key, val in rec.blocking_values(): if key == BlockingKey.BIRTHDATE: assert val == "1980-01-01" - elif key == BlockingKey.MRN: - assert val == "3456" + elif key == BlockingKey.IDENTIFIER: + assert val == "MR::3456" elif key == BlockingKey.FIRST_NAME: assert val == "John" elif key == BlockingKey.LAST_NAME: