ID Triplet Feature (#146)
## Description
- Implement the ID Triplet as described in [RFC-001](#125)
- Replace MRN, SSN and DRIVERS_LICENSE with the new IDENTIFIER triplet concept introduced in RFC-001.

## Related Issues
closes #125 

## Additional Notes
- [x] Update the `/link` endpoint to accept an identifier triplet (see the example request after this list)
- [x] Create blocking values on `IDENTIFIER` values
- [x] Feature match on `IDENTIFIER` and/or `IDENTIFIER:XXX` values
- [x] Add new test cases for blocking and feature matching on the new values
- [x] Update documentation in reference.md regarding the new blocking key and feature
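
For illustration, a minimal sketch of a `/link` request using the new `identifiers` triplet in place of the old top-level `mrn` field. The payload shape is taken from the updated smoke tests below; the `requests` call and localhost URL are assumptions for local testing, not part of this change.

```python
import requests  # any HTTP client works; requests is assumed here

# Hypothetical local call to the /link endpoint with the new identifier triplet.
payload = {
    "record": {
        "birth_date": "2053-11-07",
        "sex": "M",
        "identifiers": [{"value": "123456789", "type": "MR"}],
        "name": [{"family": "Shepard", "given": ["John"]}],
    }
}
response = requests.post("http://localhost:8080/link", json=payload)
print(response.json())
```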

Co-authored-by: Eric Buckley <eric.buckley@gmail.com>
cbrinson-rise8 and ericbuckley authored Jan 17, 2025
1 parent 68503f4 commit 663222f
Showing 21 changed files with 755 additions and 218 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/check_smoke_tests.yml
@@ -53,8 +53,8 @@ jobs:
done
# Run smoke tests and print the response
JSON_BODY_1='{"record": {"birth_date": "2053-11-07", "sex": "M", "mrn": "1234567890", "name":[{"family":"Shepard", "given":["John"]}]}}'
JSON_BODY_2='{"algorithm": "dibbs-enhanced", "record": {"birth_date": "2000-12-06", "sex": "M", "mrn": "9876543210", "name":[{"family":"Smith", "given":["William"]}]}}'
JSON_BODY_1='{"record": {"birth_date": "2053-11-07", "sex": "M", "identifiers":[{"value": "123456789", "type": "MR"}], "name":[{"family":"Shepard", "given":["John"]}]}}'
JSON_BODY_2='{"algorithm": "dibbs-enhanced", "record": {"birth_date": "2000-12-06", "sex": "M", "identifiers":[{"value": "9876543210", "type": "MR"}], "name":[{"family":"Smith", "given":["William"]}]}}'
#basic tests
RESPONSE_1=$(curl -s -X POST http://localhost:8080/link \
24 changes: 10 additions & 14 deletions docs/site/reference.md
@@ -15,14 +15,6 @@ linkage evaluation phase. The following features are supported:

: The patient's birthdate (normalized to `YYYY-MM-DD`).

`MRN`

: The patient's medical record number.

`SSN`

: The patient's social security number.

`SEX`

: The patient's sex (normalized to `M`, `F`, or `U` for unknown).
@@ -83,9 +75,13 @@ linkage evaluation phase. The following features are supported:

: The patient's email address.

`DRIVERS_LICENSE`

: The patient's driver's license number.

`IDENTIFIER`

: An identifier for the patient. Matching on this checks whether any identifier type/authority/value combination matches.

`IDENTIFIER:<type>`

: A specific identifier type for the patient. For example, `IDENTIFIER:MR` would be the patient's medical record number. Unlike `IDENTIFIER`, this will ONLY compare values of the given type. Valid type codes are listed at http://hl7.org/fhir/R4/v2/0203/index.html.
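
As a rough illustration of the matching semantics described above, here is a standalone sketch with a hypothetical helper; it is not the library's implementation.

```python
# Sketch only: IDENTIFIER compares every triplet, IDENTIFIER:<type> restricts
# the comparison to triplets of one type code (e.g. "MR").
def identifiers_match(a, b, type_code=None):
    def triplets(identifiers):
        return {
            (i["type"], i.get("authority", ""), i["value"])
            for i in identifiers
            if type_code is None or i["type"] == type_code
        }
    return bool(triplets(a) & triplets(b))

rec_a = [{"type": "MR", "value": "123456789"}]
rec_b = [{"type": "MR", "value": "123456789"}, {"type": "SS", "value": "111-22-3333"}]
assert identifiers_match(rec_a, rec_b)                    # IDENTIFIER
assert identifiers_match(rec_a, rec_b, type_code="MR")    # IDENTIFIER:MR
assert not identifiers_match(rec_a, rec_b, type_code="SS")
```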


### Blocking Key Types
@@ -97,10 +93,6 @@ patient data and used during query retrieval. The following blocking key types are supported:

: The patient's birthdate in the format `YYYY-MM-DD`.

`MRN` (ID: **2**)

: The last 4 characters of a patient's medical record number.

`SEX` (ID: **3**)

: The patient's sex in the format of `M`, `F`, or `U` for unknown.
@@ -129,6 +121,10 @@ patient data and used during query retrieval. The following blocking key types are supported:

: The first 4 characters of the patient's email address.

`IDENTIFIER` (ID: **10**)

: The identifier triplet containing only the type, authority, and last 4 characters of the value.
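
For context, a minimal sketch of how such a blocking value could be composed, following the "type:authority:value" format described in `models/mpi.py` further down. The helper is illustrative only, not the project's code.

```python
# Illustrative only: compose an IDENTIFIER blocking value from a triplet,
# keeping the type, the authority, and the last 4 characters of the value.
def identifier_blocking_value(type_code: str, authority: str, value: str) -> str:
    return f"{type_code}:{authority}:{value[-4:]}"

print(identifier_blocking_value("MR", "", "123456789"))  # MR::6789
```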


### Evaluation Functions

1 change: 1 addition & 0 deletions pyproject.toml
@@ -43,6 +43,7 @@ dev = [
"ruff",
"mypy",
"types-python-dateutil",
"faker",
# Testing
"pytest>=8.3",
"pytest-cov",
112 changes: 112 additions & 0 deletions scripts/gen_seed_test_data.py
@@ -0,0 +1,112 @@
#!/usr/bin/env python
"""
scripts/gen_seed_test_data.py
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Script to generate test data for the /seed endpoint in the RecordLinker project.
The script will emit a JSON object to STDOUT containing a list of clusters, each with a
list of PII records. By default, 100 clusters will be generated, each with a random number
of PII records up to 25. Those values can be adjusted; see --help for more information.
"""

import argparse
import random

from faker import Faker

from recordlinker import schemas
from recordlinker.schemas.identifier import Identifier
from recordlinker.schemas.pii import Address
from recordlinker.schemas.pii import Gender
from recordlinker.schemas.pii import Name
from recordlinker.schemas.pii import Race
from recordlinker.schemas.pii import Sex
from recordlinker.schemas.pii import Telecom


def _generate_random_identifiers(count, faker):
"""
Given a count of identifiers to generate, generate a list of
MRNs, SSNs, and driver's licenses.
"""
for idx in range(count):
if idx % 3 == 0:
# make mrn
yield Identifier(type="MR", value=faker.bothify(text="MRN-#######"))
if idx % 3 == 1:
# make ssn
yield Identifier(type="SS", value=faker.ssn())
if idx % 3 == 2:
# make drivers_license
yield Identifier(
type="DL", value=faker.bothify(text="DL-######"), authority=faker.state_abbr()
)


# Generate a random PII record populated with faker data
def _generate_random_pii_record(faker):
return schemas.PIIRecord(
external_id=faker.uuid4(),
birth_date=faker.date_of_birth(minimum_age=0, maximum_age=100),
sex=random.choice(list(Sex)),
address=[
Address(
line=[faker.street_address()],
city=faker.city(),
state=faker.state_abbr(),
postal_code=faker.zipcode(),
county=faker.city(),
country=faker.country_code(),
latitude=faker.latitude(),
longitude=faker.longitude(),
)
],
name=[
Name(
family=faker.last_name(),
given=[faker.first_name()],
use=random.choice(["official", "usual", "nickname"]),
)
],
telecom=[
Telecom(
value=faker.phone_number(),
system="phone",
use=random.choice(["home", "work", "mobile"]),
)
],
race=random.choice(list(Race)),
gender=random.choice(list(Gender)),
identifiers=list(_generate_random_identifiers(random.randint(1, 3), faker)),
)


def main() -> None:
"""
Main entry point for the script.
"""
parser = argparse.ArgumentParser(description="Generate test data for the /seed endpoint")
parser.add_argument("--count", type=int, default=100, help="The number of clusters to generate")
parser.add_argument(
"--max-per-cluster", type=int, default=25, help="The maximum number of records per cluster"
)

args = parser.parse_args()

faker = Faker()
clusters = []
for _ in range(args.count):
cluster = schemas.Cluster(
external_person_id=f"EP:{str(faker.uuid4())}",
records=[
_generate_random_pii_record(faker)
for _ in range(random.randint(1, args.max_per_cluster))
],
)
clusters.append(cluster)
print(schemas.ClusterGroup(clusters=clusters).model_dump_json(indent=2))


if __name__ == "__main__":
main()
8 changes: 4 additions & 4 deletions src/recordlinker/assets/initial_algorithms.json
@@ -9,7 +9,7 @@
{
"blocking_keys": [
"BIRTHDATE",
"MRN",
"IDENTIFIER",
"SEX"
],
"evaluators": [
@@ -75,7 +75,7 @@
{
"blocking_keys": [
"BIRTHDATE",
"MRN",
"IDENTIFIER",
"SEX"
],
"evaluators": [
@@ -106,7 +106,7 @@
"CITY": 2.438553006137189,
"FIRST_NAME": 6.849475906891162,
"LAST_NAME": 6.350720397426025,
"MRN": 0.3051262572525359,
"IDENTIFIER:MR": 0.3051262572525359,
"SEX": 0.7510419059643679,
"STATE": 0.022376768992488694,
"ZIP": 4.975031471124867
@@ -148,7 +148,7 @@
"CITY": 2.438553006137189,
"FIRST_NAME": 6.849475906891162,
"LAST_NAME": 6.350720397426025,
"MRN": 0.3051262572525359,
"IDENTIFIER:MR": 0.3051262572525359,
"SEX": 0.7510419059643679,
"STATE": 0.022376768992488694,
"ZIP": 4.975031471124867
22 changes: 7 additions & 15 deletions src/recordlinker/hl7/fhir.py
@@ -33,26 +33,18 @@ def fhir_record_to_pii_record(fhir_record: dict) -> schemas.PIIRecord:
"birthDate": fhir_record.get("birthDate"),
"sex": fhir_record.get("gender"),
"address": fhir_record.get("address", []),
"mrn": None,
"ssn": None,
"race": None,
"gender": None,
"telecom": fhir_record.get("telecom", []),
"drivers_license": None,
"identifiers": [],
}
for identifier in fhir_record.get("identifier", []):
for coding in identifier.get("type", {}).get("coding", []):
if coding.get("code") == "MR":
val["mrn"] = identifier.get("value")
elif coding.get("code") == "SS":
val["ssn"] = identifier.get("value")
elif coding.get("code") == "DL":
license_number = identifier.get("value")
authority = identifier.get("assigner", {}).get("identifier", {}).get("value", "") # Assuming `issuer` contains authority info
val["drivers_license"] = {
"value": license_number,
"authority": authority
}
for code in identifier.get("type", {}).get("coding", []):
val["identifiers"].append({
"value": identifier.get("value"),
"type": code.get("code"),
"authority": identifier.get("assigner", {}).get("identifier", {}).get("value", ""),
})
for address in val["address"]:
address["county"] = address.get("district", "")
for extension in address.get("extension", []):
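
To make the identifier mapping above concrete, here is a hypothetical input/output pair under the dict shapes shown in this hunk (sample values only, not taken from the diff).

```python
# Hypothetical FHIR identifier entry as it might appear in fhir_record["identifier"].
fhir_identifier = {
    "type": {"coding": [{"code": "MR"}]},
    "value": "123456789",
    "assigner": {"identifier": {"value": "HOSPITAL-A"}},
}
# The loop above would append this triplet to val["identifiers"]:
# {"value": "123456789", "type": "MR", "authority": "HOSPITAL-A"}
```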
2 changes: 1 addition & 1 deletion src/recordlinker/linking/link.py
@@ -51,7 +51,7 @@ def compare(
details: dict[str, typing.Any] = {"patient.reference_id": str(patient.reference_id)}
for e in evals:
# TODO: can we do this check earlier?
feature = getattr(schemas.Feature, e.feature, None)
feature = schemas.Feature.parse(e.feature)
if feature is None:
raise ValueError(f"Invalid comparison field: {e.feature}")
# Evaluate the comparison function and append the result to the list
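
A quick illustration of the parsing behavior this change relies on. This is a sketch; the exact return type is not shown in the diff, but the validator in `schemas/algorithm.py` below treats an unrecognized name as a `ValueError` from `Feature.parse`.

```python
from recordlinker import schemas

# Both the generic and type-scoped spellings from reference.md are expected to parse.
generic = schemas.Feature.parse("IDENTIFIER")
scoped = schemas.Feature.parse("IDENTIFIER:MR")
# matchers.py below keys fuzzy params and log odds off the base attribute.
print(str(scoped.attribute))
```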
6 changes: 3 additions & 3 deletions src/recordlinker/linking/matchers.py
@@ -179,7 +179,7 @@ def compare_fuzzy_match(
beyond which to classify the strings as a partial match.
:return: A float indicating whether the features are a fuzzy match.
"""
similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs)
comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
for x in record.feature_iter(key):
for y in patient.record.feature_iter(key):
@@ -203,11 +203,11 @@ def compare_probabilistic_fuzzy_match(
beyond which to classify the strings as a partial match.
:return: A float of the score the feature comparison earned.
"""
log_odds = kwargs.get("log_odds", {}).get(str(key))
log_odds = kwargs.get("log_odds", {}).get(str(key.attribute))
if log_odds is None:
raise ValueError(f"Log odds not found for feature {key}")

similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs)
comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
max_score = 0.0
for x in patient.record.feature_iter(key):
2 changes: 1 addition & 1 deletion src/recordlinker/models/mpi.py
@@ -121,14 +121,14 @@ class BlockingKey(enum.Enum):
"""

BIRTHDATE = ("BIRTHDATE", 1, "Date of birth as YYYY-MM-DD")
MRN = ("MRN", 2, "Last 4 characters of Medical record number")
SEX = ("SEX", 3, "Sex at birth; M, F or U")
ZIP = ("ZIP", 4, "5 digit US Postal Code")
FIRST_NAME = ("FIRST_NAME", 5, "First 4 characters of the first name")
LAST_NAME = ("LAST_NAME", 6, "First 4 characters of the last name")
ADDRESS = ("ADDRESS", 7, "First 4 characters of the address")
PHONE = ("PHONE", 8, "Last 4 characters of the phone number")
EMAIL = ("EMAIL", 9, "First 4 characters of the email address")
IDENTIFIER = ("IDENTIFIER", 10, "Identifier triplet with only the last 4 characters of the value. Format \"type:authority:value\"")

def __init__(self, value: str, _id: int, description: str):
self._value = value
2 changes: 2 additions & 0 deletions src/recordlinker/schemas/__init__.py
@@ -13,6 +13,7 @@
from .mpi import PatientRef
from .mpi import PersonRef
from .pii import Feature
from .pii import FeatureAttribute
from .pii import PIIRecord
from .seed import Cluster
from .seed import ClusterGroup
@@ -24,6 +25,7 @@
"AlgorithmPass",
"AlgorithmSummary",
"Feature",
"FeatureAttribute",
"PIIRecord",
"Prediction",
"LinkInput",
12 changes: 11 additions & 1 deletion src/recordlinker/schemas/algorithm.py
@@ -23,9 +23,19 @@ class Evaluator(pydantic.BaseModel):

model_config = pydantic.ConfigDict(from_attributes=True, use_enum_values=True)

feature: Feature
feature: str = pydantic.Field(json_schema_extra={"enum": Feature.all_options()})
func: matchers.FeatureFunc

@pydantic.field_validator("feature", mode="before")
def validate_feature(cls, value):
"""
Validate the feature is a valid PII feature.
"""
try:
Feature.parse(value)
except ValueError as e:
raise ValueError(f"Invalid feature: '{value}'. {e}")
return value

class AlgorithmPass(pydantic.BaseModel):
"""
