ID Triplet Feature (#146)
## Description
- Implement the ID Triplet as described in [RFC-001](#125)
- Replace MRN, SSN and DRIVERS_LICENSE with the new IDENTIFIER triplet concept introduced in RFC-001.

## Related Issues
closes #125 

## Additional Notes
- [x] Update the `/link` endpoint to accept an identifier triplet (see the example request after this list)
- [x] Create blocking values on `IDENTIFIER` values
- [x] Feature match on `IDENTIFIER` and/or `IDENTIFIER:XXX` values
- [x] Add new test cases for blocking and feature matching on the new values
- [x] Update documentation in reference.md regarding the new blocking key and feature
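
For illustration, a minimal sketch of a `/link` request using the new `identifiers` triplet in place of the old top-level `mrn` field. The payload shape is taken from the updated smoke tests below; the `requests` call and localhost URL are assumptions for local testing, not part of this change.

```python
import requests  # any HTTP client works; requests is assumed here

# Hypothetical local call to the /link endpoint with the new identifier triplet.
payload = {
    "record": {
        "birth_date": "2053-11-07",
        "sex": "M",
        "identifiers": [{"value": "123456789", "type": "MR"}],
        "name": [{"family": "Shepard", "given": ["John"]}],
    }
}
response = requests.post("http://localhost:8080/link", json=payload)
print(response.json())
```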

Co-authored-by: Eric Buckley <eric.buckley@gmail.com>
cbrinson-rise8 and ericbuckley authored Jan 17, 2025
1 parent 68503f4 commit 663222f
Showing 21 changed files with 755 additions and 218 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/check_smoke_tests.yml
@@ -53,8 +53,8 @@ jobs:
done
# Run smoke tests and print the response
JSON_BODY_1='{"record": {"birth_date": "2053-11-07", "sex": "M", "mrn": "1234567890", "name":[{"family":"Shepard", "given":["John"]}]}}'
JSON_BODY_2='{"algorithm": "dibbs-enhanced", "record": {"birth_date": "2000-12-06", "sex": "M", "mrn": "9876543210", "name":[{"family":"Smith", "given":["William"]}]}}'
JSON_BODY_1='{"record": {"birth_date": "2053-11-07", "sex": "M", "identifiers":[{"value": "123456789", "type": "MR"}], "name":[{"family":"Shepard", "given":["John"]}]}}'
JSON_BODY_2='{"algorithm": "dibbs-enhanced", "record": {"birth_date": "2000-12-06", "sex": "M", "identifiers":[{"value": "9876543210", "type": "MR"}], "name":[{"family":"Smith", "given":["William"]}]}}'
#basic tests
RESPONSE_1=$(curl -s -X POST http://localhost:8080/link \
24 changes: 10 additions & 14 deletions docs/site/reference.md
@@ -15,14 +15,6 @@ linkage evaluation phase. The following features are supported:

: The patient's birthdate (normalized to `YYYY-MM-DD`).

`MRN`

: The patient's medical record number.

`SSN`

: The patient's social security number.

`SEX`

: The patient's sex (normalized to `M`, `F`, or `U` for unknown).
@@ -83,9 +75,13 @@ linkage evaluation phase. The following features are supported:

: The patient's email address.

`DRIVERS_LICENSE`

: The patient's driver's license number.

`IDENTIFIER`

: An identifier for the patient. Matching on this checks whether any identifier type/authority/value combination matches.

`IDENTIFIER:<type>`

: A specific identifier type for the patient. For example, `IDENTIFIER:MR` would be the patient's medical record number. Unlike `IDENTIFIER`, this will ONLY compare values of the given type. Valid type codes are listed at http://hl7.org/fhir/R4/v2/0203/index.html.
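
As a rough illustration of the matching semantics described above, here is a standalone sketch with a hypothetical helper; it is not the library's implementation.

```python
# Sketch only: IDENTIFIER compares every triplet, IDENTIFIER:<type> restricts
# the comparison to triplets of one type code (e.g. "MR").
def identifiers_match(a, b, type_code=None):
    def triplets(identifiers):
        return {
            (i["type"], i.get("authority", ""), i["value"])
            for i in identifiers
            if type_code is None or i["type"] == type_code
        }
    return bool(triplets(a) & triplets(b))

rec_a = [{"type": "MR", "value": "123456789"}]
rec_b = [{"type": "MR", "value": "123456789"}, {"type": "SS", "value": "111-22-3333"}]
assert identifiers_match(rec_a, rec_b)                    # IDENTIFIER
assert identifiers_match(rec_a, rec_b, type_code="MR")    # IDENTIFIER:MR
assert not identifiers_match(rec_a, rec_b, type_code="SS")
```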


### Blocking Key Types
@@ -97,10 +93,6 @@ patient data and used during query retrieval. The following blocking key types are supported:

: The patient's birthdate in the format `YYYY-MM-DD`.

`MRN` (ID: **2**)

: The last 4 characters of a patient's medical record number.

`SEX` (ID: **3**)

: The patient's sex in the format of `M`, `F`, or `U` for unknown.
@@ -129,6 +121,10 @@ patient data and used during query retrieval. The following blocking key types are supported:

: The first 4 characters of the patient's email address.

`IDENTIFIER` (ID: **10**)

: The identifier triplet containing only the type, authority, and last 4 characters of the value.
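
For context, a minimal sketch of how such a blocking value could be composed, following the "type:authority:value" format described in `models/mpi.py` further down. The helper is illustrative only, not the project's code.

```python
# Illustrative only: compose an IDENTIFIER blocking value from a triplet,
# keeping the type, the authority, and the last 4 characters of the value.
def identifier_blocking_value(type_code: str, authority: str, value: str) -> str:
    return f"{type_code}:{authority}:{value[-4:]}"

print(identifier_blocking_value("MR", "", "123456789"))  # MR::6789
```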


### Evaluation Functions

1 change: 1 addition & 0 deletions pyproject.toml
@@ -43,6 +43,7 @@ dev = [
"ruff",
"mypy",
"types-python-dateutil",
"faker",
# Testing
"pytest>=8.3",
"pytest-cov",
112 changes: 112 additions & 0 deletions scripts/gen_seed_test_data.py
@@ -0,0 +1,112 @@
#!/usr/bin/env python
"""
scripts/gen_seed_test_data.py
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Script to generate test data for the /seed endpoint in the RecordLinker project.
The script will emit a JSON object to STDOUT containing a list of clusters, each with a
list of PII records. By default, 100 clusters will be generated, each with a random number
of PII records up to 25. Those values can be adjusted; see --help for more information.
"""

import argparse
import random

from faker import Faker

from recordlinker import schemas
from recordlinker.schemas.identifier import Identifier
from recordlinker.schemas.pii import Address
from recordlinker.schemas.pii import Gender
from recordlinker.schemas.pii import Name
from recordlinker.schemas.pii import Race
from recordlinker.schemas.pii import Sex
from recordlinker.schemas.pii import Telecom


def _generate_random_identifiers(count, faker):
"""
Given a count of identifiers to generate, generate a list of
MRNs, SSNs, and driver's licenses.
"""
for idx in range(count):
if idx % 3 == 0:
# make mrn
yield Identifier(type="MR", value=faker.bothify(text="MRN-#######"))
if idx % 3 == 1:
# make ssn
yield Identifier(type="SS", value=faker.ssn())
if idx % 3 == 2:
# make drivers_license
yield Identifier(
type="DL", value=faker.bothify(text="DL-######"), authority=faker.state_abbr()
)


# Generate a random PII record populated with faker data
def _generate_random_pii_record(faker):
return schemas.PIIRecord(
external_id=faker.uuid4(),
birth_date=faker.date_of_birth(minimum_age=0, maximum_age=100),
sex=random.choice(list(Sex)),
address=[
Address(
line=[faker.street_address()],
city=faker.city(),
state=faker.state_abbr(),
postal_code=faker.zipcode(),
county=faker.city(),
country=faker.country_code(),
latitude=faker.latitude(),
longitude=faker.longitude(),
)
],
name=[
Name(
family=faker.last_name(),
given=[faker.first_name()],
use=random.choice(["official", "usual", "nickname"]),
)
],
telecom=[
Telecom(
value=faker.phone_number(),
system="phone",
use=random.choice(["home", "work", "mobile"]),
)
],
race=random.choice(list(Race)),
gender=random.choice(list(Gender)),
identifiers=list(_generate_random_identifiers(random.randint(1, 3), faker)),
)


def main() -> None:
"""
Main entry point for the script.
"""
parser = argparse.ArgumentParser(description="Generate test data for the /seed endpoint")
parser.add_argument("--count", type=int, default=100, help="The number of clusters to generate")
parser.add_argument(
"--max-per-cluster", type=int, default=25, help="The maximum number of records per cluster"
)

args = parser.parse_args()

faker = Faker()
clusters = []
for _ in range(args.count):
cluster = schemas.Cluster(
external_person_id=f"EP:{str(faker.uuid4())}",
records=[
_generate_random_pii_record(faker)
for _ in range(random.randint(1, args.max_per_cluster))
],
)
clusters.append(cluster)
print(schemas.ClusterGroup(clusters=clusters).model_dump_json(indent=2))


if __name__ == "__main__":
main()
8 changes: 4 additions & 4 deletions src/recordlinker/assets/initial_algorithms.json
@@ -9,7 +9,7 @@
{
"blocking_keys": [
"BIRTHDATE",
"MRN",
"IDENTIFIER",
"SEX"
],
"evaluators": [
@@ -75,7 +75,7 @@
{
"blocking_keys": [
"BIRTHDATE",
"MRN",
"IDENTIFIER",
"SEX"
],
"evaluators": [
@@ -106,7 +106,7 @@
"CITY": 2.438553006137189,
"FIRST_NAME": 6.849475906891162,
"LAST_NAME": 6.350720397426025,
"MRN": 0.3051262572525359,
"IDENTIFIER:MR": 0.3051262572525359,
"SEX": 0.7510419059643679,
"STATE": 0.022376768992488694,
"ZIP": 4.975031471124867
@@ -148,7 +148,7 @@
"CITY": 2.438553006137189,
"FIRST_NAME": 6.849475906891162,
"LAST_NAME": 6.350720397426025,
"MRN": 0.3051262572525359,
"IDENTIFIER:MR": 0.3051262572525359,
"SEX": 0.7510419059643679,
"STATE": 0.022376768992488694,
"ZIP": 4.975031471124867
22 changes: 7 additions & 15 deletions src/recordlinker/hl7/fhir.py
@@ -33,26 +33,18 @@ def fhir_record_to_pii_record(fhir_record: dict) -> schemas.PIIRecord:
"birthDate": fhir_record.get("birthDate"),
"sex": fhir_record.get("gender"),
"address": fhir_record.get("address", []),
"mrn": None,
"ssn": None,
"race": None,
"gender": None,
"telecom": fhir_record.get("telecom", []),
"drivers_license": None,
"identifiers": [],
}
for identifier in fhir_record.get("identifier", []):
for coding in identifier.get("type", {}).get("coding", []):
if coding.get("code") == "MR":
val["mrn"] = identifier.get("value")
elif coding.get("code") == "SS":
val["ssn"] = identifier.get("value")
elif coding.get("code") == "DL":
license_number = identifier.get("value")
authority = identifier.get("assigner", {}).get("identifier", {}).get("value", "") # Assuming `issuer` contains authority info
val["drivers_license"] = {
"value": license_number,
"authority": authority
}
for code in identifier.get("type", {}).get("coding", []):
val["identifiers"].append({
"value": identifier.get("value"),
"type": code.get("code"),
"authority": identifier.get("assigner", {}).get("identifier", {}).get("value", ""),
})
for address in val["address"]:
address["county"] = address.get("district", "")
for extension in address.get("extension", []):
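
To make the identifier mapping above concrete, here is a hypothetical input/output pair under the dict shapes shown in this hunk (sample values only, not taken from the diff).

```python
# Hypothetical FHIR identifier entry as it might appear in fhir_record["identifier"].
fhir_identifier = {
    "type": {"coding": [{"code": "MR"}]},
    "value": "123456789",
    "assigner": {"identifier": {"value": "HOSPITAL-A"}},
}
# The loop above would append this triplet to val["identifiers"]:
# {"value": "123456789", "type": "MR", "authority": "HOSPITAL-A"}
```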
2 changes: 1 addition & 1 deletion src/recordlinker/linking/link.py
@@ -51,7 +51,7 @@ def compare(
details: dict[str, typing.Any] = {"patient.reference_id": str(patient.reference_id)}
for e in evals:
# TODO: can we do this check earlier?
feature = getattr(schemas.Feature, e.feature, None)
feature = schemas.Feature.parse(e.feature)
if feature is None:
raise ValueError(f"Invalid comparison field: {e.feature}")
# Evaluate the comparison function and append the result to the list
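
A quick illustration of the parsing behavior this change relies on. This is a sketch; the exact return type is not shown in the diff, but the validator in `schemas/algorithm.py` below treats an unrecognized name as a `ValueError` from `Feature.parse`.

```python
from recordlinker import schemas

# Both the generic and type-scoped spellings from reference.md are expected to parse.
generic = schemas.Feature.parse("IDENTIFIER")
scoped = schemas.Feature.parse("IDENTIFIER:MR")
# matchers.py below keys fuzzy params and log odds off the base attribute.
print(str(scoped.attribute))
```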
6 changes: 3 additions & 3 deletions src/recordlinker/linking/matchers.py
@@ -179,7 +179,7 @@ def compare_fuzzy_match(
beyond which to classify the strings as a partial match.
:return: A float indicating whether the features are a fuzzy match.
"""
similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs)
comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
for x in record.feature_iter(key):
for y in patient.record.feature_iter(key):
@@ -203,11 +203,11 @@ def compare_probabilistic_fuzzy_match(
beyond which to classify the strings as a partial match.
:return: A float of the score the feature comparison earned.
"""
log_odds = kwargs.get("log_odds", {}).get(str(key))
log_odds = kwargs.get("log_odds", {}).get(str(key.attribute))
if log_odds is None:
raise ValueError(f"Log odds not found for feature {key}")

similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs)
comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
max_score = 0.0
for x in patient.record.feature_iter(key):
2 changes: 1 addition & 1 deletion src/recordlinker/models/mpi.py
@@ -121,14 +121,14 @@ class BlockingKey(enum.Enum):
"""

BIRTHDATE = ("BIRTHDATE", 1, "Date of birth as YYYY-MM-DD")
MRN = ("MRN", 2, "Last 4 characters of Medical record number")
SEX = ("SEX", 3, "Sex at birth; M, F or U")
ZIP = ("ZIP", 4, "5 digit US Postal Code")
FIRST_NAME = ("FIRST_NAME", 5, "First 4 characters of the first name")
LAST_NAME = ("LAST_NAME", 6, "First 4 characters of the last name")
ADDRESS = ("ADDRESS", 7, "First 4 characters of the address")
PHONE = ("PHONE", 8, "Last 4 characters of the phone number")
EMAIL = ("EMAIL", 9, "First 4 characters of the email address")
IDENTIFIER = ("IDENTIFIER", 10, "Identifier triplet with only the last 4 characters of the value. Format \"type:authority:value\"")

def __init__(self, value: str, _id: int, description: str):
self._value = value
2 changes: 2 additions & 0 deletions src/recordlinker/schemas/__init__.py
@@ -13,6 +13,7 @@
from .mpi import PatientRef
from .mpi import PersonRef
from .pii import Feature
from .pii import FeatureAttribute
from .pii import PIIRecord
from .seed import Cluster
from .seed import ClusterGroup
@@ -24,6 +25,7 @@
"AlgorithmPass",
"AlgorithmSummary",
"Feature",
"FeatureAttribute",
"PIIRecord",
"Prediction",
"LinkInput",
12 changes: 11 additions & 1 deletion src/recordlinker/schemas/algorithm.py
@@ -23,9 +23,19 @@ class Evaluator(pydantic.BaseModel):

model_config = pydantic.ConfigDict(from_attributes=True, use_enum_values=True)

feature: Feature
feature: str = pydantic.Field(json_schema_extra={"enum": Feature.all_options()})
func: matchers.FeatureFunc

@pydantic.field_validator("feature", mode="before")
def validate_feature(cls, value):
"""
Validate the feature is a valid PII feature.
"""
try:
Feature.parse(value)
except ValueError as e:
raise ValueError(f"Invalid feature: '{value}'. {e}")
return value

class AlgorithmPass(pydantic.BaseModel):
"""
