Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expand Feature Comparison Support #88

Merged
merged 8 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docs/process_for_adding_feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Adding a New Feature

### Add the New Feature to the `Feature` Class
- In [src/recordlinker/schemas/pii.py](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py), add the new feature to the [Feature](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py#L12C7-L12C14) enum class.

### Update the `PIIRecord` Schema
- In the same file, modify the [PIIRecord](https://github.com/CDCgov/RecordLinker/blob/c85f555e5da91d54eb8c51e3bdf0789d1e204b2f/src/recordlinker/schemas/pii.py#L97) class to include the new feature as a field.
- If the feature requires predefined values, create an enum to represent those values.

### Modify the `PIIRecord.feature_iter` Method
- Update the [PIIRecord.feature_iter](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py#L246) method so that it yields the value(s) of the new feature when that feature is requested for comparison.

### Extract the FHIR Field in `fhir_record_to_pii_record`
- In [src/recordlinker/linking/link.py](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/linking/link.py), update the [fhir_record_to_pii_record](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/linking/link.py#L26) function to map the relevant FHIR field to the new feature in [PIIRecord](https://github.com/CDCgov/RecordLinker/blob/c85f555e5da91d54eb8c51e3bdf0789d1e204b2f/src/recordlinker/schemas/pii.py#L97).

### Update the Tests
- Add or modify unit tests to verify that the new feature is properly extracted, mapped, and compared.
19 changes: 18 additions & 1 deletion src/recordlinker/linking/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,38 @@
"birthDate": fhir_record.get("birthDate"),
"sex": fhir_record.get("gender"),
"address": fhir_record.get("address", []),
"phone": fhir_record.get("telecom", []),
"mrn": None,
"ssn": None,
"race": None,
"gender": None,
"telecom": fhir_record.get("telecom", []),
}
for identifier in fhir_record.get("identifier", []):
for coding in identifier.get("type", {}).get("coding", []):
if coding.get("code") == "MR":
val["mrn"] = identifier.get("value")
elif coding.get("code") == "SS":
val["ssn"] = identifier.get("value")
for address in val["address"]:
address["county"] = address.get("district", "")
for extension in address.get("extension", []):
if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/geolocation":
for coord in extension.get("extension", []):
if coord.get("url") == "latitude":
address["latitude"] = coord.get("valueDecimal")
elif coord.get("url") == "longitude":
address["longitude"] = coord.get("valueDecimal")
for extension in fhir_record.get("extension", []):
if extension.get("url") == "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race":
for ext in extension.get("extension", []):
if ext.get("url") == "ombCategory":
val["race"] = ext.get("valueCoding", {}).get("display")
if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/individual-genderIdentity":
for ext in extension.get("extension", []):
if ext.get("url") == "value":

Check warning on line 64 in src/recordlinker/linking/link.py

View check run for this annotation

Codecov / codecov/patch

src/recordlinker/linking/link.py#L59-L64

Added lines #L59 - L64 were not covered by tests
for coding in ext.get("valueCodeableConcept", {}).get("coding", []):
val["gender"] = coding.get("display")

return schemas.PIIRecord(**val)


Expand Down
16 changes: 8 additions & 8 deletions src/recordlinker/linking/matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ def feature_match_any(
:param key: The name of the column being evaluated (e.g. "city").
:return: A float indicating whether any value of the feature is shared by both records.
"""
rec_values = set(record.field_iter(key))
rec_values = set(record.feature_iter(key))
if not rec_values:
return 0
pat_values = set(patient.record.field_iter(key))
pat_values = set(patient.record.feature_iter(key))
return float(bool(rec_values & pat_values))


Expand All @@ -114,10 +114,10 @@ def feature_match_exact(
:param key: The name of the column being evaluated (e.g. "city").
:return: A float indicating whether the features are an exact match.
"""
rec_values = set(record.field_iter(key))
rec_values = set(record.feature_iter(key))
if not rec_values:
return 0
pat_values = set(patient.record.field_iter(key))
pat_values = set(patient.record.feature_iter(key))
return float(rec_values == pat_values)


Expand All @@ -138,8 +138,8 @@ def feature_match_fuzzy_string(
"""
similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
for x in record.field_iter(key):
for y in patient.record.field_iter(key):
for x in record.feature_iter(key):
for y in patient.record.feature_iter(key):
score = comp_func(x, y)
if score >= threshold:
return 1
Expand Down Expand Up @@ -168,8 +168,8 @@ def feature_match_log_odds_fuzzy_compare(
similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
max_score = 0.0
for x in patient.record.field_iter(key):
for y in record.field_iter(key):
for x in patient.record.feature_iter(key):
for y in record.feature_iter(key):
# for each permutation of values, find the score and record it if its
# larger than any previous score
max_score = max(comp_func(x, y), max_score)
Expand Down
152 changes: 142 additions & 10 deletions src/recordlinker/schemas/pii.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import enum
import re
import typing

import dateutil.parser
Expand All @@ -22,6 +23,12 @@ class Feature(enum.Enum):
CITY = "CITY"
STATE = "STATE"
ZIP = "ZIP"
SSN = "SSN"
RACE = "RACE"
GENDER = "GENDER"
TELEPHONE = "TELEPHONE"
SUFFIX = "SUFFIX"
COUNTY = "COUNTY"

def __str__(self):
"""
Expand All @@ -45,6 +52,43 @@ def __str__(self):
"""
return self.value

class Race(enum.Enum):
    """
    Closed set of race categories a record can carry.

    Each member's value is identical to its name, so the string form of a
    member is the upper-case category label.
    """

    # categories that name a specific race
    AMERICAN_INDIAN = "AMERICAN_INDIAN"
    ASIAN = "ASIAN"
    BLACK = "BLACK"
    HAWAIIAN = "HAWAIIAN"
    WHITE = "WHITE"
    # catch-all and "no answer" categories
    OTHER = "OTHER"
    ASKED_UNKNOWN = "ASKED_UNKNOWN"
    UNKNOWN = "UNKNOWN"

    def __str__(self) -> str:
        """
        Render the member as its underlying string value.
        """
        return self.value

class Gender(enum.Enum):
    """
    Closed set of gender categories a record can carry.

    Each member's value is identical to its name, so the string form of a
    member is the upper-case category label.
    """

    FEMALE = "FEMALE"
    MALE = "MALE"
    NON_BINARY = "NON_BINARY"
    # "no answer" categories
    ASKED_DECLINED = "ASKED_DECLINED"
    UNKNOWN = "UNKNOWN"

    def __str__(self) -> str:
        """
        Render the member as its underlying string value.
        """
        return self.value


class Name(pydantic.BaseModel):
"""
Expand All @@ -57,7 +101,7 @@ class Name(pydantic.BaseModel):
given: typing.List[str] = []
use: typing.Optional[str] = None
prefix: typing.List[str] = [] # future use
suffix: typing.List[str] = [] # future use
suffix: typing.List[str] = []


class Address(pydantic.BaseModel):
Expand All @@ -76,7 +120,7 @@ class Address(pydantic.BaseModel):
"postal_code", "postalcode", "postalCode", "zip_code", "zipcode", "zipCode", "zip"
),
)
county: typing.Optional[str] = None # future use
county: typing.Optional[str] = None
country: typing.Optional[str] = None
latitude: typing.Optional[float] = None
longitude: typing.Optional[float] = None
Expand Down Expand Up @@ -110,6 +154,9 @@ class PIIRecord(pydantic.BaseModel):
address: typing.List[Address] = []
name: typing.List[Name] = []
telecom: typing.List[Telecom] = []
ssn: typing.Optional[str] = None
race: typing.Optional[Race] = None
gender: typing.Optional[Gender] = None

@classmethod
def model_construct(cls, _fields_set: set[str] | None = None, **values: typing.Any) -> typing.Self:
Expand Down Expand Up @@ -154,8 +201,71 @@ def parse_sex(cls, value):
elif val in ["f", "female"]:
return Sex.FEMALE
return Sex.UNKNOWN

@pydantic.field_validator("ssn", mode="before")
def parse_ssn(cls, value):
    """
    Normalize a social security number to the dashed XXX-XX-XXXX layout.

    :param value: The raw SSN; any truthy value is stringified and stripped.
    :return: The stripped value when it is already dashed, the dashed form of
        a bare nine digit string, or None for empty or unrecognized input.
    """
    if not value:
        return None

    candidate = str(value).strip()
    if re.match(r"^\d{3}-\d{2}-\d{4}$", candidate):
        # already in the standard layout, nothing to reformat
        return candidate

    if len(candidate) == 9 and candidate.isdigit():
        # nine bare digits: re-insert the dashes (XXX-XX-XXXX)
        area, group, serial = candidate[:3], candidate[3:5], candidate[5:]
        return f"{area}-{group}-{serial}"

    return None

@pydantic.field_validator("race", mode="before")
def parse_race(cls, value):
    """
    Parse a free-text race description into a Race enum member.

    Matching is a case-insensitive substring search against a fixed keyword
    table that is scanned in order; the first entry with a hit wins.
    Underscores and hyphens are treated as spaces and runs of whitespace are
    collapsed, so the enum's own values (e.g. "AMERICAN_INDIAN",
    "ASKED_UNKNOWN") map back to the same member when an already-parsed
    record is validated again.

    :param value: The raw race value; Race members pass through untouched and
        any other truthy value is stringified.
    :return: The matching Race member, Race.OTHER when no keyword matches, or
        None when the value is empty.
    """
    if not value:
        return None
    if isinstance(value, Race):
        return value

    # Order matters: the first entry with a matching keyword wins.
    race_mapping = [
        (["american indian", "alaska native"], Race.AMERICAN_INDIAN),
        # "caucasian" contains "asian", so it must be claimed before the
        # Asian entry gets a chance to match it.
        (["caucasian"], Race.WHITE),
        (["asian"], Race.ASIAN),
        (["black", "african american"], Race.BLACK),
        (["white"], Race.WHITE),
        (["hawaiian", "pacific islander"], Race.HAWAIIAN),
        # must precede the bare "unknown" keyword below
        (["asked unknown", "asked but unknown"], Race.ASKED_UNKNOWN),
        (["unknown"], Race.UNKNOWN),
    ]

    # Normalize separators so "AMERICAN_INDIAN" and "african-american" line up
    # with the space-separated keywords above.
    val = " ".join(str(value).lower().replace("_", " ").replace("-", " ").split())
    for substrings, race in race_mapping:
        if any(substring in val for substring in substrings):
            return race
    return Race.OTHER



@pydantic.field_validator("gender", mode="before")
def parse_gender(cls, value):
    """
    Parse a free-text gender description into a Gender enum member.

    The value is first tried as an enum value after normalizing case and
    separators (the enum values are upper-case with underscores), then matched
    against keywords as a fallback.

    :param value: The raw gender value; Gender members pass through untouched
        and any other truthy value is stringified.
    :return: The matching Gender member, Gender.UNKNOWN when nothing matches,
        or None when the value is empty.
    """
    if not value:
        return None
    if isinstance(value, Gender):
        return value

    val = str(value).lower().strip()
    words = val.replace("-", " ").replace("_", " ").split()
    try:
        # e.g. "non binary" / "non-binary" / "NON_BINARY" -> "NON_BINARY"
        return Gender("_".join(words).upper())
    except ValueError:
        pass

    # "female" has to be tested first because it contains "male"
    if "female" in val:
        return Gender.FEMALE
    if "male" in val:
        return Gender.MALE
    # compare with separators removed so every spelling of non-binary matches
    if "nonbinary" in "".join(words):
        return Gender.NON_BINARY
    if "declined" in val or "asked" in val:
        return Gender.ASKED_DECLINED
    return Gender.UNKNOWN

def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
"""
Given a field name, return an iterator of all string values for that field.
Empty strings are not included in the iterator.
Expand Down Expand Up @@ -200,6 +310,28 @@ def field_iter(self, feature: Feature) -> typing.Iterator[str]:
for name in self.name:
if name.family:
yield name.family
elif feature == Feature.SSN:
if self.ssn:
yield self.ssn
elif feature == Feature.RACE:
if self.race:
yield str(self.race)
elif feature == Feature.GENDER:
if self.gender:
yield str(self.gender)
elif feature == Feature.TELEPHONE:
for telecom in self.telecom:
if telecom.value:
yield telecom.value
elif feature == Feature.SUFFIX:
for name in self.name:
for suffix in name.suffix:
if suffix:
yield suffix
elif feature == Feature.COUNTY:
for address in self.address:
if address.county:
yield address.county

def blocking_keys(self, key: models.BlockingKey) -> set[str]:
"""
Expand All @@ -214,19 +346,19 @@ def blocking_keys(self, key: models.BlockingKey) -> set[str]:

if key == models.BlockingKey.BIRTHDATE:
# NOTE: we could optimize here and remove the dashes from the date
vals.update(self.field_iter(Feature.BIRTHDATE))
vals.update(self.feature_iter(Feature.BIRTHDATE))
elif key == models.BlockingKey.MRN:
vals.update({x[-4:] for x in self.field_iter(Feature.MRN)})
vals.update({x[-4:] for x in self.feature_iter(Feature.MRN)})
elif key == models.BlockingKey.SEX:
vals.update(self.field_iter(Feature.SEX))
vals.update(self.feature_iter(Feature.SEX))
elif key == models.BlockingKey.ZIP:
vals.update(self.field_iter(Feature.ZIP))
vals.update(self.feature_iter(Feature.ZIP))
elif key == models.BlockingKey.FIRST_NAME:
vals.update({x[:4] for x in self.field_iter(Feature.FIRST_NAME)})
vals.update({x[:4] for x in self.feature_iter(Feature.FIRST_NAME)})
elif key == models.BlockingKey.LAST_NAME:
vals.update({x[:4] for x in self.field_iter(Feature.LAST_NAME)})
vals.update({x[:4] for x in self.feature_iter(Feature.LAST_NAME)})
elif key == models.BlockingKey.ADDRESS:
vals.update({x[:4] for x in self.field_iter(Feature.ADDRESS)})
vals.update({x[:4] for x in self.feature_iter(Feature.ADDRESS)})

# if any vals are longer than BLOCKING_VALUE_MAX_LENGTH, raise an error
if any(len(x) > models.BLOCKING_VALUE_MAX_LENGTH for x in vals):
Expand Down
Loading
Loading