CDCgov · cbrinson-rise8 · Oct 29, 2024 · Oct 18, 2024 · Oct 28, 2024 · Oct 28, 2024
@@ -0,0 +1,17 @@
+# Adding a New Feature
+
+### Add the New Feature to the `Feature` Class
+- In [src/recordlinker/schemas/pii.py](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py), add the new feature to the [Feature](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py#L12C7-L12C14) enum class.
+
+### Update the `PIIRecord` Schema
+- In the same file, modify the [PIIRecord](https://github.com/CDCgov/RecordLinker/blob/c85f555e5da91d54eb8c51e3bdf0789d1e204b2f/src/recordlinker/schemas/pii.py#L97) class to include the new feature as a field.
+- If the feature requires predefined values, create an enum to represent those values and add a `field_validator` on `PIIRecord` that parses raw input into that enum.
+
+### Modify the `PIIRecord.feature_iter` Method
+- Update the [PIIRecord.feature_iter](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py#L246) method to yield the new feature's value(s) so that the feature can be used for comparison.
+
+### Extract the FHIR Field in `fhir_record_to_pii_record`
+- In [src/recordlinker/linking/link.py](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/linking/link.py), update the [fhir_record_to_pii_record](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/linking/link.py#L26) function to map the relevant FHIR field to the new feature in [PIIRecord](https://github.com/CDCgov/RecordLinker/blob/c85f555e5da91d54eb8c51e3bdf0789d1e204b2f/src/recordlinker/schemas/pii.py#L97).
+
+### Update the Tests
+- Add or modify unit tests to verify that the new feature is properly extracted, mapped, and compared. 
@@ -33,21 +33,38 @@
         "birthDate": fhir_record.get("birthDate"),
         "sex": fhir_record.get("gender"),
         "address": fhir_record.get("address", []),
-        "phone": fhir_record.get("telecom", []),
         "mrn": None,
+        "ssn": None,
+        "race": None,
+        "gender": None,
+        "telecom": fhir_record.get("telecom", []),
     }
     for identifier in fhir_record.get("identifier", []):
         for coding in identifier.get("type", {}).get("coding", []):
             if coding.get("code") == "MR":
                 val["mrn"] = identifier.get("value")
+            elif coding.get("code") == "SS":
+                val["ssn"] = identifier.get("value")
     for address in val["address"]:
+        address["county"] = address.get("district", "")
         for extension in address.get("extension", []):
             if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/geolocation":
                 for coord in extension.get("extension", []):
                     if coord.get("url") == "latitude":
                         address["latitude"] = coord.get("valueDecimal")
                     elif coord.get("url") == "longitude":
                         address["longitude"] = coord.get("valueDecimal")
+    for extension in fhir_record.get("extension", []):
+        if extension.get("url") == "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race":
+            for ext in extension.get("extension", []):
+                if ext.get("url") == "ombCategory":
+                    val["race"] = ext.get("valueCoding", {}).get("display")
+        if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/individual-genderIdentity":
+            for ext in extension.get("extension", []):
+                if ext.get("url") == "value":
+                    for coding in ext.get("valueCodeableConcept", {}).get("coding", []):
+                        val["gender"] = coding.get("display")
+
     return schemas.PIIRecord(**val)
 
 

@@ -95,10 +95,10 @@ def feature_match_any(
     :param key: The name of the column being evaluated (e.g. "city").
     :return: A float indicating whether the features are an exact match.
     """
-    rec_values = set(record.field_iter(key))
+    rec_values = set(record.feature_iter(key))
     if not rec_values:
         return 0
-    pat_values = set(patient.record.field_iter(key))
+    pat_values = set(patient.record.feature_iter(key))
     return float(bool(rec_values & pat_values))
 
 
@@ -114,10 +114,10 @@ def feature_match_exact(
     :param key: The name of the column being evaluated (e.g. "city").
     :return: A float indicating whether the features are an exact match.
     """
-    rec_values = set(record.field_iter(key))
+    rec_values = set(record.feature_iter(key))
     if not rec_values:
         return 0
-    pat_values = set(patient.record.field_iter(key))
+    pat_values = set(patient.record.feature_iter(key))
     return float(rec_values == pat_values)
 
 
@@ -138,8 +138,8 @@ def feature_match_fuzzy_string(
     """
     similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
     comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
-    for x in record.field_iter(key):
-        for y in patient.record.field_iter(key):
+    for x in record.feature_iter(key):
+        for y in patient.record.feature_iter(key):
             score = comp_func(x, y)
             if score >= threshold:
                 return 1
@@ -168,8 +168,8 @@ def feature_match_log_odds_fuzzy_compare(
     similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
     comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
     max_score = 0.0
-    for x in patient.record.field_iter(key):
-        for y in record.field_iter(key):
+    for x in patient.record.feature_iter(key):
+        for y in record.feature_iter(key):
             # for each permutation of values, find the score and record it if its
             # larger than any previous score
             max_score = max(comp_func(x, y), max_score)

@@ -1,5 +1,6 @@
 import datetime
 import enum
+import re
 import typing
 
 import dateutil.parser
@@ -22,6 +23,12 @@ class Feature(enum.Enum):
     CITY = "CITY"
     STATE = "STATE"
     ZIP = "ZIP"
+    SSN = "SSN"
+    RACE = "RACE"
+    GENDER = "GENDER"
+    TELEPHONE = "TELEPHONE"
+    SUFFIX = "SUFFIX"
+    COUNTY = "COUNTY"
 
     def __str__(self):
         """
@@ -45,6 +52,43 @@ def __str__(self):
         """
         return self.value
 
+class Race(enum.Enum):
+    """
+    Enum for the Race field.
+
+    Every member's value is the same text as its name, and ``str()`` on a
+    member returns that value.
+    """
+
+    AMERICAN_INDIAN = "AMERICAN_INDIAN"
+    ASIAN = "ASIAN"
+    BLACK = "BLACK"
+    HAWAIIAN = "HAWAIIAN"
+    WHITE = "WHITE"
+    OTHER = "OTHER"
+    # Distinct from UNKNOWN: kept as a separate member for "asked" responses.
+    ASKED_UNKNOWN = "ASKED_UNKNOWN"
+    UNKNOWN = "UNKNOWN"
+
+    def __str__(self):
+        """
+        Return the value of the enum as a string.
+
+        :return: The member's value (e.g. "ASIAN") rather than the default
+            ``Race.ASIAN`` representation.
+        """
+        return self.value
+
+class Gender(enum.Enum):
+    """
+    Enum for the Gender field.
+
+    Every member's value is the same text as its name, and ``str()`` on a
+    member returns that value.
+    """
+
+    FEMALE = "FEMALE"
+    MALE = "MALE"
+    NON_BINARY = "NON_BINARY"
+    # Distinct from UNKNOWN: kept as a separate member for "asked" responses.
+    ASKED_DECLINED = "ASKED_DECLINED"
+    UNKNOWN = "UNKNOWN"
+
+    def __str__(self):
+        """
+        Return the value of the enum as a string.
+
+        :return: The member's value (e.g. "FEMALE") rather than the default
+            ``Gender.FEMALE`` representation.
+        """
+        return self.value
+
 
 class Name(pydantic.BaseModel):
     """
@@ -57,7 +101,7 @@ class Name(pydantic.BaseModel):
     given: typing.List[str] = []
     use: typing.Optional[str] = None
     prefix: typing.List[str] = []  # future use
-    suffix: typing.List[str] = []  # future use
+    suffix: typing.List[str] = []
 
 
 class Address(pydantic.BaseModel):
@@ -76,7 +120,7 @@ class Address(pydantic.BaseModel):
             "postal_code", "postalcode", "postalCode", "zip_code", "zipcode", "zipCode", "zip"
         ),
     )
-    county: typing.Optional[str] = None  # future use
+    county: typing.Optional[str] = None
     country: typing.Optional[str] = None
     latitude: typing.Optional[float] = None
     longitude: typing.Optional[float] = None
@@ -110,6 +154,9 @@ class PIIRecord(pydantic.BaseModel):
     address: typing.List[Address] = []
     name: typing.List[Name] = []
     telecom: typing.List[Telecom] = []
+    ssn: typing.Optional[str] = None
+    race: typing.Optional[Race] = None
+    gender: typing.Optional[Gender] = None
 
     @classmethod
     def model_construct(cls, _fields_set: set[str] | None = None, **values: typing.Any) -> typing.Self:
@@ -154,8 +201,71 @@ def parse_sex(cls, value):
             elif val in ["f", "female"]:
                 return Sex.FEMALE
             return Sex.UNKNOWN
+
+    @pydantic.field_validator("ssn", mode="before")
+    @classmethod
+    def parse_ssn(cls, value):
+        """
+        Normalize a Social Security Number to the standard XXX-XX-XXXX format.
+
+        :param value: The raw SSN; it is coerced with ``str()`` and stripped of
+            surrounding whitespace before being checked.
+        :return: The SSN as "XXX-XX-XXXX" when the input is already in that
+            format or is exactly nine digits; otherwise None.
+        """
+        if not value:
+            return None
+
+        val = str(value).strip()
+
+        # re.ASCII limits \d to 0-9. str.isdigit() and a Unicode-aware \d also
+        # accept superscripts and non-Latin digits, which are not valid SSNs.
+        if re.fullmatch(r"\d{3}-\d{2}-\d{4}", val, flags=re.ASCII):
+            return val
+
+        if re.fullmatch(r"\d{9}", val, flags=re.ASCII):
+            # Format back to the standard SSN format (XXX-XX-XXXX)
+            return f"{val[:3]}-{val[3:5]}-{val[5:]}"
+
+        return None
+
+    @pydantic.field_validator("race", mode="before")
+    @classmethod
+    def parse_race(cls, value):
+        """
+        Parse the race string into a Race enum.
+
+        :param value: A Race member, or any value whose ``str()`` describes a
+            race (e.g. "Black or African American", "ASKED_UNKNOWN").
+        :return: The matching Race member, Race.OTHER when the text is not
+            recognized, or None when the value is empty.
+        """
+        if isinstance(value, Race):
+            return value
+        if not value:
+            return None
+
+        # Order matters: "asked unknown" must be tested before the bare "unknown".
+        race_mapping = (
+            (("american indian", "alaska native"), Race.AMERICAN_INDIAN),
+            (("asian",), Race.ASIAN),
+            (("black", "african american"), Race.BLACK),
+            (("white",), Race.WHITE),
+            (("hawaiian", "pacific islander"), Race.HAWAIIAN),
+            (("asked unknown", "asked but unknown"), Race.ASKED_UNKNOWN),
+            (("unknown",), Race.UNKNOWN),
+        )
+
+        # Treat underscores and hyphens as spaces so that enum values such as
+        # "AMERICAN_INDIAN" and "ASKED_UNKNOWN" map back to the same member.
+        val = re.sub(r"[\s_-]+", " ", str(value).lower()).strip()
+        for terms, race in race_mapping:
+            # Anchor each term at the start of a word; a plain substring test
+            # finds "asian" inside "caucasian".
+            if any(re.search(rf"\b{re.escape(term)}", val) for term in terms):
+                return race
+        return Race.OTHER
+
+
+
+    @pydantic.field_validator("gender", mode="before")
+    @classmethod
+    def parse_gender(cls, value):
+        """
+        Parse the gender string into a Gender enum.
+
+        :param value: A Gender member, or any value whose ``str()`` describes a
+            gender (e.g. "Female", "non-binary", "Asked but declined").
+        :return: The matching Gender member, Gender.UNKNOWN when the text is
+            not recognized, or None when the value is empty.
+        """
+        if isinstance(value, Gender):
+            return value
+        if not value:
+            return None
+
+        # Collapse whitespace, underscores and hyphens so that "NON_BINARY",
+        # "non-binary" and "non binary" are all handled the same way.
+        words = re.sub(r"[\s_-]+", " ", str(value).lower()).strip()
+
+        # Enum values are upper snake case, so the lookup key has to be
+        # normalized to that form; a lowercased key can never match.
+        by_value = {gender.value: gender for gender in Gender}
+        exact = by_value.get(words.upper().replace(" ", "_"))
+        if exact is not None:
+            return exact
+
+        # "female" must be tested before "male" because it contains it.
+        if "female" in words:
+            return Gender.FEMALE
+        if "male" in words:
+            return Gender.MALE
+        if "nonbinary" in words.replace(" ", ""):
+            return Gender.NON_BINARY
+        if "declined" in words or "asked" in words:
+            return Gender.ASKED_DECLINED
+        return Gender.UNKNOWN
+
+    def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
         """
         Given a field name, return an iterator of all string values for that field.
         Empty strings are not included in the iterator.
@@ -200,6 +310,28 @@ def field_iter(self, feature: Feature) -> typing.Iterator[str]:
             for name in self.name:
                 if name.family:
                     yield name.family
+        elif feature == Feature.SSN:
+            if self.ssn:
+                yield self.ssn
+        elif feature == Feature.RACE:
+            if self.race:
+                yield str(self.race)
+        elif feature == Feature.GENDER:
+            if self.gender:
+                yield str(self.gender)
+        elif feature == Feature.TELEPHONE:
+            for telecom in self.telecom:
+                if telecom.value:
+                    yield telecom.value
+        elif feature == Feature.SUFFIX:
+            for name in self.name:
+                for suffix in name.suffix:
+                    if suffix:
+                        yield suffix
+        elif feature == Feature.COUNTY:
+            for address in self.address:
+                if address.county:
+                    yield address.county
 
     def blocking_keys(self, key: models.BlockingKey) -> set[str]:
         """
@@ -214,19 +346,19 @@ def blocking_keys(self, key: models.BlockingKey) -> set[str]:
 
         if key == models.BlockingKey.BIRTHDATE:
             # NOTE: we could optimize here and remove the dashes from the date
-            vals.update(self.field_iter(Feature.BIRTHDATE))
+            vals.update(self.feature_iter(Feature.BIRTHDATE))
         elif key == models.BlockingKey.MRN:
-            vals.update({x[-4:] for x in self.field_iter(Feature.MRN)})
+            vals.update({x[-4:] for x in self.feature_iter(Feature.MRN)})
         elif key == models.BlockingKey.SEX:
-            vals.update(self.field_iter(Feature.SEX))
+            vals.update(self.feature_iter(Feature.SEX))
         elif key == models.BlockingKey.ZIP:
-            vals.update(self.field_iter(Feature.ZIP))
+            vals.update(self.feature_iter(Feature.ZIP))
         elif key == models.BlockingKey.FIRST_NAME:
-            vals.update({x[:4] for x in self.field_iter(Feature.FIRST_NAME)})
+            vals.update({x[:4] for x in self.feature_iter(Feature.FIRST_NAME)})
         elif key == models.BlockingKey.LAST_NAME:
-            vals.update({x[:4] for x in self.field_iter(Feature.LAST_NAME)})
+            vals.update({x[:4] for x in self.feature_iter(Feature.LAST_NAME)})
         elif key == models.BlockingKey.ADDRESS:
-            vals.update({x[:4] for x in self.field_iter(Feature.ADDRESS)})
+            vals.update({x[:4] for x in self.feature_iter(Feature.ADDRESS)})
 
         # if any vals are longer than the BLOCKING_KEY_MAX_LENGTH, raise an error
         if any(len(x) > models.BLOCKING_VALUE_MAX_LENGTH for x in vals):