bugfix: bulk insert serialization of data #136

Merged (1 commit, Nov 19, 2024)

src/recordlinker/database/mpi_service.py (1 addition, 1 deletion)
@@ -132,7 +132,7 @@ def bulk_insert_patients(
     pat_data = [
         {
             "person_id": person and person.id,
-            "_data": record.to_json(prune_empty=True),
+            "_data": record.to_dict(prune_empty=True),
             "external_patient_id": record.external_id,
             "external_person_id": external_person_id,
             "external_person_source": "IRIS" if external_person_id else None,
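
Why the one-word change matters: SQLAlchemy's JSON column type serializes Python values itself, so handing it a string that is already JSON gets that string encoded a second time, and the stored value reads back as a str instead of a dict. The sketch below illustrates the difference with a hypothetical Demo model and an in-memory SQLite database; it is not the project's Patient model, only the same column type.

import json

import sqlalchemy as sa
from sqlalchemy import orm

Base = orm.declarative_base()


class Demo(Base):
    # hypothetical stand-in for the Patient table; only the JSON column matters here
    __tablename__ = "demo"
    id = sa.Column(sa.Integer, primary_key=True)
    _data = sa.Column(sa.JSON)


engine = sa.create_engine("sqlite://")
Base.metadata.create_all(engine)

record = {"name": [{"given": ["Johnathon"], "family": "Smith"}]}
with orm.Session(engine) as session:
    session.execute(
        sa.insert(Demo),
        [
            {"_data": record},              # like to_dict(): stored as a JSON object
            {"_data": json.dumps(record)},  # like to_json(): the string is JSON-encoded again
        ],
    )
    stored = session.execute(sa.select(Demo._data)).scalars().all()
    assert isinstance(stored[0], dict)  # reads back as a dict
    assert isinstance(stored[1], str)   # reads back as a string containing JSON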

src/recordlinker/models/mpi.py (1 addition, 4 deletions)
@@ -1,5 +1,4 @@
 import enum
-import json
 import uuid
 
 from sqlalchemy import orm
@@ -96,12 +95,10 @@ def record(self, value):
         from recordlinker.schemas import pii
 
         assert isinstance(value, pii.PIIRecord), "Expected a PIIRecord object"
-        # convert the data to a JSON string, then load it back as a dictionary
-        # this is necessary to ensure all data elements are JSON serializable
         # recursively remove all None and unset values from the data
         # this is an optimization to reduce the amount of data stored in the
         # database, if a value is empty, no need to store it
-        self._data = json.loads(value.to_json(prune_empty=True))
+        self._data = value.to_dict(prune_empty=True)
         if hasattr(self, "_record"):
             # if the record property is cached, delete it
             del self._record
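
With the new helper, the ORM setter above and bulk_insert_patients share one serialization path and both store a pruned plain dict. A minimal sketch of that equivalence, using a hypothetical MiniRecord in place of pii.PIIRecord and assuming pydantic v2:

import json

import pydantic


class MiniRecord(pydantic.BaseModel):
    # hypothetical stand-in for pii.PIIRecord, carrying the same two helpers
    sex: str | None = None
    external_id: str | None = None

    def to_json(self, prune_empty: bool = False) -> str:
        return self.model_dump_json(exclude_unset=prune_empty, exclude_none=prune_empty)

    def to_dict(self, prune_empty: bool = False) -> dict:
        return json.loads(self.to_json(prune_empty=prune_empty))


rec = MiniRecord(sex="F")
# the old setter expression and the new helper agree on the stored value ...
assert json.loads(rec.to_json(prune_empty=True)) == rec.to_dict(prune_empty=True) == {"sex": "F"}
# ... while the old bulk-insert path handed the JSON column a string instead of a dict
assert isinstance(rec.to_json(prune_empty=True), str)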

src/recordlinker/schemas/pii.py (10 additions, 0 deletions)
@@ -1,5 +1,6 @@
 import datetime
 import enum
+import json
 import re
 import typing

@@ -288,6 +289,15 @@ def to_json(self, prune_empty: bool = False) -> str:
         """
         return self.model_dump_json(exclude_unset=prune_empty, exclude_none=prune_empty)
 
+    def to_dict(self, prune_empty: bool = False) -> dict:
+        """
+        Convert the PIIRecord object to a dictionary.
+        """
+        # convert the data to a JSON string, then load it back as a dictionary
+        # this is necessary to ensure all data elements are JSON serializable
+        data = self.to_json(prune_empty=prune_empty)
+        return json.loads(data)
+
     def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
         """
         Given a field name, return an iterator of all string values for that field.
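
The round trip through model_dump_json, rather than a direct model_dump(), is what keeps nested values such as dates JSON-serializable before they reach the database. A small illustration of that difference, using a hypothetical Toy model (pydantic v2 assumed) rather than the real PIIRecord:

import datetime
import json

import pydantic


class Toy(pydantic.BaseModel):
    # hypothetical model with a date field, similar in spirit to PIIRecord's birth_date
    birth_date: datetime.date | None = None
    sex: str | None = None


rec = Toy(birth_date=datetime.date(1950, 1, 1))

plain = rec.model_dump(exclude_none=True)
# {'birth_date': datetime.date(1950, 1, 1)} -- a date object that json.dumps() rejects
assert isinstance(plain["birth_date"], datetime.date)

as_json = json.loads(rec.model_dump_json(exclude_none=True))
assert as_json == {"birth_date": "1950-01-01"}  # plain strings, safe for a JSON column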

tests/unit/database/test_mpi_service.py (3 additions, 4 deletions)
@@ -5,7 +5,6 @@
 This module contains the unit tests for the recordlinker.database.mpi_service module.
 """
 
-import json
 import uuid
 
 import pytest
@@ -253,7 +252,7 @@ def test_no_person(self, session):
         patients = mpi_service.bulk_insert_patients(session, [rec], external_person_id="123456")
         assert len(patients) == 1
         assert patients[0].person_id is None
-        assert json.loads(patients[0].data) == {
+        assert patients[0].data == {
             "name": [{"given": ["Johnathon"], "family": "Smith"}]
         }
         assert patients[0].external_person_id == "123456"
@@ -280,11 +279,11 @@ def test_with_person(self, session):
         assert len(patients) == 2
         assert patients[0].person_id == person.id
         assert patients[1].person_id == person.id
-        assert json.loads(patients[0].data) == {
+        assert patients[0].data == {
             "birth_date": "1950-01-01",
             "name": [{"given": ["George"], "family": "Harrison"}],
         }
-        assert json.loads(patients[1].data) == {
+        assert patients[1].data == {
             "birth_date": "1950-01-01",
             "name": [{"given": ["George", "Harold"], "family": "Harrison"}],
         }

tests/unit/routes/test_seed_router.py (27 additions, 1 deletion)
@@ -5,6 +5,8 @@
 This module contains the unit tests for the recordlinker.routes.seed_router module.
 """
 
+import unittest.mock as mock
+
 from conftest import load_test_json_asset
 
 from recordlinker import models
@@ -20,7 +22,10 @@ def test_too_many_clusters(self, client):
         data = {"clusters": [{"records": []} for _ in range(101)]}
         response = client.post("/seed", json=data)
         assert response.status_code == 422
-        assert response.json()["detail"][0]["msg"] == "Value error, Clusters must not exceed 100 records"
+        assert (
+            response.json()["detail"][0]["msg"]
+            == "Value error, Clusters must not exceed 100 records"
+        )
 
     def test_large_batch(self, client):
         data = load_test_json_asset("seed_test.json.gz")
@@ -35,6 +40,27 @@ def test_large_batch(self, client):
         assert client.session.query(models.Patient).count() == 1285
         assert client.session.query(models.BlockingValue).count() == 8995
 
+    @mock.patch("recordlinker.database.algorithm_service.default_algorithm")
+    def test_seed_and_link(self, mock_algorithm, basic_algorithm, client):
+        mock_algorithm.return_value = basic_algorithm
+        record = {
+            "birth_date": "1956-09-06",
+            "sex": "F",
+            "address": [{"line": ["581 Baker Club"], "postal_code": "80373"}],
+            "name": [
+                {
+                    "family": "Cervantes",
+                    "given": ["Jason"],
+                }
+            ],
+        }
+        seed_resp = client.post("/seed", json={"clusters": [{"records": [record]}]})
+        assert seed_resp.status_code == 201
+        persons = seed_resp.json()["persons"]
+        assert len(persons) == 1
+        response = client.post("/link", json={"record": record})
+        assert response.status_code == 200
+
 
 class TestReset:
     def test_reset(self, client):
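
The new test drives the seed-then-link flow end to end through the test client. The same flow can be exercised against a running instance; the sketch below assumes a local deployment at http://localhost:8080 and the httpx client, both of which are assumptions rather than part of this change.

import httpx

# hypothetical host/port; point this at wherever the recordlinker API is running
BASE = "http://localhost:8080"

record = {
    "birth_date": "1956-09-06",
    "sex": "F",
    "address": [{"line": ["581 Baker Club"], "postal_code": "80373"}],
    "name": [{"family": "Cervantes", "given": ["Jason"]}],
}

# seed a single-record cluster; the response lists the persons that were created
seed_resp = httpx.post(f"{BASE}/seed", json={"clusters": [{"records": [record]}]})
assert seed_resp.status_code == 201
assert len(seed_resp.json()["persons"]) == 1

# link the same record; the seeded patient should now be available as a candidate match
link_resp = httpx.post(f"{BASE}/link", json={"record": record})
assert link_resp.status_code == 200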