From 3db9565ecc5d5294243ce5c04ad9b7cf9bd90a2f Mon Sep 17 00:00:00 2001 From: fredkingham Date: Tue, 15 Nov 2022 22:25:02 +0000 Subject: [PATCH 1/6] WIP Duplicate patient checks, vpn is down so needs to be tested --- plugins/data_quality/checks/__init__.py | 8 +- .../data_quality/checks/find_duplicates.py | 160 ++++++++++++++++++ plugins/data_quality/checks/utils.py | 16 ++ .../templates/emails/duplicate_patients.html | 27 +++ 4 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 plugins/data_quality/checks/find_duplicates.py create mode 100644 plugins/data_quality/checks/utils.py create mode 100644 plugins/data_quality/templates/emails/duplicate_patients.html diff --git a/plugins/data_quality/checks/__init__.py b/plugins/data_quality/checks/__init__.py index da1722690..43eb172cf 100644 --- a/plugins/data_quality/checks/__init__.py +++ b/plugins/data_quality/checks/__init__.py @@ -1,2 +1,8 @@ -daily = [] +from plugins.data_quality.checks import find_duplicates + + +daily = [ + find_duplicates.find_exact_duplicates, + find_duplicates.find_zero_leading_duplicates, +] monthly = [] diff --git a/plugins/data_quality/checks/find_duplicates.py b/plugins/data_quality/checks/find_duplicates.py new file mode 100644 index 000000000..409c8ad5f --- /dev/null +++ b/plugins/data_quality/checks/find_duplicates.py @@ -0,0 +1,160 @@ +import logging +from django.db.models import Count +from opal.core import subrecords +from opal.models import Patient +from elcid.models import Demographics +from . import utils +from plugins.data_quality import logger + + +SUBRECORDS_TO_IGNORE = set([ + 'InitialPatientLoad', + 'Demographics', + 'ContactInformation', + 'NextOfKinDetails', + 'GPDetails', + 'ExternalDemographics', +]) + + + +def find_exact_duplicates(): + """ + Returns + Returns the patients that are duplicates as a list of tuples + where the tuples looks like (from_patient, to_patient,) + ie the defunct patient and the patient to copy records to. + """ + dup_dempgraphics = ( + Demographics.objects.values("hospital_number") + .annotate(cnt=Count("id")) + .filter(cnt__gt=1) + ) + duplicate_hns = [i["hospital_number"] for i in dup_dempgraphics if i["hospital_number"].strip()] + if not duplicate_hns: + # No patients with duplicate hospital numbers our work here is done. + return + dup_patients = [] + for duplicate_hn in duplicate_hns: + patients = tuple(Patient.objects.filter( + demographics__hospital_number=duplicate_hn + )) + if len(patients) > 2: + # In theory this is not wrong, in practice it does not happen, so if it does + # something unexpected has happened. + raise ValueError(f'We have found {len(patients)} for hn {duplicate_hn}') + dup_patients.append(patients) + cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable() + cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable() + email_title = f'{len(cleanable_duplicates) + len (uncleanable_duplicates)} Exact hospital number duplicates' + email_context = { + "title": email_title, + "cleanable_duplicates": cleanable_duplicates, + "uncleanable_duplicates": uncleanable_duplicates + } + utils.send_email( + email_title, + "emails/duplicate_patients.html", + email_context + ) + + +def find_zero_leading_duplicates(): + with_zero_hns = Demographics.objects.filter( + hospital_number__startswith='0').values_list('hospital_number', flat=True + ) + dup_patients = [] + for with_zero in with_zero_hns: + without_zero = with_zero.lstrip('0') + # We have patient numbers like 000. Don't match + # these to an empty string + if len(without_zero) == 0: + continue + with_zero_patients = list(Patient.objects.filter( + demographics__hospital_number=with_zero + )) + without_zero_patients = list(Patient.objects.filter( + demographics__hospital_number=without_zero + )) + if len(with_zero_patients) > 1 or len(without_zero_patient) > 1: + # we expect these patients to be covered by the find_exact_duplicates + # check + continue + if without_zero_patients == 0: + # we have no patients with the zero stripped so lets continue + continue + dup_patients.append(with_zero[0], without_zero_patients[0]) + cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable() + email_title = f'{len(cleanable_duplicates) + len (uncleanable_duplicates)} Exact hospital number duplicates' + email_context = { + "title": email_title, + "cleanable_duplicates": cleanable_duplicates, + "uncleanable_duplicates": uncleanable_duplicates + } + utils.send_email( + email_title, + "emails/duplicate_patients.html", + email_context + ) + + +def has_records(patient): + """ + Returns a list of populated the subrecords connected to the patient + excluding the ones in SUBRECORD_TO_IGNORE + """ + result = [] + for subrecord in subrecords.episode_subrecords(): + if subrecord._is_singleton: + result.extend(subrecord.objects.filter(episode__patient_id=patient.id).exclude(updated=None)) + else: + result.extend(subrecord.objects.filter(episode__patient_id=patient.id)) + for subrecord in subrecords.patient_subrecords(): + if subrecord.__name__ in SUBRECORDS_TO_IGNORE: + continue + if subrecord._is_singleton: + result.extend(subrecord.objects.filter(patient_id=patient.id).exclude(updated=None)) + else: + result.extend(subrecord.objects.filter(patient_id=patient.id)) + return result + + +def calculate_deletable_undeletable(patient_tuples): + """ + Takes a list of tuples of patients that we think are the same. + + Sorts them into ones we can deal with and ones we can't. + + The first item in the tuple returned is a list of patient tuples where + we can safely delete the first one. + + The second item in the tuple returned is a list of patient tuples + where we do not know if we can delete them or not. + + ie + returns a tuple of ( + [(to_delete, to_keep)], + [(unmergable patient 1, unmergable patient 2)] + ) + """ + unmergable = [] + duplicate_patients = [] + for patients in patient_tuples: + patient_1 = patients[0] + patient_1_native_records = has_records(patient_1) + patient_2 = patients[1] + patient_2_native_records = has_records(patient_2) + if len(patient_1_native_records) and len(patient_2_native_records): + logger.info(f'====== Unable to merge hn {duplicate_hn} as they both have duplicate records') + logger.info(f'=== Patient 1 ({patient_1.id}) has:') + for record in patient_1_native_records: + logger.info(f'{record.__class__.__name__} {record.id}') + logger.info(f'=== Patient 2 ({patient_2.id}) has:') + for record in patient_2_native_records: + logger.info(f'{record.__class__.__name__} {record.id}') + unmergable.append((patient_1, patient_2,)) + elif len(patient_1_native_records) and not len(patient_2_native_records): + duplicate_patients.append((patient_2, patient_1)) + else: + duplicate_patients.append((patient_1, patient_2)) + return duplicate_patients, unmergable diff --git a/plugins/data_quality/checks/utils.py b/plugins/data_quality/checks/utils.py new file mode 100644 index 000000000..56bca05c4 --- /dev/null +++ b/plugins/data_quality/checks/utils.py @@ -0,0 +1,16 @@ +from django.core.mail import send_mail +from django.template.loader import render_to_string +from django.utils.html import strip_tags +from django.conf import settings + + +def send_email(title, template_name, context): + html_message = render_to_string(template_name, context) + plain_message = strip_tags(html_message) + send_mail( + title, + plain_message, + settings.DEFAULT_FROM_EMAIL, + settings.ADMINS, + html_message=html_message, + ) diff --git a/plugins/data_quality/templates/emails/duplicate_patients.html b/plugins/data_quality/templates/emails/duplicate_patients.html new file mode 100644 index 000000000..4f7c59995 --- /dev/null +++ b/plugins/data_quality/templates/emails/duplicate_patients.html @@ -0,0 +1,27 @@ + + +

{{ title }}

+

Cleanable patients

+ + + {% for patient_to_delete, patient_to_keep in cleanable_duplicates %} + + + + + {% endfor %} +
Patient to deletePatient to keep
{{ patient_to_delete }}{{ patient_to_keep }}
+ +

Unmergable patients

+ + + {% for patient_1, patient_2 in uncleanable_duplicates %} + + + + + {% endfor %} +
Patient 1Patient 2
{{ patient_1 }}{{ patient_2 }}
+ + + From b5b9c0c7137278158ac03bd97d7cd2b28b6327f5 Mon Sep 17 00:00:00 2001 From: fredkingham Date: Wed, 16 Nov 2022 08:49:34 +0000 Subject: [PATCH 2/6] Changes the duplicate feed to use the data_quality utils email function --- plugins/data_quality/checks/find_duplicates.py | 3 +-- plugins/data_quality/checks/utils.py | 16 ---------------- 2 files changed, 1 insertion(+), 18 deletions(-) delete mode 100644 plugins/data_quality/checks/utils.py diff --git a/plugins/data_quality/checks/find_duplicates.py b/plugins/data_quality/checks/find_duplicates.py index 409c8ad5f..ad82130b6 100644 --- a/plugins/data_quality/checks/find_duplicates.py +++ b/plugins/data_quality/checks/find_duplicates.py @@ -3,8 +3,7 @@ from opal.core import subrecords from opal.models import Patient from elcid.models import Demographics -from . import utils -from plugins.data_quality import logger +from plugins.data_quality import logger, utils SUBRECORDS_TO_IGNORE = set([ diff --git a/plugins/data_quality/checks/utils.py b/plugins/data_quality/checks/utils.py deleted file mode 100644 index 56bca05c4..000000000 --- a/plugins/data_quality/checks/utils.py +++ /dev/null @@ -1,16 +0,0 @@ -from django.core.mail import send_mail -from django.template.loader import render_to_string -from django.utils.html import strip_tags -from django.conf import settings - - -def send_email(title, template_name, context): - html_message = render_to_string(template_name, context) - plain_message = strip_tags(html_message) - send_mail( - title, - plain_message, - settings.DEFAULT_FROM_EMAIL, - settings.ADMINS, - html_message=html_message, - ) From 56eeda771780c230cd048e6b298d100c3755f263 Mon Sep 17 00:00:00 2001 From: fredkingham Date: Wed, 16 Nov 2022 09:01:29 +0000 Subject: [PATCH 3/6] Adds unit tests and fixes for the find duplicates check --- .../data_quality/checks/find_duplicates.py | 18 +- .../tests/test_find_duplicates.py | 177 ++++++++++++++++++ 2 files changed, 188 insertions(+), 7 deletions(-) create mode 100644 plugins/data_quality/tests/test_find_duplicates.py diff --git a/plugins/data_quality/checks/find_duplicates.py b/plugins/data_quality/checks/find_duplicates.py index ad82130b6..1f2fa0b6a 100644 --- a/plugins/data_quality/checks/find_duplicates.py +++ b/plugins/data_quality/checks/find_duplicates.py @@ -43,8 +43,7 @@ def find_exact_duplicates(): # something unexpected has happened. raise ValueError(f'We have found {len(patients)} for hn {duplicate_hn}') dup_patients.append(patients) - cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable() - cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable() + cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable(dup_patients) email_title = f'{len(cleanable_duplicates) + len (uncleanable_duplicates)} Exact hospital number duplicates' email_context = { "title": email_title, @@ -75,15 +74,18 @@ def find_zero_leading_duplicates(): without_zero_patients = list(Patient.objects.filter( demographics__hospital_number=without_zero )) - if len(with_zero_patients) > 1 or len(without_zero_patient) > 1: + if len(with_zero_patients) > 1 or len(without_zero_patients) > 1: # we expect these patients to be covered by the find_exact_duplicates # check continue - if without_zero_patients == 0: + if len(without_zero_patients) == 0: # we have no patients with the zero stripped so lets continue continue - dup_patients.append(with_zero[0], without_zero_patients[0]) - cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable() + dup_patients.append((with_zero_patients[0], without_zero_patients[0],)) + if len(dup_patients) == 0: + # if there are no duplicate patients, return + return + cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable(dup_patients) email_title = f'{len(cleanable_duplicates) + len (uncleanable_duplicates)} Exact hospital number duplicates' email_context = { "title": email_title, @@ -141,10 +143,12 @@ def calculate_deletable_undeletable(patient_tuples): for patients in patient_tuples: patient_1 = patients[0] patient_1_native_records = has_records(patient_1) + patient_1_hn = patient_1.demographics_set.get().hospital_number patient_2 = patients[1] patient_2_native_records = has_records(patient_2) + patient_2_hn = patient_2.demographics_set.get().hospital_number if len(patient_1_native_records) and len(patient_2_native_records): - logger.info(f'====== Unable to merge hn {duplicate_hn} as they both have duplicate records') + logger.info(f'====== Unable to merge hn {patient_1_hn}/{patient_2_hn} as they both have duplicate records') logger.info(f'=== Patient 1 ({patient_1.id}) has:') for record in patient_1_native_records: logger.info(f'{record.__class__.__name__} {record.id}') diff --git a/plugins/data_quality/tests/test_find_duplicates.py b/plugins/data_quality/tests/test_find_duplicates.py new file mode 100644 index 000000000..6adffa9d2 --- /dev/null +++ b/plugins/data_quality/tests/test_find_duplicates.py @@ -0,0 +1,177 @@ +from unittest import mock +from opal.core.test import OpalTestCase +from plugins.data_quality.checks import find_duplicates + + +@mock.patch('plugins.data_quality.utils.send_email') +class FindExactDuplicatesTestCase(OpalTestCase): + def test_finds_exact_cleanable_duplicates(self, send_email): + """ + Cleanable duplicates are when we can delete one without issue. + + In this case only patient 1 has a subrecord so + patient 2 can be deleted. + """ + patient_1, episode = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode.diagnosis_set.create() + diagnosis.condition = "Cough" + diagnosis.save() + patient_2, _ = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='111111', + ) + find_duplicates.find_exact_duplicates() + call_args = send_email.call_args[0] + self.assertEqual( + call_args[0], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["title"], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [] + ) + self.assertEqual( + call_args[2]["cleanable_duplicates"], [(patient_2, patient_1,)] + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [] + ) + + def test_finds_exact_uncleanable_duplicates(self, send_email): + """ + Cleanable duplicates are when we can delete one without issue. + + In this case only patient 1 has a subrecord so + patient 2 can be deleted. + """ + patient_1, episode_1 = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode_1.diagnosis_set.create() + diagnosis.condition = "Cough" + diagnosis.save() + patient_2, episode_2 = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode_2.diagnosis_set.create() + diagnosis.condition = "Fever" + diagnosis.save() + find_duplicates.find_exact_duplicates() + call_args = send_email.call_args[0] + self.assertEqual( + call_args[0], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["title"], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["cleanable_duplicates"], [] + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [(patient_1, patient_2,)] + ) + + def test_does_not_find_exact_duplicates(self, send_email): + patient_1, _ = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + patient_2, _ = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='22222222', + ) + find_duplicates.find_exact_duplicates() + self.assertFalse(send_email.called) + + +@mock.patch('plugins.data_quality.utils.send_email') +class FindLeadingZeroDuplicatesTestCase(OpalTestCase): + def test_finds_leading_zero_cleanable_duplicates(self, send_email): + """ + Cleanable duplicates are when we can delete one without issue. + + In this case only patient 1 has a subrecord so + patient 2 can be deleted. + """ + patient_1, episode = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode.diagnosis_set.create() + diagnosis.condition = "Cough" + diagnosis.save() + patient_2, _ = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='0111111', + ) + find_duplicates.find_zero_leading_duplicates() + call_args = send_email.call_args[0] + self.assertEqual( + call_args[0], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["title"], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [] + ) + self.assertEqual( + call_args[2]["cleanable_duplicates"], [(patient_2, patient_1,)] + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [] + ) + + def test_finds_leading_zero_uncleanable_duplicates(self, send_email): + """ + Cleanable duplicates are when we can delete one without issue. + + In this case only patient 1 has a subrecord so + patient 2 can be deleted. + """ + patient_1, episode_1 = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode_1.diagnosis_set.create() + diagnosis.condition = "Cough" + diagnosis.save() + patient_2, episode_2 = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='0111111', + ) + diagnosis = episode_2.diagnosis_set.create() + diagnosis.condition = "Fever" + diagnosis.save() + find_duplicates.find_zero_leading_duplicates() + call_args = send_email.call_args[0] + self.assertEqual( + call_args[0], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["title"], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["cleanable_duplicates"], [] + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [(patient_2,patient_1,)] + ) + + def test_does_not_find_leading_zero_duplicates(self, send_email): + patient_1, _ = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + patient_2, _ = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='022222222', + ) + find_duplicates.find_zero_leading_duplicates() + self.assertFalse(send_email.called) From e7c7e9697e74c48dea4db6203e0c2d992d9a15c4 Mon Sep 17 00:00:00 2001 From: fredkingham Date: Wed, 16 Nov 2022 15:57:30 +0000 Subject: [PATCH 4/6] Adds in the ICU Handover model as there is a bug in the loader that means it does not handle multiple hns from the same patient --- plugins/data_quality/checks/find_duplicates.py | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/data_quality/checks/find_duplicates.py b/plugins/data_quality/checks/find_duplicates.py index 1f2fa0b6a..3fb7e28a2 100644 --- a/plugins/data_quality/checks/find_duplicates.py +++ b/plugins/data_quality/checks/find_duplicates.py @@ -13,6 +13,7 @@ 'NextOfKinDetails', 'GPDetails', 'ExternalDemographics', + 'ICUHandover' ]) From a712732e80eb81a99c6b3a49bea680c4ad714f39 Mon Sep 17 00:00:00 2001 From: fredkingham Date: Thu, 17 Nov 2022 15:18:15 +0000 Subject: [PATCH 5/6] Changes the duplicate patients report to show urls in the table --- plugins/data_quality/checks/find_duplicates.py | 6 ++++-- .../data_quality/templates/emails/duplicate_patients.html | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/plugins/data_quality/checks/find_duplicates.py b/plugins/data_quality/checks/find_duplicates.py index 3fb7e28a2..f11a5c7fc 100644 --- a/plugins/data_quality/checks/find_duplicates.py +++ b/plugins/data_quality/checks/find_duplicates.py @@ -49,7 +49,8 @@ def find_exact_duplicates(): email_context = { "title": email_title, "cleanable_duplicates": cleanable_duplicates, - "uncleanable_duplicates": uncleanable_duplicates + "uncleanable_duplicates": uncleanable_duplicates, + "host": "http://192.168.21.203", } utils.send_email( email_title, @@ -91,7 +92,8 @@ def find_zero_leading_duplicates(): email_context = { "title": email_title, "cleanable_duplicates": cleanable_duplicates, - "uncleanable_duplicates": uncleanable_duplicates + "uncleanable_duplicates": uncleanable_duplicates, + "host": "http://192.168.21.203", } utils.send_email( email_title, diff --git a/plugins/data_quality/templates/emails/duplicate_patients.html b/plugins/data_quality/templates/emails/duplicate_patients.html index 4f7c59995..3fa1b69a3 100644 --- a/plugins/data_quality/templates/emails/duplicate_patients.html +++ b/plugins/data_quality/templates/emails/duplicate_patients.html @@ -6,8 +6,8 @@

Cleanable patients

Patient to deletePatient to keep {% for patient_to_delete, patient_to_keep in cleanable_duplicates %} - {{ patient_to_delete }} - {{ patient_to_keep }} + {{ host }}/#/patient/{{ patient_to_delete.id }} + {{ host }}/#/patient/{{ patient_to_keep.id }} {% endfor %} @@ -17,8 +17,8 @@

Unmergable patients

Patient 1Patient 2 {% for patient_1, patient_2 in uncleanable_duplicates %} - {{ patient_1 }} - {{ patient_2 }} + {{ host }}/#/patient/{{ patient_1.id }} + {{ host }}/#/patient/{{ patient_2.id }} {% endfor %} From 3c0dd1702820bee49c982bebd836da0bd013b68f Mon Sep 17 00:00:00 2001 From: fredkingham Date: Thu, 17 Nov 2022 15:24:39 +0000 Subject: [PATCH 6/6] Fixes title in the leading zeros email to be accruate --- plugins/data_quality/checks/find_duplicates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/data_quality/checks/find_duplicates.py b/plugins/data_quality/checks/find_duplicates.py index f11a5c7fc..36597e60a 100644 --- a/plugins/data_quality/checks/find_duplicates.py +++ b/plugins/data_quality/checks/find_duplicates.py @@ -88,7 +88,7 @@ def find_zero_leading_duplicates(): # if there are no duplicate patients, return return cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable(dup_patients) - email_title = f'{len(cleanable_duplicates) + len (uncleanable_duplicates)} Exact hospital number duplicates' + email_title = f'{len(cleanable_duplicates) + len (uncleanable_duplicates)} hospital number duplicates where there are leading zeros' email_context = { "title": email_title, "cleanable_duplicates": cleanable_duplicates,