Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tests for pseudonymisation #214

Merged
merged 15 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r pixl_dcmd/src/requirements.txt
pip install -e pixl_core

- name: Run tests
working-directory: pixl_dcmd
Expand Down
2 changes: 1 addition & 1 deletion pixl_core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ authors = [
]
description = ""
readme = "README.md"
requires-python = ">=3.10"
requires-python = ">=3.9"
classifiers = [
"Programming Language :: Python :: 3"
]
Expand Down
2 changes: 1 addition & 1 deletion pixl_dcmd/bin/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ BIN_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PACKAGE_DIR="${BIN_DIR%/*}"
cd "$PACKAGE_DIR"

ENV=test pytest src/pixl_dcmd/tests
pytest src/pixl_dcmd/tests
17 changes: 7 additions & 10 deletions pixl_dcmd/src/pixl_dcmd/_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,19 @@
# limitations under the License.

"""Interaction with the PIXL database."""
from decouple import config

from core.database import Image
from sqlalchemy import URL, create_engine
from sqlalchemy.orm import sessionmaker

from pixl_dcmd._config import cli_config

connection_config = cli_config["postgres"]

url = URL.create(
drivername="postgresql+psycopg2",
username=connection_config["username"],
password=connection_config["password"],
host=connection_config["host"],
port=connection_config["port"],
database=connection_config["database"],
username=config("PIXL_DB_USER", default="None"),
password=config("PIXL_DB_PASSWORD", default="None"),
host=config("PIXL_DB_HOST", default="None"),
port=config("PIXL_DB_PORT", default=1),
database=config("PIXL_DB_NAME", default="None"),
)

engine = create_engine(url)
Expand Down Expand Up @@ -61,7 +58,7 @@ def query_db(mrn: str, accession_number: str) -> Image:
.filter(
Image.accession_number == accession_number,
Image.mrn == mrn,
Image.exported_at is None,
Image.exported_at == None, # noqa: E711
)
.one()
)
Expand Down
56 changes: 56 additions & 0 deletions pixl_dcmd/src/pixl_dcmd/_datetime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright (c) 2022 University College London Hospitals NHS Foundation Trust
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Datetime helper functions."""
import logging
from random import randint
from typing import Any

import arrow


def combine_date_time(a_date: str, a_time: str) -> Any:
    """
    Turn a date string and a time string into an arrow object.

    The two strings are joined with a single space and handed to
    ``arrow.get`` with the timezone set to Europe/London.

    If arrow raises ``ParserError`` the failure is logged (with traceback)
    and a substitute value is returned instead: the Unix epoch shifted
    forward by a random number of seconds between 10**2 and 10**7, which
    keeps the result inside 1970.

    :param a_date: date part of the timestamp
    :param a_time: time part of the timestamp
    :returns: the parsed arrow object, or the random 1970 fallback
    """
    date_time_str = f"{a_date} {a_time}"

    # TODO: Should Timezone be hardcoded?
    # https://github.com/UCLH-Foundry/PIXL/issues/151
    tz = "Europe/London"

    try:
        return arrow.get(date_time_str, tzinfo=tz)
    except arrow.parser.ParserError:
        # Lazy %-formatting; the separator between the quoted value and
        # "falling back" was missing in the previous message.
        logging.exception(
            "Failed to parse the datetime string '%s', "
            "falling back to a random time in 1970",
            date_time_str,
        )

    # Only reached when parsing failed.
    epoch = arrow.get("1970-01-01T00:00:00+00:00")
    return epoch.shift(seconds=randint(10**2, 10**7))


def format_date_time(a_date_time: str) -> Any:
    """
    Turn a combined date-time string into an arrow object.

    The input is normalised before parsing:

    * if it contains no ``.``, ``.000000`` is appended;
    * if the character at index 8 is not a space, one is inserted there
      so the date and time parts are separated.

    The normalised string is checked against the layout
    ``YYYYMMDD HHmmss.SSSSSS``, parsed once, split back into its date and
    time parts and passed to ``combine_date_time``.

    :param a_date_time: date-time string, e.g. ``20220101120000``
    :returns: whatever ``combine_date_time`` returns for the two parts
    :raises IndexError: if the (padded) input is shorter than 9 characters
    """
    if "." not in a_date_time:
        a_date_time += ".000000"

    if a_date_time[8] != " ":
        a_date_time = a_date_time[0:8] + " " + a_date_time[8:]

    # Validation only: the result is discarded.
    # NOTE(review): arrow.get is expected to raise on a layout mismatch
    # rather than return a falsy value - confirm against the arrow docs.
    arrow.get(a_date_time, "YYYYMMDD HHmmss.SSSSSS")

    # Parse once and reuse the object for both parts, so the date and
    # time strings are always bound before being combined.
    parsed = arrow.get(a_date_time)
    a_date = parsed.format("YYYYMMDD")
    a_time = parsed.format("HHmmss.SSSSSS")

    return combine_date_time(a_date, a_time)
70 changes: 70 additions & 0 deletions pixl_dcmd/src/pixl_dcmd/_deid_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Helper functions for de-identification."""
import hashlib
import logging
import re


def get_encrypted_uid(uid: str, salt: bytes) -> str:
    """
    Hash the suffix of a DICOM UID with the given salt.

    The first four dot-separated components (the prefix) are kept as they
    are. Every remaining component is sha512-hashed together with the
    salt, the hex digest is reduced to its decimal digits, and the result
    is truncated to the length of the original component. The output is
    therefore never longer than the input UID.
    No leading zeros are permitted in a subcomponent unless the
    subcomponent has a length of 1.

    Illustrative example (the actual digits depend on the salt):

    Original UID:  1.2.124.113532.10.122.1.203.20051130.122937.2950157
    Encrypted UID: 1.2.124.113532.74.696.4.703.80155569.949794.5833842

    Encrypting the UIDs this way ensures that no time information remains
    but that an input UID will always result in the same output UID, for a
    given salt.

    Note that while no application should ever rely on the structure of a
    UID, there is a possibility that, were the anonymised data to be
    pushed to the originating scanner (or scanner type), the data may not
    be recognised.

    :param uid: dot-separated UID string
    :param salt: bytes mixed into the hash of every suffix component
    :returns: the UID with its suffix components replaced; a UID with four
        or fewer components is returned unchanged (no trailing dot)
    """
    uid_elements = uid.split(".")

    prefix_elements = uid_elements[:4]
    suffix_elements = uid_elements[4:]
    logging.debug("\t\tPrefix: %s", ".".join(prefix_elements))
    logging.debug("\t\tSuffix: %s", ".".join(suffix_elements))

    enc_elements = []
    for item in suffix_elements:
        h = hashlib.sha512()
        h.update(item.encode("utf-8"))  # Add subcomponent.
        h.update(salt)  # Apply salt.

        # Keep only the decimal digits of the hex digest.
        digits = re.sub("[^0-9]", "", h.hexdigest())

        # A single-digit subcomponent may be "0"; longer ones must not
        # start with a zero.
        if len(item) != 1:
            digits = digits.lstrip("0")

        enc_elements.append(digits[: len(item)])

    # Joining both parts in one go avoids emitting a trailing "." when
    # there is no suffix to encrypt.
    return ".".join(prefix_elements + enc_elements)


def get_bounded_age(age: str) -> str:
    """
    Bound a patient age string between 18 and 89 years.

    The value is expected to look like ``045Y``: three digits followed by
    the unit ``Y`` at index 3. Anything that cannot be read that way (too
    short, a unit other than ``Y``, a non-numeric count) is mapped to the
    lower bound ``018Y`` instead of raising.

    :param age: age string, e.g. ``045Y``
    :returns: ``018Y`` for ages below 18 or unreadable input, ``089Y`` for
        ages above 89, otherwise the input unchanged
    """
    minimum = "018Y"
    maximum = "089Y"

    # Too short to carry a unit at index 3, or not expressed in years.
    if len(age) < 4 or age[3] != "Y":
        return minimum

    try:
        age_as_int = int(age[0:3])
    except ValueError:
        # Non-numeric count: treat like any other unreadable age.
        return minimum

    if age_as_int < 18:
        return minimum

    if age_as_int > 89:
        return maximum

    return age
110 changes: 3 additions & 107 deletions pixl_dcmd/src/pixl_dcmd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,17 @@
# limitations under the License.
from __future__ import annotations

import hashlib
import logging
import re
from io import BytesIO
from os import PathLike
from random import randint
from typing import Any, BinaryIO, Union

import arrow
import requests
from decouple import config
from pydicom import Dataset, dcmwrite
from pixl_dcmd._database import insert_new_uid_into_db_entity, query_db
from pixl_dcmd._deid_helpers import get_bounded_age, get_encrypted_uid
from pixl_dcmd._datetime import combine_date_time, format_date_time

DicomDataSetType = Union[Union[str, bytes, PathLike[Any]], BinaryIO]

Expand Down Expand Up @@ -75,108 +73,6 @@ def remove_overlays(dataset: Dataset) -> Dataset:
return dataset


def get_encrypted_uid(uid: str, salt: bytes) -> str:
    """
    Hash the suffix of a DICOM UID with the given salt.

    The first four dot-separated components (the prefix) are kept as they
    are. Every remaining component is sha512-hashed together with the
    salt, the hex digest is reduced to its decimal digits, and the result
    is truncated to the length of the original component. The output is
    therefore never longer than the input UID.
    No leading zeros are permitted in a subcomponent unless the
    subcomponent has a length of 1.

    Illustrative example (the actual digits depend on the salt):

    Original UID:  1.2.124.113532.10.122.1.203.20051130.122937.2950157
    Encrypted UID: 1.2.124.113532.74.696.4.703.80155569.949794.5833842

    Encrypting the UIDs this way ensures that no time information remains
    but that an input UID will always result in the same output UID, for a
    given salt.

    Note that while no application should ever rely on the structure of a
    UID, there is a possibility that, were the anonymised data to be
    pushed to the originating scanner (or scanner type), the data may not
    be recognised.

    :param uid: dot-separated UID string
    :param salt: bytes mixed into the hash of every suffix component
    :returns: the UID with its suffix components replaced; a UID with four
        or fewer components is returned unchanged (no trailing dot)
    """
    uid_elements = uid.split(".")

    prefix_elements = uid_elements[:4]
    suffix_elements = uid_elements[4:]
    logging.debug("\t\tPrefix: %s", ".".join(prefix_elements))
    logging.debug("\t\tSuffix: %s", ".".join(suffix_elements))

    enc_elements = []
    for item in suffix_elements:
        h = hashlib.sha512()
        h.update(item.encode("utf-8"))  # Add subcomponent.
        h.update(salt)  # Apply salt.

        # Keep only the decimal digits of the hex digest.
        digits = re.sub("[^0-9]", "", h.hexdigest())

        # A single-digit subcomponent may be "0"; longer ones must not
        # start with a zero.
        if len(item) != 1:
            digits = digits.lstrip("0")

        enc_elements.append(digits[: len(item)])

    # Joining both parts in one go avoids emitting a trailing "." when
    # there is no suffix to encrypt.
    return ".".join(prefix_elements + enc_elements)


def get_bounded_age(age: str) -> str:
    """
    Bound a patient age string between 18 and 89 years.

    The value is expected to look like ``045Y``: three digits followed by
    the unit ``Y`` at index 3. Anything that cannot be read that way (too
    short, a unit other than ``Y``, a non-numeric count) is mapped to the
    lower bound ``018Y`` instead of raising.

    :param age: age string, e.g. ``045Y``
    :returns: ``018Y`` for ages below 18 or unreadable input, ``089Y`` for
        ages above 89, otherwise the input unchanged
    """
    minimum = "018Y"
    maximum = "089Y"

    # Too short to carry a unit at index 3, or not expressed in years.
    if len(age) < 4 or age[3] != "Y":
        return minimum

    try:
        age_as_int = int(age[0:3])
    except ValueError:
        # Non-numeric count: treat like any other unreadable age.
        return minimum

    if age_as_int < 18:
        return minimum

    if age_as_int > 89:
        return maximum

    return age


def combine_date_time(a_date: str, a_time: str) -> Any:
    """
    Turn a date string and a time string into an arrow object.

    The two strings are joined with a single space and handed to
    ``arrow.get`` with the timezone set to Europe/London.

    If arrow raises ``ParserError`` the failure is logged (with traceback)
    and a substitute value is returned instead: the Unix epoch shifted
    forward by a random number of seconds between 10**2 and 10**7, which
    keeps the result inside 1970.

    :param a_date: date part of the timestamp
    :param a_time: time part of the timestamp
    :returns: the parsed arrow object, or the random 1970 fallback
    """
    date_time_str = f"{a_date} {a_time}"

    # TODO: Should Timezone be hardcoded?
    # https://github.com/UCLH-Foundry/PIXL/issues/151
    tz = "Europe/London"

    try:
        return arrow.get(date_time_str, tzinfo=tz)
    except arrow.parser.ParserError:
        # Lazy %-formatting; the separator between the quoted value and
        # "falling back" was missing in the previous message.
        logging.exception(
            "Failed to parse the datetime string '%s', "
            "falling back to a random time in 1970",
            date_time_str,
        )

    # Only reached when parsing failed.
    epoch = arrow.get("1970-01-01T00:00:00+00:00")
    return epoch.shift(seconds=randint(10**2, 10**7))


def format_date_time(a_date_time: str) -> Any:
    """
    Turn a combined date-time string into an arrow object.

    The input is normalised before parsing:

    * if it contains no ``.``, ``.000000`` is appended;
    * if the character at index 8 is not a space, one is inserted there
      so the date and time parts are separated.

    The normalised string is checked against the layout
    ``YYYYMMDD HHmmss.SSSSSS``, parsed once, split back into its date and
    time parts and passed to ``combine_date_time``.

    :param a_date_time: date-time string, e.g. ``20220101120000``
    :returns: whatever ``combine_date_time`` returns for the two parts
    :raises IndexError: if the (padded) input is shorter than 9 characters
    """
    if "." not in a_date_time:
        a_date_time += ".000000"

    if a_date_time[8] != " ":
        a_date_time = a_date_time[0:8] + " " + a_date_time[8:]

    # Validation only: the result is discarded.
    # NOTE(review): arrow.get is expected to raise on a layout mismatch
    # rather than return a falsy value - confirm against the arrow docs.
    arrow.get(a_date_time, "YYYYMMDD HHmmss.SSSSSS")

    # Parse once and reuse the object for both parts, so the date and
    # time strings are always bound before being combined.
    parsed = arrow.get(a_date_time)
    a_date = parsed.format("YYYYMMDD")
    a_time = parsed.format("HHmmss.SSSSSS")

    return combine_date_time(a_date, a_time)


def enforce_whitelist(dataset: dict, tags: dict) -> dict:
"""Delete any tags not in the tagging scheme."""
# For every element:
Expand Down Expand Up @@ -215,7 +111,7 @@ def apply_tag_scheme(dataset: dict, tags: dict) -> dict:
mrn = dataset[0x0010, 0x0020].value # Patient ID
accession_number = dataset[0x0008, 0x0050].value # Accession Number

salt_plaintext = mrn + accession_number
salt_plaintext = config("SALT_VALUE")

HASHER_API_AZ_NAME = config("HASHER_API_AZ_NAME")
HASHER_API_PORT = config("HASHER_API_PORT")
Expand Down
12 changes: 6 additions & 6 deletions pixl_dcmd/src/pixl_dcmd/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,16 @@ def rows_in_session(db_session) -> Session:
extract = Extract(slug="i-am-a-project")

image_exported = Image(
accession_number="123",
accession_number="AA12345601",
study_date=STUDY_DATE,
mrn="mrn",
mrn="987654321",
extract=extract,
exported_at=datetime.datetime.now(tz=UTC),
)
image_not_exported = Image(
accession_number="234",
accession_number="AA12345605",
study_date=STUDY_DATE,
mrn="mrn",
mrn="987654321",
extract=extract,
)
with db_session:
Expand Down Expand Up @@ -68,15 +68,15 @@ def db_engine(monkeymodule) -> Engine:
:returns Engine: Engine for use in other setup fixtures
"""
# SQLite doesnt support schemas, so remove pixl schema from engine options
execution_options = {"schema_translate_map": {"pixl": None}}
execution_options = {"schema_translate_map": {"pipeline": None}}
engine = create_engine(
"sqlite:///:memory:",
execution_options=execution_options,
echo=True,
echo_pool="debug",
future=True,
)
monkeymodule.setattr("pixl_cli._database.engine", engine)
monkeymodule.setattr("pixl_dcmd._database.engine", engine)

Base.metadata.create_all(engine)
yield engine
Expand Down
Loading
Loading