diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 39eb13f05..66f466865 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -112,6 +112,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r pixl_dcmd/src/requirements.txt + pip install -e pixl_core - name: Run tests working-directory: pixl_dcmd diff --git a/pixl_core/pyproject.toml b/pixl_core/pyproject.toml index 6c9e10dd6..e5dc852a6 100644 --- a/pixl_core/pyproject.toml +++ b/pixl_core/pyproject.toml @@ -6,7 +6,7 @@ authors = [ ] description = "" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3" ] diff --git a/pixl_dcmd/bin/run-tests.sh b/pixl_dcmd/bin/run-tests.sh index bf4a51789..024afc408 100755 --- a/pixl_dcmd/bin/run-tests.sh +++ b/pixl_dcmd/bin/run-tests.sh @@ -18,4 +18,4 @@ BIN_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) PACKAGE_DIR="${BIN_DIR%/*}" cd "$PACKAGE_DIR" -ENV=test pytest src/pixl_dcmd/tests +pytest src/pixl_dcmd/tests diff --git a/pixl_dcmd/src/pixl_dcmd/_database.py b/pixl_dcmd/src/pixl_dcmd/_database.py index a9195d1d6..46b6506c1 100644 --- a/pixl_dcmd/src/pixl_dcmd/_database.py +++ b/pixl_dcmd/src/pixl_dcmd/_database.py @@ -13,22 +13,19 @@ # limitations under the License. """Interaction with the PIXL database.""" +from decouple import config from core.database import Image from sqlalchemy import URL, create_engine from sqlalchemy.orm import sessionmaker -from pixl_dcmd._config import cli_config - -connection_config = cli_config["postgres"] - url = URL.create( drivername="postgresql+psycopg2", - username=connection_config["username"], - password=connection_config["password"], - host=connection_config["host"], - port=connection_config["port"], - database=connection_config["database"], + username=config("PIXL_DB_USER", default="None"), + password=config("PIXL_DB_PASSWORD", default="None"), + host=config("PIXL_DB_HOST", default="None"), + port=config("PIXL_DB_PORT", default=1), + database=config("PIXL_DB_NAME", default="None"), ) engine = create_engine(url) @@ -61,7 +58,7 @@ def query_db(mrn: str, accession_number: str) -> Image: .filter( Image.accession_number == accession_number, Image.mrn == mrn, - Image.exported_at is None, + Image.exported_at == None, # noqa: E711 ) .one() ) diff --git a/pixl_dcmd/src/pixl_dcmd/_datetime.py b/pixl_dcmd/src/pixl_dcmd/_datetime.py new file mode 100644 index 000000000..7d6110b54 --- /dev/null +++ b/pixl_dcmd/src/pixl_dcmd/_datetime.py @@ -0,0 +1,56 @@ +# Copyright (c) 2022 University College London Hospitals NHS Foundation Trust +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Datetime helper functions.""" +import logging +from random import randint +from typing import Any + +import arrow + + +def combine_date_time(a_date: str, a_time: str) -> Any: + """Turn date string and time string into arrow object.""" + date_time_str = f"{a_date} {a_time}" + + # TODO: Should Timezone be hardcoded? + # https://github.com/UCLH-Foundry/PIXL/issues/151 + tz = "Europe/London" + + try: + new_date_time = arrow.get(date_time_str, tzinfo=tz) + except arrow.parser.ParserError: + logging.exception( + f"Failed to parse the datetime string '{date_time_str}'" + f"falling back to a random time in 1970" + ) + new_date_time = arrow.get("1970-01-01T00:00:00+00:00") + new_date_time = new_date_time.shift(seconds=randint(10**2, 10**7)) + + return new_date_time + + +def format_date_time(a_date_time: str) -> Any: + """Turn date-time string into arrow object.""" + if "." not in a_date_time: + a_date_time += ".000000" + + if a_date_time[8] != " ": + a_date_time = a_date_time[0:8] + " " + a_date_time[8:] + + if arrow.get(a_date_time, "YYYYMMDD HHmmss.SSSSSS"): + a_date = "{s}".format(s=arrow.get(a_date_time).format("YYYYMMDD")) + a_time = "{s}".format(s=arrow.get(a_date_time).format("HHmmss.SSSSSS")) + + return combine_date_time(a_date, a_time) diff --git a/pixl_dcmd/src/pixl_dcmd/_deid_helpers.py b/pixl_dcmd/src/pixl_dcmd/_deid_helpers.py new file mode 100644 index 000000000..9c1316f71 --- /dev/null +++ b/pixl_dcmd/src/pixl_dcmd/_deid_helpers.py @@ -0,0 +1,70 @@ +"""Helper functions for de-identification.""" +import hashlib +import logging +import re + + +def get_encrypted_uid(uid: str, salt: bytes) -> str: + """ + Hashes the suffix of a DICOM UID with the given salt. + + This function retains the prefix, while sha512-hashing the subcomponents + of the suffix. The number of digits per subcomponent is retained in the + encrypted UID. This also ensures that no UID is greater than 64 chars. + No leading zeros are permitted in a subcomponent unless the subcomponent + has a length of 1. + + Original UID: 1.2.124.113532.10.122.1.203.20051130.122937.2950157 + Encrypted UID: 1.2.124.113532.74.696.4.703.80155569.949794.5833842 + + Encrypting the UIDs this way ensures that no time information remains but + that a input UID will always result in the same output UID, for a given salt. + + Note. that while no application should ever rely on the structure of a UID, + there is a possibility that the were the anonyimised data to be push to the + originating scanner (or scanner type), the data may not be recognised. + """ + uid_elements = uid.split(".") + + prefix = ".".join(uid_elements[:4]) + suffix = ".".join(uid_elements[4:]) + logging.debug(f"\t\tPrefix: {prefix}") + logging.debug(f"\t\tSuffix: {suffix}") + + # Get subcomponents of suffix as array. + suffix_elements = uid_elements[4:] + enc_element = [""] * len(suffix_elements) + + # For each subcomponent of the suffix: + for idx, item in enumerate(suffix_elements): + h = hashlib.sha512() + h.update(item.encode("utf-8")) # Add subcomponent. + h.update(salt) # Apply salt. + + # If subcomponent has a length of one, allow a leading zero, otherwise + # strip leading zeros. + # Regex removes any non-numeric chars. + if len(item) == 1: + enc_element[idx] = re.sub("[^0-9]", "", h.hexdigest())[: len(item)] + else: + enc_element[idx] = re.sub("[^0-9]", "", h.hexdigest()).lstrip("0")[ + : len(item) + ] + + # Return original prefix and encrypted suffix. + return prefix + "." + ".".join(enc_element[:]) + + +def get_bounded_age(age: str) -> str: + """Bounds patient age between 18 and 89""" + if age[3] != "Y": + return "018Y" + + age_as_int = int(age[0:3]) + if age_as_int < 18: + return "018Y" + + if age_as_int > 89: + return "089Y" + + return age diff --git a/pixl_dcmd/src/pixl_dcmd/main.py b/pixl_dcmd/src/pixl_dcmd/main.py index 1e40b198b..33a817626 100644 --- a/pixl_dcmd/src/pixl_dcmd/main.py +++ b/pixl_dcmd/src/pixl_dcmd/main.py @@ -13,19 +13,17 @@ # limitations under the License. from __future__ import annotations -import hashlib import logging -import re from io import BytesIO from os import PathLike -from random import randint from typing import Any, BinaryIO, Union -import arrow import requests from decouple import config from pydicom import Dataset, dcmwrite from pixl_dcmd._database import insert_new_uid_into_db_entity, query_db +from pixl_dcmd._deid_helpers import get_bounded_age, get_encrypted_uid +from pixl_dcmd._datetime import combine_date_time, format_date_time DicomDataSetType = Union[Union[str, bytes, PathLike[Any]], BinaryIO] @@ -75,108 +73,6 @@ def remove_overlays(dataset: Dataset) -> Dataset: return dataset -def get_encrypted_uid(uid: str, salt: bytes) -> str: - """ - Hashes the suffix of a DICOM UID with the given salt. - - This function retains the prefix, while sha512-hashing the subcomponents - of the suffix. The number of digits per subcomponent is retained in the - encrypted UID. This also ensures that no UID is greater than 64 chars. - No leading zeros are permitted in a subcomponent unless the subcomponent - has a length of 1. - - Original UID: 1.2.124.113532.10.122.1.203.20051130.122937.2950157 - Encrypted UID: 1.2.124.113532.74.696.4.703.80155569.949794.5833842 - - Encrypting the UIDs this way ensures that no time information remains but - that a input UID will always result in the same output UID, for a given salt. - - Note. that while no application should ever rely on the structure of a UID, - there is a possibility that the were the anonyimised data to be push to the - originating scanner (or scanner type), the data may not be recognised. - """ - uid_elements = uid.split(".") - - prefix = ".".join(uid_elements[:4]) - suffix = ".".join(uid_elements[4:]) - logging.debug(f"\t\tPrefix: {prefix}") - logging.debug(f"\t\tSuffix: {suffix}") - - # Get subcomponents of suffix as array. - suffix_elements = uid_elements[4:] - enc_element = [""] * len(suffix_elements) - - # For each subcomponent of the suffix: - for idx, item in enumerate(suffix_elements): - h = hashlib.sha512() - h.update(item.encode("utf-8")) # Add subcomponent. - h.update(salt) # Apply salt. - - # If subcomponent has a length of one, allow a leading zero, otherwise - # strip leading zeros. - # Regex removes any non-numeric chars. - if len(item) == 1: - enc_element[idx] = re.sub("[^0-9]", "", h.hexdigest())[: len(item)] - else: - enc_element[idx] = re.sub("[^0-9]", "", h.hexdigest()).lstrip("0")[ - : len(item) - ] - - # Return original prefix and encrypted suffix. - return prefix + "." + ".".join(enc_element[:]) - - -def get_bounded_age(age: str) -> str: - """Bounds patient age between 18 and 89""" - if age[3] != "Y": - return "018Y" - - age_as_int = int(age[0:3]) - if age_as_int < 18: - return "018Y" - - if age_as_int > 89: - return "089Y" - - return age - - -def combine_date_time(a_date: str, a_time: str) -> Any: - """Turn date string and time string into arrow object.""" - date_time_str = f"{a_date} {a_time}" - - # TODO: Should Timezone be hardcoded? - # https://github.com/UCLH-Foundry/PIXL/issues/151 - tz = "Europe/London" - - try: - new_date_time = arrow.get(date_time_str, tzinfo=tz) - except arrow.parser.ParserError: - logging.exception( - f"Failed to parse the datetime string '{date_time_str}'" - f"falling back to a random time in 1970" - ) - new_date_time = arrow.get("1970-01-01T00:00:00+00:00") - new_date_time = new_date_time.shift(seconds=randint(10**2, 10**7)) - - return new_date_time - - -def format_date_time(a_date_time: str) -> Any: - """Turn date-time string into arrow object.""" - if "." not in a_date_time: - a_date_time += ".000000" - - if a_date_time[8] != " ": - a_date_time = a_date_time[0:8] + " " + a_date_time[8:] - - if arrow.get(a_date_time, "YYYYMMDD HHmmss.SSSSSS"): - a_date = "{s}".format(s=arrow.get(a_date_time).format("YYYYMMDD")) - a_time = "{s}".format(s=arrow.get(a_date_time).format("HHmmss.SSSSSS")) - - return combine_date_time(a_date, a_time) - - def enforce_whitelist(dataset: dict, tags: dict) -> dict: """Delete any tags not in the tagging scheme.""" # For every element: @@ -215,7 +111,7 @@ def apply_tag_scheme(dataset: dict, tags: dict) -> dict: mrn = dataset[0x0010, 0x0020].value # Patient ID accession_number = dataset[0x0008, 0x0050].value # Accession Number - salt_plaintext = mrn + accession_number + salt_plaintext = config("SALT_VALUE") HASHER_API_AZ_NAME = config("HASHER_API_AZ_NAME") HASHER_API_PORT = config("HASHER_API_PORT") diff --git a/pixl_dcmd/src/pixl_dcmd/tests/conftest.py b/pixl_dcmd/src/pixl_dcmd/tests/conftest.py index f3ab1ea8b..af0fc018c 100644 --- a/pixl_dcmd/src/pixl_dcmd/tests/conftest.py +++ b/pixl_dcmd/src/pixl_dcmd/tests/conftest.py @@ -31,16 +31,16 @@ def rows_in_session(db_session) -> Session: extract = Extract(slug="i-am-a-project") image_exported = Image( - accession_number="123", + accession_number="AA12345601", study_date=STUDY_DATE, - mrn="mrn", + mrn="987654321", extract=extract, exported_at=datetime.datetime.now(tz=UTC), ) image_not_exported = Image( - accession_number="234", + accession_number="AA12345605", study_date=STUDY_DATE, - mrn="mrn", + mrn="987654321", extract=extract, ) with db_session: @@ -68,7 +68,7 @@ def db_engine(monkeymodule) -> Engine: :returns Engine: Engine for use in other setup fixtures """ # SQLite doesnt support schemas, so remove pixl schema from engine options - execution_options = {"schema_translate_map": {"pixl": None}} + execution_options = {"schema_translate_map": {"pipeline": None}} engine = create_engine( "sqlite:///:memory:", execution_options=execution_options, @@ -76,7 +76,7 @@ def db_engine(monkeymodule) -> Engine: echo_pool="debug", future=True, ) - monkeymodule.setattr("pixl_cli._database.engine", engine) + monkeymodule.setattr("pixl_dcmd._database.engine", engine) Base.metadata.create_all(engine) yield engine diff --git a/pixl_dcmd/src/pixl_dcmd/tests/test_pixl_dcmd.py b/pixl_dcmd/src/pixl_dcmd/tests/test_datetime.py similarity index 51% rename from pixl_dcmd/src/pixl_dcmd/tests/test_pixl_dcmd.py rename to pixl_dcmd/src/pixl_dcmd/tests/test_datetime.py index 0e818719f..5b25b5e34 100644 --- a/pixl_dcmd/src/pixl_dcmd/tests/test_pixl_dcmd.py +++ b/pixl_dcmd/src/pixl_dcmd/tests/test_datetime.py @@ -11,53 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import annotations -import pydicom +"""Test datetime helpers.""" import pytest -from pydicom.data import get_testdata_files -from pathlib import Path -from pixl_dcmd.main import ( - combine_date_time, - format_date_time, - get_bounded_age, - get_encrypted_uid, - remove_overlays, -) - - -def test_encrypt_uid_1() -> None: - """Checks whether UID is hashed with salt=1234567890.""" - test_uid = "1.2.124.113532.10.122.1.203.20051130.122937.2950157" - test_salt = b"1234567890" - expected_uid = "1.2.124.113532.28.570.5.537.30525294.945722.5900125" - assert get_encrypted_uid(test_uid, test_salt) == expected_uid - - -def test_encrypt_uid_2() -> None: - """Checks whether UID is hashed with salt=ABCDEFGHIJ.""" - test_uid = "1.2.124.113532.10.122.1.203.20051130.122937.2950157" - test_salt = b"ABCDEFGHIJ" - expected_uid = "1.2.124.113532.66.684.0.649.78590783.565647.7283900" - assert get_encrypted_uid(test_uid, test_salt) == expected_uid - - -@pytest.mark.parametrize( - ("test_ages", "expected_ages"), - [ - ("005D", "018Y"), - ("010M", "018Y"), - ("017Y", "018Y"), - ("018Y", "018Y"), - ("030Y", "030Y"), - ("089Y", "089Y"), - ("100Y", "089Y"), - ], -) -def test_age_bounding(test_ages: str, expected_ages: str) -> None: - """Checks ages are bounded between 18 >= x <= 89.""" - assert get_bounded_age(test_ages) == expected_ages +from pixl_dcmd._datetime import combine_date_time, format_date_time @pytest.mark.parametrize( @@ -96,18 +54,3 @@ def test_date_time_format(orig_date_time: str, output_date_time: str) -> None: format_date_time(orig_date_time).format("YYYYMMDD HHmmss.SSSSSS") == output_date_time ) - - -def test_remove_overlay_plane() -> None: - """Checks that overlay planes are removed.""" - fpath = get_testdata_files("MR-SIEMENS-DICOM-WithOverlays.dcm")[0] - ds = pydicom.dcmread(fpath) - assert (0x6000, 0x3000) in ds - - ds_minus_overlays = remove_overlays(ds) - assert (0x6000, 0x3000) not in ds_minus_overlays - - -# TODO: def test_anonymisation -# https://github.com/UCLH-Foundry/PIXL/issues/132 -fpath = Path(__file__).parents[4] / "test/resources/Dicom1.dcm" diff --git a/pixl_dcmd/src/pixl_dcmd/tests/test_deid_helpers.py b/pixl_dcmd/src/pixl_dcmd/tests/test_deid_helpers.py new file mode 100644 index 000000000..d1f2168a7 --- /dev/null +++ b/pixl_dcmd/src/pixl_dcmd/tests/test_deid_helpers.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 University College London Hospitals NHS Foundation Trust +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for anonymisation of DICOM data.""" +import pytest + +from pixl_dcmd._deid_helpers import get_bounded_age, get_encrypted_uid + + +def test_encrypt_uid_1() -> None: + """Checks whether UID is hashed with salt=1234567890.""" + test_uid = "1.2.124.113532.10.122.1.203.20051130.122937.2950157" + test_salt = b"1234567890" + expected_uid = "1.2.124.113532.28.570.5.537.30525294.945722.5900125" + assert get_encrypted_uid(test_uid, test_salt) == expected_uid + + +def test_encrypt_uid_2() -> None: + """Checks whether UID is hashed with salt=ABCDEFGHIJ.""" + test_uid = "1.2.124.113532.10.122.1.203.20051130.122937.2950157" + test_salt = b"ABCDEFGHIJ" + expected_uid = "1.2.124.113532.66.684.0.649.78590783.565647.7283900" + assert get_encrypted_uid(test_uid, test_salt) == expected_uid + + +@pytest.mark.parametrize( + ("test_ages", "expected_ages"), + [ + ("005D", "018Y"), + ("010M", "018Y"), + ("017Y", "018Y"), + ("018Y", "018Y"), + ("030Y", "030Y"), + ("089Y", "089Y"), + ("100Y", "089Y"), + ], +) +def test_age_bounding(test_ages: str, expected_ages: str) -> None: + """Checks ages are bounded between 18 >= x <= 89.""" + assert get_bounded_age(test_ages) == expected_ages diff --git a/pixl_dcmd/src/pixl_dcmd/tests/test_main.py b/pixl_dcmd/src/pixl_dcmd/tests/test_main.py new file mode 100644 index 000000000..230c8527b --- /dev/null +++ b/pixl_dcmd/src/pixl_dcmd/tests/test_main.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022 University College London Hospitals NHS Foundation Trust +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import pathlib + +import pydicom +import pytest +import sqlalchemy +import yaml +from pydicom.data import get_testdata_files + +from core.database import Image +from pixl_dcmd.main import ( + apply_tag_scheme, + remove_overlays, +) + + +@pytest.fixture(scope="module") +def tag_scheme() -> dict: + """Read the tag scheme from orthanc raw.""" + tag_file = ( + pathlib.Path(__file__).parents[4] + / "orthanc/orthanc-anon/plugin/tag-operations.yaml" + ) + return yaml.safe_load(tag_file.read_text()) + + +def test_remove_overlay_plane() -> None: + """Checks that overlay planes are removed.""" + fpath = get_testdata_files("MR-SIEMENS-DICOM-WithOverlays.dcm")[0] + ds = pydicom.dcmread(fpath) + assert (0x6000, 0x3000) in ds + + ds_minus_overlays = remove_overlays(ds) + assert (0x6000, 0x3000) not in ds_minus_overlays + + +# TODO: Produce more complete test coverage for anonymisation +# https://github.com/UCLH-Foundry/PIXL/issues/132 +def test_image_already_exported_throws(rows_in_session, tag_scheme): + """ + GIVEN a dicom image which has no un-exported rows in the pipeline database + WHEN the dicom tag scheme is applied + THEN an exception will be thrown as + """ + exported_dicom = pathlib.Path(__file__).parents[4] / "test/resources/Dicom1.dcm" + input_dataset = pydicom.dcmread(exported_dicom) + + with pytest.raises(sqlalchemy.exc.NoResultFound): + apply_tag_scheme(input_dataset, tag_scheme) + + +def test_pseudo_identifier_processing(rows_in_session, tag_scheme): + """ + GIVEN a dicom image that hasn't been exported in the pipeline db + WHEN the dicom tag scheme is applied + THEN the patient identifier tag should be the mrn and accession hashed + and the pipeline db row should now have the fake hash + """ + exported_dicom = pathlib.Path(__file__).parents[4] / "test/resources/Dicom2.dcm" + input_dataset = pydicom.dcmread(exported_dicom) + + accession_number = "AA12345605" + mrn = "987654321" + fake_hash = "-".join(list(f"{mrn}{accession_number}")).encode("utf-8") + + output_dataset = apply_tag_scheme(input_dataset, tag_scheme) + image = ( + rows_in_session.query(Image) + .filter(Image.accession_number == "AA12345605") + .one() + ) + assert output_dataset[0x0010, 0x0020].value == fake_hash + assert image.hashed_identifier == fake_hash diff --git a/pixl_dcmd/src/requirements.txt b/pixl_dcmd/src/requirements.txt index 3addd56ca..7635e3a6e 100644 --- a/pixl_dcmd/src/requirements.txt +++ b/pixl_dcmd/src/requirements.txt @@ -1,9 +1,6 @@ arrow==1.2.3 black==22.8.0 pydicom==2.3.0 -flake8==5.0.4 -flake8-return==1.1.3 -isort==5.10.1 mypy==0.991 pytest==7.1.3 logger==1.4