diff --git a/transforms/README-list.md b/transforms/README-list.md index 3bace1586..beeefd8d3 100644 --- a/transforms/README-list.md +++ b/transforms/README-list.md @@ -41,6 +41,8 @@ Note: This list includes the transforms that were part of the release starting w ## Release notes: +### 1.0.0.a5 + Added Pii Redactor ### 1.0.0.a4 Added missing ray implementation for lang_id, doc_quality, tokenization and filter Added ray notebooks for lang id, Doc Quality, tokenization, and Filter diff --git a/transforms/language/pii_redactor/python/Dockerfile b/transforms/language/pii_redactor/Dockerfile.python similarity index 71% rename from transforms/language/pii_redactor/python/Dockerfile rename to transforms/language/pii_redactor/Dockerfile.python index ae12571d8..4062d5d73 100644 --- a/transforms/language/pii_redactor/python/Dockerfile +++ b/transforms/language/pii_redactor/Dockerfile.python @@ -9,6 +9,7 @@ RUN pip install --no-cache-dir pytest RUN useradd -ms /bin/bash dpk USER dpk WORKDIR /home/dpk + ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries @@ -18,20 +19,9 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root src/ src/ -COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root dpk_pii_redactor/ dpk_pii_redactor/ COPY --chown=dpk:root requirements.txt requirements.txt -RUN pip install --no-cache-dir -e . - -# copy transform main() entry point to the image -COPY ./src/pii_redactor_transform_python.py . 
- -# copy some of the samples in -COPY ./src/pii_redactor_local.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ +RUN pip install -r requirements.txt # Set environment ENV PYTHONPATH /home/dpk diff --git a/transforms/language/pii_redactor/ray/Dockerfile b/transforms/language/pii_redactor/Dockerfile.ray similarity index 59% rename from transforms/language/pii_redactor/ray/Dockerfile rename to transforms/language/pii_redactor/Dockerfile.ray index a8ce793cf..a95ce7cbe 100644 --- a/transforms/language/pii_redactor/ray/Dockerfile +++ b/transforms/language/pii_redactor/Dockerfile.ray @@ -1,4 +1,5 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 + FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images @@ -10,7 +11,7 @@ RUN pip install --upgrade --no-cache-dir pip # install pytest RUN pip install --no-cache-dir pytest -ARG PIP_INSTALL_EXTRA_ARGS +ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). @@ -18,25 +19,12 @@ COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=775 --chown=ray:root python-transform/ python-transform/ -RUN cd python-transform && pip install --no-cache-dir -e . - -#COPY requirements.txt requirements.txt -#RUN pip install --no-cache-dir -r requirements.txt - -COPY --chmod=775 --chown=ray:root src/ src/ -COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml -RUN pip install --no-cache-dir -e . - -# copy the main() entry point to the image -COPY ./src/pii_redactor_transform_ray.py . 
- -# copy some of the samples in -COPY ./src/pii_redactor_local_ray.py local/ +COPY --chown=ray:users dpk_pii_redactor/ dpk_pii_redactor/ +COPY --chown=ray:users requirements.txt requirements.txt +RUN pip install -r requirements.txt -# copy test -COPY test/ test/ -COPY test-data/ test-data/ +# Grant non-root users the necessary permissions to the ray directory +RUN chmod 755 /home/ray # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/language/pii_redactor/Makefile b/transforms/language/pii_redactor/Makefile new file mode 100644 index 000000000..050505a95 --- /dev/null +++ b/transforms/language/pii_redactor/Makefile
@@ -0,0 +1,22 @@
+REPOROOT=../../..
+# Use make help, to see the available rules
+include $(REPOROOT)/transforms/.make.cicd.targets
+
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
+
+################################################################################
+
+
+# The recipe below only prints a notice; nothing is published for this transform.
+# NOTE(review): presumably this overrides a publish rule provided by the included
+# .make.cicd.targets - confirm. Declared phony because it never creates a file.
+.PHONY: publish
+publish:
+	@echo "Skip... do nothing! pushing CI/CD over a cliff with OSError on text_encoder "
diff --git a/transforms/language/pii_redactor/Makefile.disable b/transforms/language/pii_redactor/Makefile.disable deleted file mode 100644 index 8764d0dc2..000000000 --- a/transforms/language/pii_redactor/Makefile.disable +++ /dev/null @@ -1,79 +0,0 @@ -REPOROOT=../../.. 
-# Use make help, to see the available rules -include $(REPOROOT)/.make.defaults - -setup:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse - -clean:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse - -build:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse -venv:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse - -image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -publish:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -set-versions: - @# Help: Recursively $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -kind-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-save-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -.PHONY: workflow-venv -workflow-venv: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-venv; \ - fi - -.PHONY: workflow-test -workflow-test: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-test; \ - fi - -.PHONY: workflow-upload -workflow-upload: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-upload; \ - fi - -.PHONY: workflow-build -workflow-build: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-build; \ - fi - diff --git a/transforms/language/pii_redactor/README.md b/transforms/language/pii_redactor/README.md index a299068d0..7a87b81c7 100644 --- a/transforms/language/pii_redactor/README.md +++ b/transforms/language/pii_redactor/README.md @@ -1,13 +1,105 @@ + + # PII Redactor Transform -* [python](python/README.md) - provides the base 
python-based transformation -implementation. -* [ray](ray/README.md) - enables the running of the base python transformation -in a Ray runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image -in a kubernetes cluster using a generated `yaml` file. - +This transform redacts Personally Identifiable Information (PII) from the input data. + +The transform leverages the [Microsoft Presidio SDK](https://microsoft.github.io/presidio/) for PII detection and uses the Flair recognizer for entity recognition. + + +## Contributors + +- Sowmya.L.R (lrsowmya@gmail.com) + + +### Supported Entities + +The transform detects the following PII entities by default: +- **PERSON**: Names of individuals +- **EMAIL_ADDRESS**: Email addresses +- **ORGANIZATION**: Names of organizations +- **DATE_TIME**: Dates and times +- **PHONE_NUMBER**: Phone numbers +- **CREDIT_CARD**: Credit card numbers + +You can configure which entities are detected by passing them in the **--pii_redactor_entities** argument. +For the full list of supported entity types, see [Entities](https://microsoft.github.io/presidio/supported_entities/). + +### Redaction Techniques + +Two redaction techniques are supported: +- **replace**: Replaces detected PII with a placeholder (default) +- **redact**: Removes the detected PII from the text + +You can choose the redaction technique by passing it in the **--pii_redactor_operator** argument. + +## Input and Output + +### Input + +The input data should be a `pyarrow.Table` with a column containing the text where PII detection and redaction will be applied. By default, this column is named `contents`. 
+ +**Example Input Table Structure:** Table 1: Sample input to the pii redactor transform + +| contents | doc_id | +|---------------------|--------| +| My name is John Doe | doc001 | +| I work at apple | doc002 | + + +### Output + +The output table will include the original columns plus an additional column holding the redacted text (named `new_contents` in the examples below; the name is configurable) and a `detected_pii` +column listing the types of PII entities detected in that document for the replace operator. + +**Example Output Table Structure for replace operator:** + +| contents | doc_id | new_contents | detected_pii | +|---------------------|--------|--------------------------|------------------| +| My name is John Doe | doc001 | My name is `<PERSON>` | `[PERSON]` | +| I work at apple | doc002 | I work at `<ORGANIZATION>` | `[ORGANIZATION]` | + +When the `redact` operator is chosen, the output looks like this: + +**Example Output Table Structure for redact operator** + +| contents | doc_id | new_contents | detected_pii | +|---------------------|--------|--------------------------|------------------| +| My name is John Doe | doc001 | My name is | `[PERSON]` | +| I work at apple | doc002 | I work at | `[ORGANIZATION]` | + +### Launched Command Line Options +The following command line arguments are available in addition to +the options provided by +the [python launcher](../../../data-processing-lib/doc/python-launcher-options.md). + +``` + --pii_redactor_entities PII_ENTITIES + list of PII entities to be captured for example: ["PERSON", "EMAIL"] + --pii_redactor_operator REDACTOR_OPERATOR + Two redaction techniques are supported - replace(default), redact + --pii_redactor_transformed_contents PII_TRANSFORMED_CONTENT_COLUMN_NAME + Mention the column name in which transformed contents will be added. This is required argument. + --pii_redactor_score_threshold SCORE_THRESHOLD + The score_threshold is a parameter that sets the minimum confidence score required for an entity to be considered a match. 
+ Provide a value above 0.6 +``` +## PII Redactor Ray Transform +Please see the set of +[transform project conventions](../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Summary +This project wraps the pii redactor transform with a Ray runtime. + +### Launched Command Line Options +In addition to those available to the transform as defined here, +the set of +[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) are available. + +### Transforming data using the transform image +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/language/pii_redactor/dpk_pii_redactor/__init__.py b/transforms/language/pii_redactor/dpk_pii_redactor/__init__.py new file mode 100644 index 000000000..29621e921 --- /dev/null +++ b/transforms/language/pii_redactor/dpk_pii_redactor/__init__.py @@ -0,0 +1 @@ +from .transform import * diff --git a/transforms/language/pii_redactor/python/src/flair_recognizer.py b/transforms/language/pii_redactor/dpk_pii_redactor/flair_recognizer.py similarity index 100% rename from transforms/language/pii_redactor/python/src/flair_recognizer.py rename to transforms/language/pii_redactor/dpk_pii_redactor/flair_recognizer.py diff --git a/transforms/language/pii_redactor/python/src/pii_redactor_local.py b/transforms/language/pii_redactor/dpk_pii_redactor/local.py similarity index 97% rename from transforms/language/pii_redactor/python/src/pii_redactor_local.py rename to transforms/language/pii_redactor/dpk_pii_redactor/local.py index baa6d3894..d1d8428ce 100644 --- a/transforms/language/pii_redactor/python/src/pii_redactor_local.py +++ b/transforms/language/pii_redactor/dpk_pii_redactor/local.py @@ -12,7 +12,7 @@ import os from 
data_processing.data_access import DataAccessLocal -from pii_redactor_transform import ( +from dpk_pii_redactor.transform import ( PIIRedactorTransform, doc_transformed_contents_key, supported_entities_key, diff --git a/transforms/language/pii_redactor/python/src/pii_redactor_local_python.py b/transforms/language/pii_redactor/dpk_pii_redactor/local_python.py similarity index 90% rename from transforms/language/pii_redactor/python/src/pii_redactor_local_python.py rename to transforms/language/pii_redactor/dpk_pii_redactor/local_python.py index 59eb5accf..16c428bc8 100644 --- a/transforms/language/pii_redactor/python/src/pii_redactor_local_python.py +++ b/transforms/language/pii_redactor/dpk_pii_redactor/local_python.py @@ -15,8 +15,8 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from pii_redactor_transform import doc_transformed_contents_cli_param -from pii_redactor_transform_python import PIIRedactorPythonTransformConfiguration +from dpk_pii_redactor.transform import doc_transformed_contents_cli_param +from dpk_pii_redactor.transform_python import PIIRedactorPythonTransformConfiguration # create parameters diff --git a/transforms/language/pii_redactor/python/src/pii_analyzer.py b/transforms/language/pii_redactor/dpk_pii_redactor/pii_analyzer.py similarity index 98% rename from transforms/language/pii_redactor/python/src/pii_analyzer.py rename to transforms/language/pii_redactor/dpk_pii_redactor/pii_analyzer.py index 894c7ec35..28da6290e 100644 --- a/transforms/language/pii_redactor/python/src/pii_analyzer.py +++ b/transforms/language/pii_redactor/dpk_pii_redactor/pii_analyzer.py @@ -12,7 +12,7 @@ import logging import spacy -from flair_recognizer import FlairRecognizer +from dpk_pii_redactor.flair_recognizer import FlairRecognizer from presidio_analyzer import AnalyzerEngine, RecognizerRegistry from presidio_analyzer.nlp_engine import NlpEngineProvider diff --git 
a/transforms/language/pii_redactor/python/src/pii_anonymizer.py b/transforms/language/pii_redactor/dpk_pii_redactor/pii_anonymizer.py similarity index 100% rename from transforms/language/pii_redactor/python/src/pii_anonymizer.py rename to transforms/language/pii_redactor/dpk_pii_redactor/pii_anonymizer.py diff --git a/transforms/language/pii_redactor/dpk_pii_redactor/ray/__init__.py b/transforms/language/pii_redactor/dpk_pii_redactor/ray/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/transforms/language/pii_redactor/ray/src/pii_redactor_local_ray.py b/transforms/language/pii_redactor/dpk_pii_redactor/ray/local.py similarity index 96% rename from transforms/language/pii_redactor/ray/src/pii_redactor_local_ray.py rename to transforms/language/pii_redactor/dpk_pii_redactor/ray/local.py index fab32f035..b76644c79 100644 --- a/transforms/language/pii_redactor/ray/src/pii_redactor_local_ray.py +++ b/transforms/language/pii_redactor/dpk_pii_redactor/ray/local.py @@ -15,7 +15,7 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from pii_redactor_transform_ray import PIIRedactorRayTransformConfiguration +from dpk_pii_redactor.ray.transform import PIIRedactorRayTransformConfiguration # create parameters diff --git a/transforms/language/pii_redactor/ray/src/pii_redactor_s3_ray.py b/transforms/language/pii_redactor/dpk_pii_redactor/ray/s3.py similarity index 96% rename from transforms/language/pii_redactor/ray/src/pii_redactor_s3_ray.py rename to transforms/language/pii_redactor/dpk_pii_redactor/ray/s3.py index 644f859e9..1e409874a 100644 --- a/transforms/language/pii_redactor/ray/src/pii_redactor_s3_ray.py +++ b/transforms/language/pii_redactor/dpk_pii_redactor/ray/s3.py @@ -15,7 +15,7 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from pii_redactor_transform_ray import PIIRedactorRayTransformConfiguration +from 
dpk_pii_redactor.ray.transform import PIIRedactorRayTransformConfiguration print(os.environ) diff --git a/transforms/language/pii_redactor/ray/src/pii_redactor_transform_ray.py b/transforms/language/pii_redactor/dpk_pii_redactor/ray/transform.py similarity index 56% rename from transforms/language/pii_redactor/ray/src/pii_redactor_transform_ray.py rename to transforms/language/pii_redactor/dpk_pii_redactor/ray/transform.py index 0f4cf4b07..6ecadcbca 100644 --- a/transforms/language/pii_redactor/ray/src/pii_redactor_transform_ray.py +++ b/transforms/language/pii_redactor/dpk_pii_redactor/ray/transform.py @@ -10,21 +10,13 @@ # limitations under the License. ################################################################################ -import time -from argparse import ArgumentParser, Namespace -from typing import Any - -import pyarrow as pa -from data_processing.runtime.pure_python.runtime_configuration import ( - PythonTransformRuntimeConfiguration, -) -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, get_logger +import sys +from data_processing.utils import ParamsUtils, get_logger from data_processing_ray.runtime.ray import RayTransformLauncher from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) -from pii_redactor_transform import PIIRedactorTransformConfiguration +from dpk_pii_redactor.transform import PIIRedactorTransformConfiguration logger = get_logger(__name__)
@@ -43,6 +35,43 @@ def __init__(self):
         super().__init__(transform_config=PIIRedactorTransformConfiguration())
 
 
+class PIIRedactor:
+    """Keyword-argument front end for launching the Ray PII redactor transform.
+
+    Keyword arguments are kept as launcher parameters, except that
+    ``input_folder``/``output_folder`` (when both are given) are replaced by
+    ``data_local_config`` and ``num_cpus``/``memory`` (when both are given)
+    are replaced by ``runtime_worker_options``.
+    """
+
+    def __init__(self, **kwargs):
+        self.params = dict(kwargs)
+        # Fold a key pair only when both keys were supplied; otherwise the
+        # parameters pass through untouched. An explicit membership test
+        # (rather than a bare ``except: pass``) lets real errors from
+        # ``convert_to_ast`` surface instead of being silently swallowed.
+        self._fold(("input_folder", "output_folder"), "data_local_config")
+        self._fold(("num_cpus", "memory"), "runtime_worker_options")
+
+    def _fold(self, keys, target):
+        """Replace the ``keys`` entries of ``self.params`` with one ``target`` entry."""
+        if all(key in self.params for key in keys):
+            group = {key: self.params[key] for key in keys}
+            self.params[target] = ParamsUtils.convert_to_ast(group)
+            for key in keys:
+                del self.params[key]
+
+    def transform(self):
+        """Launch the transform and return the result of ``launcher.launch()``.
+
+        Side effect: ``sys.argv`` is overwritten with the arguments built from
+        ``self.params`` by ``ParamsUtils.dict_to_req``.
+        """
+        sys.argv = ParamsUtils.dict_to_req(d=self.params)
+        launcher = RayTransformLauncher(PIIRedactorRayTransformConfiguration())
+        return launcher.launch()
+
+
 if __name__ == "__main__":
     launcher = RayTransformLauncher(PIIRedactorRayTransformConfiguration())
     logger.info("Launching pii redactor transform")
diff --git a/transforms/language/pii_redactor/python/src/pii_redactor_transform.py b/transforms/language/pii_redactor/dpk_pii_redactor/transform.py similarity index 98% rename from transforms/language/pii_redactor/python/src/pii_redactor_transform.py rename to transforms/language/pii_redactor/dpk_pii_redactor/transform.py index 6c1d1c17f..386d021b7 100644 --- a/transforms/language/pii_redactor/python/src/pii_redactor_transform.py +++ b/transforms/language/pii_redactor/dpk_pii_redactor/transform.py @@ -21,8 +21,8 @@ import pyarrow as pa from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger -from pii_analyzer import PIIAnalyzerEngine -from pii_anonymizer import PIIAnonymizer +from dpk_pii_redactor.pii_analyzer import PIIAnalyzerEngine +from dpk_pii_redactor.pii_anonymizer import PIIAnonymizer short_name = "pii_redactor" diff --git a/transforms/language/pii_redactor/python/src/pii_redactor_transform_python.py b/transforms/language/pii_redactor/dpk_pii_redactor/transform_python.py similarity index 58% rename from transforms/language/pii_redactor/python/src/pii_redactor_transform_python.py rename to transforms/language/pii_redactor/dpk_pii_redactor/transform_python.py index c42f887f8..38c6b7798 100644 --- 
a/transforms/language/pii_redactor/python/src/pii_redactor_transform_python.py +++ b/transforms/language/pii_redactor/dpk_pii_redactor/transform_python.py @@ -8,12 +8,13 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ +import sys from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.runtime.pure_python.runtime_configuration import ( PythonTransformRuntimeConfiguration, ) -from data_processing.utils import get_logger -from pii_redactor_transform import PIIRedactorTransformConfiguration +from data_processing.utils import ParamsUtils, get_logger +from dpk_pii_redactor.transform import PIIRedactorTransformConfiguration log = get_logger(__name__)
@@ -28,6 +29,38 @@ def __init__(self):
         super().__init__(transform_config=PIIRedactorTransformConfiguration())
 
 
+class PIIRedactor:
+    """Keyword-argument front end for launching the pure-Python PII redactor transform.
+
+    Keyword arguments are kept as launcher parameters, except that
+    ``input_folder``/``output_folder`` (when both are given) are replaced by
+    a single ``data_local_config`` entry.
+    """
+
+    def __init__(self, **kwargs):
+        self.params = dict(kwargs)
+        # If input_folder and output_folder are both specified, assume they
+        # represent data_local_config. An explicit membership test (rather than
+        # a bare ``except: pass``) lets real errors from ``convert_to_ast``
+        # surface instead of being silently swallowed.
+        folder_keys = ("input_folder", "output_folder")
+        if all(key in self.params for key in folder_keys):
+            local_conf = {key: self.params[key] for key in folder_keys}
+            self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
+            for key in folder_keys:
+                del self.params[key]
+
+    def transform(self):
+        """Launch the transform and return the result of ``launcher.launch()``.
+
+        Side effect: ``sys.argv`` is overwritten with the arguments built from
+        ``self.params`` by ``ParamsUtils.dict_to_req``.
+        """
+        sys.argv = ParamsUtils.dict_to_req(d=self.params)
+        launcher = PythonTransformLauncher(PIIRedactorPythonTransformConfiguration())
+        return launcher.launch()
+
+
 if __name__ == "__main__":
     launcher = PythonTransformLauncher(PIIRedactorPythonTransformConfiguration())
     log.info("Launching pii redactor transform")
diff --git a/transforms/language/pii_redactor/kfp_ray/Makefile b/transforms/language/pii_redactor/kfp_ray/Makefile index 370f85cb0..858db1b0a 100644 --- a/transforms/language/pii_redactor/kfp_ray/Makefile +++ 
b/transforms/language/pii_redactor/kfp_ray/Makefile @@ -2,10 +2,15 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows -# Include the common configuration for this transform -include ../transform.config +SRC_DIR=${CURDIR}/../ +# Use the docker image that is built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -SRC_DIR=${CURDIR}/../ray/ +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -17,27 +22,6 @@ clean: @# Help: Clean up the virtual environment. rm -rf ${REPOROOT}/transforms/venv -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -test-image:: - -publish:: - -image:: - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: .PHONY: workflow-build workflow-build: workflow-venv @@ -45,10 +29,19 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=pii_redactor_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ done + + + + diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index b01d51fdf..b05aecd69 100644 --- 
a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/pii-redactor-ray:latest" # the name of the job script -EXEC_SCRIPT_NAME: str = "pii_redactor_transform_ray.py" +EXEC_SCRIPT_NAME: str = "-m dpk_pii_redactor.ray.transform" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" diff --git a/transforms/language/pii_redactor/piiredactor-ray.ipynb b/transforms/language/pii_redactor/piiredactor-ray.ipynb new file mode 100644 index 000000000..49f22965c --- /dev/null +++ b/transforms/language/pii_redactor/piiredactor-ray.ipynb @@ -0,0 +1,325 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install \"data-prep-toolkit-transforms[ray,pii_redactor]==1.0.0a5\"" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. 
\n", + "```\n", + " --pii_redactor_entities PII_ENTITIES\n", + " list of PII entities to be captured for example: [\"PERSON\", \"EMAIL\"]\n", + " --pii_redactor_operator REDACTOR_OPERATOR\n", + " Two redaction techniques are supported - replace(default), redact \n", + " --pii_redactor_transformed_contents PII_TRANSFORMED_CONTENT_COLUMN_NAME\n", + " Mention the column name in which transformed contents will be added. This is required argument. \n", + " --pii_redactor_score_threshold SCORE_THRESHOLD\n", + " The score_threshold is a parameter that sets the minimum confidence score required for an entity to be considered a match. Provide a value above 0.6\n", + "```\n", + "#####" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "from dpk_pii_redactor.ray.transform import PIIRedactor\n", + "from data_processing.utils import GB" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters and invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "95737436", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "17:15:38 INFO - pipeline id pipeline_id\n", + "17:15:38 INFO - code location None\n", + "17:15:38 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n", + "17:15:38 INFO - actor creation delay 0\n", + "17:15:38 INFO - job details {'job category': 'preprocessing', 'job name': 'pii_redactor', 'job type': 'ray', 'job id': 'job_id'}\n", + "17:15:38 INFO - data factory data_ is using local data access: input_folder - ray/test-data/input output_folder - output\n", + 
"17:15:38 INFO - data factory data_ max_files -1, n_sample -1\n", + "17:15:38 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "17:15:38 INFO - Running locally\n", + "2025-01-16 17:15:39,562\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "17:16:09 INFO - Completed execution in 0.513 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "PIIRedactor(input_folder='ray/test-data/input',\n", + " output_folder= 'output',\n", + " run_locally= True,\n", + " num_cpus= 0.8,\n", + " memory= 2 * GB,\n", + " runtime_num_workers = 3,\n", + " runtime_creation_delay = 0,\n", + " pii_redactor_entities = [\"PERSON\", \"EMAIL_ADDRESS\"],\n", + " pii_redactor_operator = \"replace\",\n", + " pii_redactor_transformed_contents = \"title\").transform()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['output/metadata.json', 'output/test1.parquet']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contentsdoc_id
0I am Tom Chandlerdoc1
1My website is www.tomchandler.comdoc2
2Contact me at greek@yahoo.comdoc3
\n", + "
" + ], + "text/plain": [ + " contents doc_id\n", + "0 I am Tom Chandler doc1\n", + "1 My website is www.tomchandler.com doc2\n", + "2 Contact me at greek@yahoo.com doc3" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.read_parquet('ray/test-data/input/test1.parquet', engine='pyarrow')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c9a2c725-6596-4ee5-8869-b6ec155153b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
detected_piititlecontentsdoc_id
0[PERSON]I am <PERSON>I am Tom Chandlerdoc1
1[]My website is www.tomchandler.comMy website is www.tomchandler.comdoc2
2[EMAIL_ADDRESS]Contact me at <EMAIL_ADDRESS>Contact me at greek@yahoo.comdoc3
\n", + "
" + ], + "text/plain": [ + " detected_pii title \\\n", + "0 [PERSON] I am \n", + "1 [] My website is www.tomchandler.com \n", + "2 [EMAIL_ADDRESS] Contact me at \n", + "\n", + " contents doc_id \n", + "0 I am Tom Chandler doc1 \n", + "1 My website is www.tomchandler.com doc2 \n", + "2 Contact me at greek@yahoo.com doc3 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_parquet('output/test1.parquet', engine='pyarrow')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ad102df-1038-4b86-9e8e-021de5fe0b8b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/pii_redactor/piiredactor.ipynb b/transforms/language/pii_redactor/piiredactor.ipynb new file mode 100644 index 000000000..8f010039a --- /dev/null +++ b/transforms/language/pii_redactor/piiredactor.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install \"data-prep-toolkit-transforms[pii_redactor]==1.0.0a5\"" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": {}, + "source": [ + "##### **** Configure the transform parameters. \n", + "```\n", + " --pii_redactor_entities PII_ENTITIES\n", + " list of PII entities to be captured, for example: [\"PERSON\", \"EMAIL_ADDRESS\"]\n", + " --pii_redactor_operator REDACTOR_OPERATOR\n", + " Two redaction techniques are supported - replace(default), redact \n", + " --pii_redactor_transformed_contents PII_TRANSFORMED_CONTENT_COLUMN_NAME\n", + " Mention the column name in which transformed contents will be added. This is a required argument. \n", + " --pii_redactor_score_threshold SCORE_THRESHOLD\n", + " The score_threshold is a parameter that sets the minimum confidence score required for an entity to be considered a match. 
Provide a value above 0.6\n", + "```\n", + "#####" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "from dpk_pii_redactor.transform_python import PIIRedactor" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters and invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "95737436", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "17:17:13 INFO - pipeline id pipeline_id\n", + "17:17:13 INFO - code location None\n", + "17:17:13 INFO - data factory data_ is using local data access: input_folder - ray/test-data/input output_folder - output\n", + "17:17:13 INFO - data factory data_ max_files -1, n_sample -1\n", + "17:17:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "17:17:13 INFO - orchestrator pii_redactor started at 2025-01-16 17:17:13\n", + "17:17:13 INFO - Number of files is 1, source profile {'max_file_size': 0.0023164749145507812, 'min_file_size': 0.0023164749145507812, 'total_file_size': 0.0023164749145507812}\n", + "17:17:13 INFO - Loading model from flair/ner-english-large\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-01-16 17:17:23,474 SequenceTagger predicts: Dictionary with 20 tags: , O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, , \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "17:17:24 INFO - Completed 1 files (100.0%) in 0.005 
min\n", + "17:17:24 INFO - Done processing 1 files, waiting for flush() completion.\n", + "17:17:24 INFO - done flushing in 0.0 sec\n", + "17:17:24 INFO - Completed execution in 0.177 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "PIIRedactor(input_folder='ray/test-data/input',\n", + " output_folder= 'output',\n", + " pii_redactor_entities = [\"PERSON\", \"EMAIL_ADDRESS\"],\n", + " pii_redactor_operator = \"replace\",\n", + " pii_redactor_transformed_contents = \"title\").transform()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['output/metadata.json', 'output/test1.parquet']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/pii_redactor/python/.dockerignore b/transforms/language/pii_redactor/python/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/language/pii_redactor/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git 
a/transforms/language/pii_redactor/python/Makefile b/transforms/language/pii_redactor/python/Makefile deleted file mode 100644 index 50161da6e..000000000 --- a/transforms/language/pii_redactor/python/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. 
-set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(PII_REDACTOR_PYTHON_VERSION) TOML_VERSION=$(PII_REDACTOR_PYTHON_VERSION) .transforms.set-versions - -build-dist:: set-versions .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.python-test-image - -run-cli-sample: .transforms.run-cli-python-sample - -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: .transforms.run-local-python-sample - -#run-s3-ray-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/language/pii_redactor/python/README.md b/transforms/language/pii_redactor/python/README.md deleted file mode 100644 index b361e0c75..000000000 --- a/transforms/language/pii_redactor/python/README.md +++ /dev/null @@ -1,80 +0,0 @@ -# PII Redactor Transform - -This transform redacts Personally Identifiable Information (PII) from the input data. - -The transform leverages the [Microsoft Presidio SDK](https://microsoft.github.io/presidio/) for PII detection and uses the Flair recognizer for entity recognition. - -### Supported Entities - -The transform detects the following PII entities by default: -- **PERSON**: Names of individuals -- **EMAIL_ADDRESS**: Email addresses -- **ORGANIZATION**: Names of organizations -- **DATE_TIME**: Dates and times -- **PHONE_NUMBER**: Phone number -- **CREDIT_CARD**: Credit card numbers - -You can configure the entities to detect by passing the required entities as argument param ( **--pii_redactor_entities** ). 
-To know more about different entity types supported - [Entities](https://microsoft.github.io/presidio/supported_entities/) - -### Redaction Techniques - -Two redaction techniques are supported: -- **replace**: Replaces detected PII with a placeholder (default) -- **redact**: Removes the detected PII from the text - -You can choose the redaction technique by passing it as an argument parameter (**--pii_redactor_operator**). - -## Input and Output - -### Input - -The input data should be a `py.Table` with a column containing the text where PII detection and redaction will be applied. By default, this column is named `contents`. - -**Example Input Table Structure:** Table 1: Sample input to the pii redactor transform - -| contents | doc_id | -|---------------------|--------| -| My name is John Doe | doc001 | -| I work at apple | doc002 | - - -### Output - -The output table will include the original columns plus an additional column `new_contents` which is configurable with redacted text and `detected_pii` -column consisting the type of PII entities detected in that document for replace operator. 
- -**Example Output Table Structure for replace operator:** - -| contents | doc_id | new_contents | detected_pii | -|---------------------|--------|--------------------------|------------------| -| My name is John Doe | doc001 | My name is `` | `[PERSON]` | -| I work at apple | doc002 | I work at `` | `[ORGANIZATION]` | - -When `redact` operator is chosen the output will look like below - -**Example Output Table Structure for redact operator** - -| contents | doc_id | new_contents | detected_pii | -|---------------------|--------|--------------------------|------------------| -| My name is John Doe | doc001 | My name is | `[PERSON]` | -| I work at apple | doc002 | I work at | `[ORGANIZATION]` | - -## Running - -### Launched Command Line Options -The following command line arguments are available in addition to -the options provided by -the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). - -``` - --pii_redactor_entities PII_ENTITIES - list of PII entities to be captured for example: ["PERSON", "EMAIL"] - --pii_redactor_operator REDACTOR_OPERATOR - Two redaction techniques are supported - replace(default), redact - --pii_redactor_transformed_contents PII_TRANSFORMED_CONTENT_COLUMN_NAME - Mention the column name in which transformed contents will be added. This is required argument. - --pii_redactor_score_threshold SCORE_THRESHOLD - The score_threshold is a parameter that sets the minimum confidence score required for an entity to be considered a match. 
- Provide a value above 0.6 -``` \ No newline at end of file diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml deleted file mode 100644 index a136ea5e4..000000000 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_pii_redactor_transform_python" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "PII redactor Transform for Python" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Sowmya.L.R", email = "lrsowmya@gmail.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/language/pii_redactor/ray/Makefile b/transforms/language/pii_redactor/ray/Makefile deleted file mode 100644 index e52494534..000000000 --- a/transforms/language/pii_redactor/ray/Makefile +++ /dev/null @@ -1,67 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. 
- -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -#test-image:: .transforms.ray-test-image -test-image:: - @echo "Skip... do nothing! pushing CI/CD over a cliff with OSError on text_encoder " - -build:: build-dist image - -publish: publish-image - -#publish-image:: .transforms.publish-image-ray -publish-image:: - @echo "Skip... do nothing! pushing CI/CD over a cliff with OSError on text_encoder " - -setup:: .transforms.setup - -# set the version of python transform that this depends on. 
-set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=${PII_REDACTOR_PYTHON_VERSION} TOML_VERSION=$(PII_REDACTOR_PYTHON_VERSION) .transforms.set-versions - -build-dist:: set-versions .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -run-cli-sample: .transforms.run-cli-ray-sample - -run-local-sample: .transforms.run-local-ray-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/language/pii_redactor/ray/README.md b/transforms/language/pii_redactor/ray/README.md deleted file mode 100644 index 1c3a4b144..000000000 --- a/transforms/language/pii_redactor/ray/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# PII Redactor Ray Transform -Please see the set of -[transform project conventions](../../../README.md#transform-project-conventions) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary -This project wraps the [pii redactor transform](../python) with a Ray runtime. - -## Configuration and command line Options - -PII redactor configuration and command line options are the same as for the [base python](../python) transform and in additional it also supports other options refer [pii redactor transform](../python/README.md) . - -## Running - -### Launched Command Line Options -In addition to those available to the transform as defined in [here](../python/README.md), -the set of -[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. 
- -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/pii_redactor_transform.py using command line args -* `run-local-sample` - runs src/pii_redactor_local_ray.py -* `run-s3-sample` - runs src/pii_redactor_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml deleted file mode 100644 index b3cf0283d..000000000 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ /dev/null @@ -1,50 +0,0 @@ -[project] -name = "dpk_pii_redactor_transform_ray" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "PII Redactor Ray Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, -] -dependencies = [ - "dpk_pii_redactor_transform_python==0.2.4.dev0", - "data-prep-toolkit[ray]>=0.2.4.dev0", - "presidio-analyzer>=2.2.355", - "presidio-anonymizer>=2.2.355", - "flair>=0.14.0", - "pandas>=2.2.2", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/requirements.txt similarity index 75% rename from transforms/language/pii_redactor/python/requirements.txt rename to transforms/language/pii_redactor/requirements.txt index 2e4ca197e..915c234e5 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ 
b/transforms/language/pii_redactor/requirements.txt @@ -1,4 +1,3 @@ -data-prep-toolkit>=0.2.3 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/python/test-data/input/pii_test_data.parquet b/transforms/language/pii_redactor/test-data/input/pii_test_data.parquet similarity index 100% rename from transforms/language/pii_redactor/python/test-data/input/pii_test_data.parquet rename to transforms/language/pii_redactor/test-data/input/pii_test_data.parquet diff --git a/transforms/language/pii_redactor/python/test/test_data.py b/transforms/language/pii_redactor/test/test_data.py similarity index 100% rename from transforms/language/pii_redactor/python/test/test_data.py rename to transforms/language/pii_redactor/test/test_data.py diff --git a/transforms/language/pii_redactor/python/test/test_pii_analyzer.py b/transforms/language/pii_redactor/test/test_pii_analyzer.py similarity index 97% rename from transforms/language/pii_redactor/python/test/test_pii_analyzer.py rename to transforms/language/pii_redactor/test/test_pii_analyzer.py index 75d237218..36446d6d0 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_analyzer.py +++ b/transforms/language/pii_redactor/test/test_pii_analyzer.py @@ -10,7 +10,7 @@ ################################################################################ import pytest -from pii_analyzer import PIIAnalyzerEngine +from dpk_pii_redactor.pii_analyzer import PIIAnalyzerEngine @pytest.fixture(scope="module") diff --git a/transforms/language/pii_redactor/python/test/test_pii_anonymizer.py b/transforms/language/pii_redactor/test/test_pii_anonymizer.py similarity index 93% rename from transforms/language/pii_redactor/python/test/test_pii_anonymizer.py rename to transforms/language/pii_redactor/test/test_pii_anonymizer.py index 5bcfd4aa3..2bb693c34 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_anonymizer.py +++ 
b/transforms/language/pii_redactor/test/test_pii_anonymizer.py @@ -10,8 +10,8 @@ ################################################################################ import pytest -from pii_analyzer import PIIAnalyzerEngine -from pii_anonymizer import PIIAnonymizer +from dpk_pii_redactor.pii_analyzer import PIIAnalyzerEngine +from dpk_pii_redactor.pii_anonymizer import PIIAnonymizer @pytest.fixture(scope="module") diff --git a/transforms/language/pii_redactor/ray/test/test_pii_redactor_ray.py b/transforms/language/pii_redactor/test/test_pii_redactor_ray.py similarity index 89% rename from transforms/language/pii_redactor/ray/test/test_pii_redactor_ray.py rename to transforms/language/pii_redactor/test/test_pii_redactor_ray.py index 360ed4286..0458116ab 100644 --- a/transforms/language/pii_redactor/ray/test/test_pii_redactor_ray.py +++ b/transforms/language/pii_redactor/test/test_pii_redactor_ray.py @@ -16,8 +16,8 @@ AbstractTransformLauncherTest, ) from data_processing_ray.runtime.ray import RayTransformLauncher -from pii_redactor_transform import doc_transformed_contents_cli_param -from pii_redactor_transform_ray import PIIRedactorRayTransformConfiguration +from dpk_pii_redactor.transform import doc_transformed_contents_cli_param +from dpk_pii_redactor.ray.transform import PIIRedactorRayTransformConfiguration class TestRayPIIRedactorTransform(AbstractTransformLauncherTest): @@ -27,7 +27,7 @@ class TestRayPIIRedactorTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = "../test-data" + basedir = "../ray/test-data" basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) fixtures = [] launcher = RayTransformLauncher(PIIRedactorRayTransformConfiguration()) diff --git a/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py b/transforms/language/pii_redactor/test/test_pii_redactor_redact_anonamize.py similarity index 97% rename from 
transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py rename to transforms/language/pii_redactor/test/test_pii_redactor_redact_anonamize.py index 71d3bfb25..c5fd53f71 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py +++ b/transforms/language/pii_redactor/test/test_pii_redactor_redact_anonamize.py @@ -12,7 +12,7 @@ from data_processing.test_support.transform.table_transform_test import ( AbstractTableTransformTest, ) -from pii_redactor_transform import ( +from dpk_pii_redactor.transform import ( PIIRedactorTransform, doc_transformed_contents_key, redaction_operator_key, diff --git a/transforms/language/pii_redactor/python/test/test_pii_redactor_transform.py b/transforms/language/pii_redactor/test/test_pii_redactor_transform.py similarity index 93% rename from transforms/language/pii_redactor/python/test/test_pii_redactor_transform.py rename to transforms/language/pii_redactor/test/test_pii_redactor_transform.py index 867d0e7d1..7cfc2dfbf 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_redactor_transform.py +++ b/transforms/language/pii_redactor/test/test_pii_redactor_transform.py @@ -12,7 +12,7 @@ from data_processing.test_support.transform.table_transform_test import ( AbstractTableTransformTest, ) -from pii_redactor_transform import PIIRedactorTransform, doc_transformed_contents_key +from dpk_pii_redactor.transform import PIIRedactorTransform, doc_transformed_contents_key from test_data import expected_metadata_list, expected_table, table diff --git a/transforms/language/pii_redactor/transform.config b/transforms/language/pii_redactor/transform.config deleted file mode 100644 index c06adf82c..000000000 --- a/transforms/language/pii_redactor/transform.config +++ /dev/null @@ -1,20 +0,0 @@ -# -# This is intended to be included across the Makefiles provided within -# a given transform's directory tree, so must use compatible syntax. 
-# -################################################################################ -# This defines the name of the transform and is used to match against -# expected files and is used to define the transform's image name. -TRANSFORM_NAME=pii_redactor - -################################################################################ -# This defines the transforms' version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION) -PII_REDACTOR_RAY_VERSION=$(PII_REDACTOR_PYTHON_VERSION) -PII_REDACTOR_SPARK_VERSION=$(PII_REDACTOR_PYTHON_VERSION) - diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 5936cba31..4e3dde1f3 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "1.0.0a4" +version = "1.0.0a5" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -31,8 +31,7 @@ all = { file = [ "code/code_profiler/requirements.txt", -##### pii_redactor seem to be failing UT -## "language/pii_redactor/python/requirements.txt", +"language/pii_redactor/requirements.txt", "universal/profiler/python/requirements.txt", "universal/resize/python/requirements.txt", @@ -59,8 +58,7 @@ all = { file = [ ]} language = { file = [ -##### pii_redactor seem to be failing UT -## "language/pii_redactor/python/requirements.txt", +"language/pii_redactor/requirements.txt", "language/lang_id/requirements.txt", "language/doc_quality/requirements.txt", @@ -92,8 +90,6 @@ license_select = { file = ["code/license_select/python/requirements.txt"]} 
code_quality = { file = ["code/code_quality/python/requirements.txt"]} code2parquet = {file = ["code/code2parquet/python/requirements.txt"]} -pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]} - profiler = { file = ["universal/profiler/python/requirements.txt"]} resize = { file = ["universal/resize/python/requirements.txt"]} @@ -104,6 +100,7 @@ html2parquet = { file = ["language/html2parquet/requirements.txt"]} lang_id = { file = ["language/lang_id/requirements.txt"]} pdf2parquet = { file = ["language/pdf2parquet/requirements.txt"]} text_encoder = { file = ["language/text_encoder/requirements.txt"]} +pii_redactor = { file = ["language/pii_redactor/requirements.txt"]} filter = { file = ["universal/filter/requirements.txt"]} doc_id = { file = ["universal/doc_id/requirements.txt"]} @@ -132,6 +129,7 @@ dpk_html2parquet = "language/html2parquet/dpk_html2parquet" dpk_lang_id = "language/lang_id/dpk_lang_id" dpk_pdf2parquet = "language/pdf2parquet/dpk_pdf2parquet" dpk_text_encoder = "language/text_encoder/dpk_text_encoder" +dpk_pii_redactor = "language/pii_redactor/dpk_pii_redactor" dpk_doc_id = "universal/doc_id/dpk_doc_id" dpk_hap = "universal/hap/dpk_hap" dpk_ededup = "universal/ededup/dpk_ededup"