Skip to content

Commit

Permalink
first cut at refactoring fdedup as its own named dpk_ module
Browse files Browse the repository at this point in the history
Signed-off-by: Maroun Touma <touma@us.ibm.com>
  • Loading branch information
touma-I committed Dec 18, 2024
1 parent d4ffb13 commit 2bd246d
Show file tree
Hide file tree
Showing 135 changed files with 546 additions and 916 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,21 @@ FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest
ARG DPK_WHEEL_FILE_NAME

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk
ARG DPK_WHEEL_FILE_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root README.md README.md
COPY --chown=dpk:root dpk_fdedup/ dpk_fdedup/
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

RUN pip install --no-cache-dir -e .

# copy source data
COPY ./src/fdedup_transform_python.py fdedup_transform_python.py
COPY ./src/fdedup_transform_python.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk
Expand All @@ -38,4 +25,4 @@ ENV PYTHONPATH /home/dpk
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
LABEL git-commit=$GIT_COMMIT
31 changes: 31 additions & 0 deletions transforms/universal/fdedup/Dockerfile.ray
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Image for running the fdedup transform on top of a Ray base image.
# The base image can be overridden at build time with --build-arg BASE_IMAGE=...
ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310

FROM ${BASE_IMAGE}

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# File name of the data processing library wheel inside data-processing-dist/.
# No default is set here, so it is expected to be passed with --build-arg.
ARG DPK_WHEEL_FILE_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=ray:users data-processing-dist data-processing-dist
# The argument is quoted so the shell never treats the "[ray]" extras suffix as a glob pattern,
# and --no-cache-dir keeps pip's download cache out of the image layer (as in the installs above).
RUN pip install --no-cache-dir "data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]"

## Copy the transform package (dpk_fdedup) and install its requirements
COPY --chown=ray:users dpk_fdedup/ dpk_fdedup/
COPY --chown=ray:users requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Grant non-root users the necessary permissions to the ray directory
RUN chmod 755 /home/ray

# Set environment
# NOTE(review): the COPY destinations above are relative to the base image's working
# directory; this presumably is /home/ray so that dpk_fdedup is importable - confirm.
ENV PYTHONPATH=/home/ray

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
40 changes: 40 additions & 0 deletions transforms/universal/fdedup/Dockerfile.spark
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Image for running the fdedup transform on top of the data-prep-kit Spark base image.
# NOTE(review): the base image is referenced by the floating ":latest" tag, so rebuilds
# are not reproducible - consider pinning a version or digest.
FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest

# Package installation and directory creation below are done as root;
# the image switches back to the spark user further down.
USER root
# install pytest
RUN pip install --no-cache-dir pytest

WORKDIR ${SPARK_HOME}/work-dir

# File name of the data processing library wheel inside data-processing-dist/.
# No default is set here, so it is expected to be passed with --build-arg.
ARG DPK_WHEEL_FILE_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=spark:root data-processing-dist data-processing-dist
# The argument is quoted so the shell never treats the "[spark]" extras suffix as a glob pattern,
# and --no-cache-dir keeps pip's download cache out of the image layer (as in the pytest install).
RUN pip install --no-cache-dir "data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]"


# Install project source

## Copy the transform package (dpk_fdedup) and install its requirements
COPY --chown=spark:root dpk_fdedup/ dpk_fdedup/
COPY --chown=spark:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Spark-on-Kubernetes deployment files: executor pod template and Spark profile.
# NOTE(review): these paths hard-code /opt/spark while WORKDIR and PYTHONPATH use
# ${SPARK_HOME}; they only line up if the base image sets SPARK_HOME=/opt/spark - confirm.
RUN mkdir -p /opt/spark/work-dir/src/templates && \
    mkdir -p /opt/spark/work-dir/config
COPY --chown=spark:root spark-deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/
COPY --chown=spark:root spark-deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/


# Drop root privileges for runtime.
USER spark

# Set environment
ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH}
ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH}

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT

86 changes: 15 additions & 71 deletions transforms/universal/fdedup/Makefile
Original file line number Diff line number Diff line change
@@ -1,79 +1,23 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults
include $(REPOROOT)/transforms/.make.cicd.targets

setup::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse
#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform and is used to match against
# expected files and is used to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

clean::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse
################################################################################

build::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse
venv::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse

image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

set-versions:
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

publish::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-src::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

kind-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-save-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

.PHONY: workflow-venv
workflow-venv:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-venv; \
fi

.PHONY: workflow-test
workflow-test:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-test; \
fi

.PHONY: workflow-upload
workflow-upload:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-upload; \
fi

.PHONY: workflow-build
workflow-build:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-build; \
fi

# Sample command-line run of the transform: re-invokes make on the
# .transforms.run-src-file rule (not defined in this file; presumably provided by an
# included makefile - confirm) with RUN_FILE set to $(TRANSFORM_NAME)_transform_ray.py
# and RUN_ARGS selecting a local run that reads ../test-data/input, writes ../output
# and uses int_id_column as the fdedup id column.
# NOTE(review): the target name "un-cli-sample" looks like a truncated "run-cli-sample" - confirm.
# NOTE(review): the recipe lines below must begin with a hard TAB in the real Makefile.
un-cli-sample:
$(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \
RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \
--fdedup_id_column int_id_column" \
.transforms.run-src-file
Loading

0 comments on commit 2bd246d

Please sign in to comment.