Skip to content

Commit

Permalink
Merge pull request #682 from IBM/dev1-singlepackage-p3.12
Browse files Browse the repository at this point in the history
Single Packages (0.2.2.dev1) for data-prep-toolkit and data-prep-toolkit-transforms  with python3.12
  • Loading branch information
touma-I authored Oct 13, 2024
2 parents 34ade66 + be4bfe4 commit 18c99f5
Show file tree
Hide file tree
Showing 129 changed files with 568 additions and 863 deletions.
12 changes: 10 additions & 2 deletions .make.defaults
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,10 @@ endif
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR);
if [ -e $(PYTHON_PROJECT_DIR)/requirements.txt ]; then \
pip install -r $(PYTHON_PROJECT_DIR)/requirements.txt; \
fi; \
pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR)
@echo Done installing source from $(PYTHON_PROJECT_DIR) into venv

# Install local requirements last as it generally includes our lib source
Expand Down Expand Up @@ -348,6 +351,11 @@ endif
.defaults.ray-lib-src-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv .defaults.install-local-requirements-venv
@# Help: Create the venv and install Ray library source, local dependencies and adjacent python source if present.

# Create the venv and install only the Ray library source (plus adjacent python
# source, if present). Unlike .defaults.ray-lib-src-venv, this rule does NOT run
# .defaults.install-local-requirements-venv, so local requirements are left for
# the caller to install.
.PHONY: .defaults.kfp-venv
.defaults.kfp-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv
	@# Help: Create the venv and install Ray library source and adjacent python source if present (local requirements are not installed).

# Install all source from the repo for a ray runtime transform into an existing venv
# And if there is an adjacent python dir (as for transforms), then also install that source
.PHONY: .defaults.install-ray-lib-src-venv
Expand Down Expand Up @@ -633,7 +641,7 @@ endif
rm -rf dist || true
rm -rf src/*egg-info || true
${PIP} install --upgrade build
${PYTHON} -m build
${PYTHON} -m build $(BUILD_WHEEL_EXTRA_ARG)

# Publish the distribution in the dist directory, usually created with .defaults.build-dist target
.PHONY: .defaults.publish-dist
Expand Down
2 changes: 1 addition & 1 deletion .make.versions
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ DPK_MINOR_VERSION=2
DPK_MICRO_VERSION=2
# The suffix is generally always set in the main/development branch and only nulled out when creating release branches.
# It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi.
DPK_VERSION_SUFFIX=.dev0
DPK_VERSION_SUFFIX=.dev1

DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX)

Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,11 @@ conda install gcc_linux-64
conda install gxx_linux-64
```

Next, install the data prep toolkit library. This library installs both the python and ray versions of the transforms.
Next, install the data prep toolkit library. This library installs both the python and ray versions of the transforms. For better management of dependencies, it is recommended to install the same tagged version of both the library and the transform.

```bash
pip3 install data-prep-toolkit-transforms-ray
pip3 install data-prep-toolkit[ray]==0.2.2
pip3 install data-prep-toolkit-transforms[ray,all]==0.2.2
pip3 install jupyterlab ipykernel ipywidgets

## install custom kernel
Expand Down
8 changes: 8 additions & 0 deletions data-processing-lib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ REPOROOT=..

# Get some common rules for the whole repo
include $(REPOROOT)/.make.defaults
include $(REPOROOT)/.make.versions

########## ########## ########## ########## ########## ########## ########## ##########
# Global rules that are generally to be implemented in the sub-directories and can
Expand Down Expand Up @@ -53,5 +54,12 @@ publish::

# Set versions for the library and everything below this directory.
# Step 1 runs .defaults.update-toml in this directory with TOML_VERSION set to
# $(DPK_LIB_VERSION) (presumably rewrites the version in the local
# pyproject.toml -- confirm against .make.defaults).
# Step 2 re-runs this same rule ($@) in the sub-directories through .recurse.
set-versions:
@# Help: Recursively $@ in all subdirs
$(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
@$(MAKE) RULE=$@ .recurse


# Build the distribution for the single library package by delegating to
# .defaults.build-dist; BUILD_WHEEL_EXTRA_ARG=-w is appended to the
# `python -m build` command line by that rule.
build-pkg-dist::
$(MAKE) .defaults.build-dist BUILD_WHEEL_EXTRA_ARG=-w

# Publish whatever is in the dist directory by delegating to .defaults.publish-dist.
publish-dist :: .defaults.publish-dist

43 changes: 43 additions & 0 deletions data-processing-lib/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
[project]
name = "data_prep_toolkit"
version = "0.2.2.dev1"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
requires-python = ">=3.10,<3.13"
description = "Data Preparation Toolkit Library for Ray and Python"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Maroun Touma", email = "touma@us.ibm.com" },
]

dynamic = ["dependencies", "optional-dependencies"]

# Project links published with the package metadata. These must live in the
# `urls` sub-table of `[project]`; a top-level `[project_urls]` table is not
# part of the project metadata, so the links would not be published.
[project.urls]
Repository = "https://github.com/IBM/data-prep-kit"
Issues = "https://github.com/IBM/data-prep-kit/issues"
Documentation = "https://ibm.github.io/data-prep-kit/"
"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop"

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic.dependencies]
file = ["requirements.txt"]

[tool.setuptools.dynamic.optional-dependencies]
dev = { file = ["requirements-dev.txt"]}
ray = { file = ["requirements-ray.txt"]}
spark = { file = ["requirements-spark.txt"]}

[tool.setuptools.packages.find]
where = ["python/src", "ray/src", "spark/src"]


[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = ["unit: unit tests", "integration: integration tests"]

[tool.coverage.run]
include = ["src/*"]
2 changes: 1 addition & 1 deletion data-processing-lib/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit"
version = "0.2.2.dev0"
version = "0.2.2.dev1"
requires-python = ">=3.10,<3.13"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
description = "Data Preparation Toolkit Library"
Expand Down
4 changes: 2 additions & 2 deletions data-processing-lib/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_ray"
version = "0.2.2.dev0"
version = "0.2.2.dev1"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
requires-python = ">=3.10,<3.13"
description = "Data Preparation Toolkit Library for Ray"
Expand All @@ -11,7 +11,7 @@ authors = [
{ name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
]
dependencies = [
"data-prep-toolkit>=0.2.2.dev0",
"data-prep-toolkit>=0.2.2.dev1",
"ray[default]==2.36.1",
# These two are to fix security issues identified by quay.io
"fastapi>=0.110.2",
Expand Down
9 changes: 9 additions & 0 deletions data-processing-lib/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
twine
pytest>=7.3.2
pytest-dotenv>=0.5.2
pytest-env>=1.0.0
pre-commit>=3.3.2
pytest-cov>=4.1.0
pytest-mock>=3.10.0
moto==5.0.5
markupsafe==2.0.1
3 changes: 3 additions & 0 deletions data-processing-lib/requirements-ray.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ray[default]==2.36.1
fastapi>=0.110.2
pillow>=10.3.0
2 changes: 2 additions & 0 deletions data-processing-lib/requirements-spark.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pyspark>=3.5.2
psutil>=6.0.0
6 changes: 6 additions & 0 deletions data-processing-lib/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
numpy < 1.29.0
pyarrow==16.1.0
boto3==1.34.69
argparse
mmh3
psutil
4 changes: 2 additions & 2 deletions data-processing-lib/spark/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_spark"
version = "0.2.2.dev0"
version = "0.2.2.dev1"
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
requires-python = ">=3.10,<3.13"
description = "Data Preparation Toolkit Library for Spark"
Expand All @@ -11,7 +11,7 @@ authors = [
{ name = "Boris Lublinsky", email = "blublinsk@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.2.dev0",
"data-prep-toolkit==0.2.2.dev1",
"pyspark>=3.5.2",
"psutil>=6.0.0",
"PyYAML>=6.0.2"
Expand Down
2 changes: 1 addition & 1 deletion doc/quick-start/quick-start.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ or
**Deploy the latest releases of the data prep toolkit library, all python transforms and all ray transforms**

```shell
pip3 install data-prep-toolkit-transforms-ray
pip3 install data-prep-toolkit-transforms[ray]
```

## Running transforms
Expand Down
2 changes: 1 addition & 1 deletion examples/notebooks/Run_your_first_transform_colab.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
},
"outputs": [],
"source": [
"! pip install --default-timeout=100 data-prep-toolkit-transforms-ray\n"
"! pip install --default-timeout=100 data-prep-toolkit-transforms[ray]\n"
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_v1"
version = "0.2.2.dev0"
version = "0.2.2.dev1"
requires-python = ">=3.10,<3.13"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -13,7 +13,7 @@ authors = [
]
dependencies = [
"kfp==1.8.22",
"data-prep-toolkit-kfp-shared==0.2.2.dev0",
"data-prep-toolkit-kfp-shared==0.2.2.dev1",
]

[build-system]
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_v2"
version = "0.2.2.dev0"
version = "0.2.2.dev1"
requires-python = ">=3.10,<3.13"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -14,7 +14,7 @@ authors = [
dependencies = [
"kfp==2.8.0",
"kfp-kubernetes==1.2.0",
"data-prep-toolkit-kfp-shared==0.2.2.dev0",
"data-prep-toolkit-kfp-shared==0.2.2.dev1",
]

[build-system]
Expand Down
4 changes: 2 additions & 2 deletions kfp/kfp_support_lib/shared_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_kfp_shared"
version = "0.2.2.dev0"
version = "0.2.2.dev1"
requires-python = ">=3.10,<3.13"
description = "Data Preparation Kit Library. KFP support"
license = {text = "Apache-2.0"}
Expand All @@ -14,7 +14,7 @@ authors = [
dependencies = [
"requests",
"kubernetes",
"data-prep-toolkit-ray==0.2.2.dev0",
"data-prep-toolkit-ray==0.2.2.dev1",
]

[build-system]
Expand Down
7 changes: 7 additions & 0 deletions transforms/.make.transforms
Original file line number Diff line number Diff line change
Expand Up @@ -342,4 +342,11 @@ minio-stop:
> tt.toml; \
mv tt.toml pyproject.toml; \
fi
if [ -e requirements.txt ]; then \
cat requirements.txt | sed \
-e 's/\(dpk[_-].*transform[_-]python[=<>~][=]\).*/\1$(TRANSFORM_PYTHON_VERSION)/' \
> tt.txt; \
mv tt.txt requirements.txt; \
fi


2 changes: 1 addition & 1 deletion transforms/.make.workflows
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ endif

${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/kfp_ray_components/requirements.txt ${DPK_RAY_LIB_DIR} ${KFP_LIB_SRC_FILES} ${KFP_LIB_CONFIG_FILE} ${KFP_SHARED_LIB_SRC_FILES}
rm -rf ${REPOROOT}/transforms/venv
$(MAKE) -C ${REPOROOT}/transforms .defaults.ray-lib-src-venv
$(MAKE) -C ${REPOROOT}/transforms .defaults.kfp-venv
. ${WORKFLOW_VENV_ACTIVATE}; \
pip install -e $(REPOROOT)/kfp/kfp_support_lib/shared_workflow_support; \
pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); \
Expand Down
52 changes: 52 additions & 0 deletions transforms/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
REPOROOT=../
# Use make help, to see the available rules
include ../.make.defaults
include ./transform.config


setup::
@# Help: Recursively make $@ all subdirs
Expand Down Expand Up @@ -78,4 +80,54 @@ workflow-upload::

# Stamp this package's own version first, then apply the same rule in every
# sub-directory. Sub-makes are invoked through $(MAKE) (never literal `make`)
# so that -n, -j/jobserver and command-line variables propagate.
set-versions::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) set-pkg-version
	@$(MAKE) RULE=$@ .recurse

# Rewrite the version of this package and the pinned data-prep-toolkit version:
#   pyproject.toml        version = "$(TRANSFORMS_PKG_VERSION)"
#   requirements.txt      data-prep-toolkit<op>$(DPK_VERSION)
#   requirements-ray.txt  data-prep-toolkit[ray]<op>$(DPK_VERSION)
# The comparison operator already present in each requirement line is kept.
# sed reads each file directly, so a missing file fails the rule instead of
# silently producing an empty one, and each rewrite uses its own temp file
# that only replaces the original when sed succeeded.
set-pkg-version:
	@# Help: Set tag for this package and its dependencies
	sed -e 's/^version[ ]*=.*/version = "'${TRANSFORMS_PKG_VERSION}'"/' \
		pyproject.toml > pyproject.toml.tmp \
		&& mv pyproject.toml.tmp pyproject.toml
	echo $(DPK_VERSION)
	sed -e 's/data-prep-toolkit\([=><~][=]\).*/data-prep-toolkit\1$(DPK_VERSION)/' \
		requirements.txt > requirements.txt.tmp \
		&& mv requirements.txt.tmp requirements.txt
	sed -e 's/data-prep-toolkit\[ray\]\([=><~][=]\).*/data-prep-toolkit\[ray\]\1$(DPK_VERSION)/' \
		requirements-ray.txt > requirements-ray.txt.tmp \
		&& mv requirements-ray.txt.tmp requirements-ray.txt



# Build a single wheel containing every transform.
# Most transforms today don't have a package name, so as a stop-gap the content
# of every <transform>/python/src and <transform>/ray/src directory is copied
# into one temporary ./src folder, the wheel is built from it, and the folder
# is removed again.
# The source directories are located by the shell at run time, after ./src has
# been recreated, and are matched by exact path suffix. The earlier
# `grep '[ray| python]/src$'` was a bracket expression (one character out of a
# set), not an alternation, so it also matched unrelated directories.
build-pkg-dist:
	@# Help: Build package wheel
	-rm -fr src
	mkdir src
	for x in $$(find . -type d \( -path '*/ray/src' -o -path '*/python/src' \)); do \
		echo $$x ; \
		cp -r $$x/* src ; \
	done
	@# Only needs to build the whl
	$(MAKE) BUILD_WHEEL_EXTRA_ARG=-w .defaults.build-dist
	-rm -fr src


# Install the freshly built library and transforms wheels into a clean ./venv
# and run pytest on every <transform>/python/test and <transform>/ray/test dir.
# - The venv is activated with `.` (POSIX) rather than `source` (bash only), so
#   the recipe does not depend on which shell SHELL points to.
# - Test directories are matched by exact path suffix; ./venv itself is skipped
#   so that installed third-party packages are never picked up.
# - Every directory is tested; the rule fails at the end if any run failed
#   (previously only the exit status of the last pytest run was reported).
# NOTE(review): the transforms wheel name uses $(DPK_TRANSFORMS_VERSION) while
# set-pkg-version stamps pyproject.toml with $(TRANSFORMS_PKG_VERSION) --
# confirm both resolve to the same value.
test-pkg-dist:
	@# Help: Setup environment and run unit tests for all transforms.
	-rm -fr venv
	python -m venv venv
	. venv/bin/activate && $(PYTHON) -m pip install '$(REPOROOT)/data-processing-lib/dist/data_prep_toolkit-$(DPK_VERSION)-py3-none-any.whl[dev,ray]'
	. venv/bin/activate && $(PYTHON) -m pip install 'dist/data_prep_toolkit_transforms-$(DPK_TRANSFORMS_VERSION)-py3-none-any.whl[all]'
	. venv/bin/activate || exit 1; \
	rc=0; \
	for T in $$(find . -type d \( -path '*/ray/test' -o -path '*/python/test' \) ! -path './venv/*'); do \
		echo "running unit test on: $$T" ; \
		$(PYTEST) $$T || rc=1; \
	done; \
	exit $$rc

# Publish the transforms distribution found in ./dist by delegating to the
# shared .defaults.publish-dist rule.
publish-dist :: .defaults.publish-dist

Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@
The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on PyPI and can be installed using pip install:

`python -m pip install data-prep-toolkit-transforms`
or
`python -m pip install data-prep-toolkit-transforms[ray]`


installing the python transforms will also install `data-prep-toolkit`

installing the ray transforms will also install `data-prep-toolkit[ray]`

## List of Transforms in current package

Note: This list includes the transforms that were part of the release starting with data-prep-toolkit-transforms:0.2.1. This list may not always reflect up-to-date information. Users are encouraged to raise an issue in git when they discover missing components or packages that are listed below but not in the current release they get from PyPI.
Expand Down
1 change: 1 addition & 0 deletions transforms/code/code2parquet/python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
Expand Down
11 changes: 5 additions & 6 deletions transforms/code/code2parquet/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_code2parquet_transform_python"
version = "0.2.2.dev0"
version = "0.2.2.dev1"
requires-python = ">=3.10,<3.13"
description = "code2parquet Python Transform"
license = {text = "Apache-2.0"}
Expand All @@ -9,16 +9,15 @@ authors = [
{ name = "David Wood", email = "dawood@us.ibm.com" },
{ name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.2.dev0",
"parameterized",
"pandas",
]
dynamic = ["dependencies"]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}

[project.optional-dependencies]
dev = [
"twine",
Expand Down
3 changes: 3 additions & 0 deletions transforms/code/code2parquet/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
data-prep-toolkit==0.2.2.dev1
parameterized
pandas
Loading

0 comments on commit 18c99f5

Please sign in to comment.