diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 68581eb2..a8b70043 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -53,3 +53,6 @@ jobs:
 
       - name: Build components
         run: ./scripts/build_components.sh --cache -t $GITHUB_SHA -t dev
+
+      - name: Build base image
+        run: ./scripts/build_base_image.sh -t $GITHUB_SHA
diff --git a/.github/workflows/prep-release.yaml b/.github/workflows/prep-release.yaml
index 2d57b43c..1217e142 100644
--- a/.github/workflows/prep-release.yaml
+++ b/.github/workflows/prep-release.yaml
@@ -62,6 +62,9 @@ jobs:
       - name: Build data explorer
         run: ./scripts/build_explorer.sh -t $GITHUB_REF_NAME
 
+      - name: Build base image
+        run: ./scripts/build_base_image.sh -t $GITHUB_REF_NAME
+
       - name: Update version in pyproject.toml with tag version
         run: sed -i "s/^version = .*/version = '${{github.ref_name}}'/" pyproject.toml
 
diff --git a/images/Dockerfile b/images/Dockerfile
new file mode 100644
index 00000000..4bafaa79
--- /dev/null
+++ b/images/Dockerfile
@@ -0,0 +1,12 @@
+ARG PYTHON_VERSION
+FROM --platform=linux/amd64 python:${PYTHON_VERSION}-slim
+
+# System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install Fondant
+ARG FONDANT_VERSION=main
+RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+
diff --git a/scripts/build_base_image.sh b/scripts/build_base_image.sh
new file mode 100755
index 00000000..8ed04fc0
--- /dev/null
+++ b/scripts/build_base_image.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+set -e
+
+function usage {
+  echo "Usage: $0 [options]"
+  echo "Options:"
+  echo " -t, --tag Set the tag (default: latest)"
+  echo " -h, --help Display this help message"
+}
+
+# Parse the arguments
+while [[ "$#" -gt 0 ]]; do case $1 in
+  -t|--tag) tag="$2"; shift;;
+  -h|--help) usage; exit;;
+  *) echo "Unknown parameter passed: $1"; exit 1;;
+esac; shift; done
+
+# Supported Python versions
+python_versions=("3.8" "3.9" "3.10" "3.11")
+
+
+for python_version in "${python_versions[@]}"; do
+  BASENAME=fondant
+  IMAGE_TAG=${tag}-py${python_version}
+  full_image_names=()
+  # Reset per version: otherwise tags of earlier versions are re-applied to this build
+  args=()
+
+  # create repo if not exists
+  aws ecr-public describe-repositories --region us-east-1 --repository-names ${BASENAME} || aws ecr-public create-repository --region us-east-1 --repository-name ${BASENAME}
+  full_image_names+=("public.ecr.aws/fndnt/${BASENAME}:${IMAGE_TAG}")
+  full_image_names+=("fndnt/${BASENAME}:${IMAGE_TAG}")
+
+  # Add argument for each tag
+  for image_name in "${full_image_names[@]}" ; do
+    args+=(-t "$image_name")
+  done
+
+  for element in "${args[@]}"; do
+    echo "$element"
+  done
+
+  # Build docker images and push to docker hub
+  docker build --push "${args[@]}" \
+    --build-arg="PYTHON_VERSION=${python_version}" \
+    --build-arg="FONDANT_VERSION=${tag}" \
+    -f "images/Dockerfile" \
+    .
+done
diff --git a/src/fondant/pipeline/lightweight_component.py b/src/fondant/pipeline/lightweight_component.py
index afeed47e..c45079b8 100644
--- a/src/fondant/pipeline/lightweight_component.py
+++ b/src/fondant/pipeline/lightweight_component.py
@@ -1,23 +1,60 @@
 import inspect
 import itertools
+import logging
+import sys
 import textwrap
 import typing as t
 from dataclasses import asdict, dataclass
 from functools import wraps
+from importlib.metadata import version
 
 from fondant.component import BaseComponent, Component
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+MIN_PYTHON_VERSION = (3, 8)
+MAX_PYTHON_VERSION = (3, 11)
+
 
 @dataclass
 class Image:
-    base_image: str = "fondant:latest"
+    base_image: str
     extra_requires: t.Optional[t.List[str]] = None
     script: t.Optional[str] = None
 
     def __post_init__(self):
         if self.base_image is None:
-            # TODO: link to Fondant version
-            self.base_image = "fondant:latest"
+            self.base_image = self.resolve_fndnt_base_image()
+
+        # log info when custom image without Fondant is defined
+        elif self.extra_requires and not any(
+            dependency.startswith("fondant") for dependency in self.extra_requires
+        ):
+            msg = (
+                "You are not using a Fondant default base image, and Fondant is not part of "
+                "your extra requirements. Please make sure that you have installed fondant "
+                "inside your container. Alternatively, you should add Fondant to "
+                "the extra requirements. \n"
+                "E.g. \n"
+                '@lightweight_component(..., extra_requires=["fondant"])'
+            )
+
+            logger.warning(msg)
+
+    @staticmethod
+    def resolve_fndnt_base_image():
+        """Resolve the correct fndnt base image using python version and fondant version."""
+        # Set python version to latest supported version
+        python_version = sys.version_info
+        if MIN_PYTHON_VERSION <= python_version < MAX_PYTHON_VERSION:
+            python_version = f"{python_version.major}.{python_version.minor}"
+        else:
+            python_version = f"{MAX_PYTHON_VERSION[0]}.{MAX_PYTHON_VERSION[1]}"
+
+        fondant_version = version("fondant")
+        basename = "fndnt/fondant"
+        return f"{basename}:{fondant_version}-py{python_version}"
 
     def to_dict(self):
         return asdict(self)
diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py
index 1b49e76c..9309d5dd 100644
--- a/tests/pipeline/test_pipeline.py
+++ b/tests/pipeline/test_pipeline.py
@@ -1,5 +1,7 @@
 """Fondant pipelines test."""
 import copy
+import sys
+from importlib.metadata import version
 from pathlib import Path
 
 import dask.dataframe as dd
@@ -83,10 +85,16 @@ def load(self) -> dd.DataFrame:
             )
             return dd.from_pandas(df, npartitions=1)
 
+    basename = "fndnt/fondant"
+    fondant_version = version("fondant")
+    python_version = sys.version_info
+    python_version = f"{python_version.major}.{python_version.minor}"
+    fondant_image_name = f"{basename}:{fondant_version}-py{python_version}"
+
     component = ComponentOp.from_ref(Foo, produces={"bar": pa.string()})
     assert component.component_spec._specification == {
         "name": "Foo",
-        "image": "fondant:latest",
+        "image": fondant_image_name,
         "description": "python component",
         "consumes": {"additionalProperties": True},
         "produces": {"additionalProperties": True},
diff --git a/tests/pipeline/test_python_component.py b/tests/pipeline/test_python_component.py
index 502ad71f..40b7eb3a 100644
--- a/tests/pipeline/test_python_component.py
+++ b/tests/pipeline/test_python_component.py
@@ -1,6 +1,8 @@
 import json
 import re
+import sys
 import textwrap
+from importlib.metadata import version
 
 import dask.dataframe as dd
 import pandas as pd
@@ -12,6 +14,15 @@
 from fondant.pipeline.compiler import DockerCompiler
 
 
+@pytest.fixture()
+def default_fondant_image():
+    basename = "fndnt/fondant"
+    fondant_version = version("fondant")
+    python_version = sys.version_info
+    python_version = f"{python_version.major}.{python_version.minor}"
+    return f"{basename}:{fondant_version}-py{python_version}"
+
+
 def test_build_python_script():
     @lightweight_component()
     class CreateData(DaskLoadComponent):
@@ -51,7 +62,7 @@ def load(self) -> dd.DataFrame:
     )
 
 
-def test_lightweight_component_sdk():
+def test_lightweight_component_sdk(default_fondant_image, caplog):
     pipeline = Pipeline(
         name="dummy-pipeline",
         base_path="./data",
@@ -93,6 +104,18 @@ def load(self) -> dd.DataFrame:
         "produces": {"x": {"type": "int32"}, "y": {"type": "int32"}},
     }
 
+    # check warning: fondant is not part of the requirements
+    msg = (
+        "You are not using a Fondant default base image, and Fondant is not part of "
+        "your extra requirements. Please make sure that you have installed fondant "
+        "inside your container. Alternatively, you should add Fondant to "
+        "the extra requirements. \n"
+        "E.g. \n"
+        '@lightweight_component(..., extra_requires=["fondant"])'
+    )
+
+    assert any(msg in record.message for record in caplog.records)
+
     @lightweight_component()
     class AddN(PandasTransformComponent):
         def __init__(self, n: int, **kwargs):
@@ -110,11 +133,13 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
     )
     assert len(pipeline._graph.keys()) == 1 + 1
     assert pipeline._graph["AddN"]["dependencies"] == ["CreateData"]
+    pipeline._graph["AddN"]["operation"].operation_spec.to_json()
+
     operation_spec_dict = pipeline._graph["AddN"]["operation"].operation_spec.to_dict()
     assert operation_spec_dict == {
         "specification": {
             "name": "AddN",
-            "image": "fondant:latest",
+            "image": default_fondant_image,
             "description": "python component",
             "consumes": {"additionalProperties": True},
             "produces": {"additionalProperties": True},
@@ -160,7 +185,30 @@ def load(self) -> dd.DataFrame:
             )
             return dd.from_pandas(df, npartitions=1)
 
-    CreateData(produces={}, consumes={})
+    pipeline = Pipeline(
+        name="dummy-pipeline",
+        base_path="./data",
+    )
+
+    pipeline.read(
+        ref=CreateData,
+    )
+
+    assert len(pipeline._graph.keys()) == 1
+    operation_spec = pipeline._graph["CreateData"]["operation"].operation_spec.to_json()
+    operation_spec_without_image = json.loads(operation_spec)
+
+    assert operation_spec_without_image == {
+        "specification": {
+            "name": "CreateData",
+            "image": "python:3.8-slim-buster",
+            "description": "python component",
+            "consumes": {"additionalProperties": True},
+            "produces": {"additionalProperties": True},
+        },
+        "consumes": {},
+        "produces": {},
+    }
 
 
 def test_invalid_load_component():
@@ -220,7 +268,7 @@ def load(self) -> int:
         CreateData(produces={}, consumes={})
 
 
-def test_lightweight_component_decorator_without_parentheses():
+def test_lightweight_component_decorator_without_parentheses(default_fondant_image):
     @lightweight_component
     class CreateData(DaskLoadComponent):
         def load(self) -> dd.DataFrame:
@@ -237,10 +285,12 @@ def load(self) -> dd.DataFrame:
 
     assert len(pipeline._graph.keys()) == 1
     operation_spec = pipeline._graph["CreateData"]["operation"].operation_spec.to_json()
-    assert json.loads(operation_spec) == {
+    operation_spec_without_image = json.loads(operation_spec)
+
+    assert operation_spec_without_image == {
         "specification": {
             "name": "CreateData",
-            "image": "fondant:latest",
+            "image": default_fondant_image,
             "description": "python component",
             "consumes": {"additionalProperties": True},
             "produces": {"additionalProperties": True},