Introduce repartitioning #309
=== File 1 ===

@@ -17,6 +17,7 @@
 generate_prompts_op = ComponentOp(
     component_dir="components/generate_prompts",
     arguments={"n_rows_to_load": None},
+    output_partition_size="disable",
 )
 laion_retrieval_op = ComponentOp.from_registry(
     name="prompt_based_laion_retrieval",
@@ -72,8 +73,8 @@
 pipeline = Pipeline(pipeline_name=pipeline_name, base_path=PipelineConfigs.BASE_PATH)

 pipeline.add_op(generate_prompts_op)
-pipeline.add_op(laion_retrieval_op, dependencies=generate_prompts_op)
-pipeline.add_op(download_images_op, dependencies=laion_retrieval_op)
-pipeline.add_op(caption_images_op, dependencies=download_images_op)
-pipeline.add_op(segment_images_op, dependencies=caption_images_op)
-pipeline.add_op(write_to_hub_controlnet, dependencies=segment_images_op)
+# pipeline.add_op(laion_retrieval_op, dependencies=generate_prompts_op)
+# pipeline.add_op(download_images_op, dependencies=laion_retrieval_op)
+# pipeline.add_op(caption_images_op, dependencies=download_images_op)
+# pipeline.add_op(segment_images_op, dependencies=caption_images_op)
+# pipeline.add_op(write_to_hub_controlnet, dependencies=segment_images_op)

[Review comment] Should be reverted before merging.
=== File 2 ===

@@ -41,6 +41,13 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:
         raise NotImplementedError


+class DaskWriteComponent(BaseComponent):
+    """Component that accepts a Dask DataFrame and writes its contents."""
+
+    def write(self, dataframe: dd.DataFrame) -> None:
+        raise NotImplementedError
+
+
 class PandasTransformComponent(BaseComponent):
     """Component that transforms the incoming dataset partition per partition as a pandas
     DataFrame.

[Review comment on lines +44 to +50] This was just moved in the file? I think both orders can be logical (Dask -> Pandas) or (Read -> Transform -> Write).

@@ -57,12 +64,5 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         raise NotImplementedError


-class DaskWriteComponent(BaseComponent):
-    """Component that accepts a Dask DataFrame and writes its contents."""
-
-    def write(self, dataframe: dd.DataFrame) -> None:
-        raise NotImplementedError
-
-
 Component = t.TypeVar("Component", bound=BaseComponent)
 """Component type which can represents any of the subclasses of BaseComponent"""
=== File 3 ===

@@ -256,6 +256,13 @@ def from_fondant_component_spec(
             "type": "JsonObject",
             "default": "None",
         },
+        {
+            "name": "output_partition_size",
+            "description": "The size of the output partition size, defaults"
+            " to 250MB. Set to `disable` to disable the automatic partitioning",
+            "type": "String",
+            "default": "250MB",
+        },
         *(
             {
                 "name": arg.name,

[Review comment] I don't think it's the output partitioning we need to make dynamic, as this will only impact the following component. I think the user should be able to overwrite the input partitioning, so the partitions can be made small at the start and still fit in memory when the data grows.

[Review comment] And I think it would be ideal if the user could specify it in rows instead of MB, but not sure if that's possible.
@@ -285,6 +292,8 @@ def from_fondant_component_spec(
             {"inputValue": "metadata"},
             "--component_spec",
             {"inputValue": "component_spec"},
+            "--output_partition_size",
+            {"inputValue": "output_partition_size"},
             *cls._dump_args(fondant_component.args.values()),
             "--output_manifest_path",
             {"outputPath": "output_manifest_path"},
=== File 4 ===

@@ -1,4 +1,5 @@
 import logging
+import os
 import typing as t

 import dask.dataframe as dd

@@ -17,6 +18,31 @@ def __init__(self, *, manifest: Manifest, component_spec: ComponentSpec) -> None


+class DaskDataLoader(DataIO):
+    def __init__(self, *, manifest: Manifest, component_spec: ComponentSpec):
+        super().__init__(manifest=manifest, component_spec=component_spec)

[Review comment] No need to overwrite this if you're just calling super.

+    @staticmethod
+    def partition_loaded_dataframe(dataframe: dd.DataFrame) -> dd.DataFrame:
+        """
+        Function that partitions the loaded dataframe depending on its partitions and the available
+        workers
+        Returns:
+            The partitioned dataframe.
+        """
+        n_partitions = dataframe.npartitions
+        n_workers = os.cpu_count()
+        logger.info(
+            f"The number of partitions of the input dataframe is {n_partitions}. The "
+            f"available number of workers is {n_workers}.",
+        )
+        if n_partitions < n_workers:

[Review comment] This would then become: (suggested change not captured)

+            dataframe = dataframe.repartition(npartitions=n_workers)
+            logger.info(
+                "Repartitioning the data before transforming to maximize worker usage",
+            )
+
+        return dataframe
+
     def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame:
         """
         Function that loads a subset from the manifest as a Dask dataframe.
@@ -80,12 +106,37 @@ def load_dataframe(self) -> dd.DataFrame:
             how="left",
         )

+        dataframe = self.partition_loaded_dataframe(dataframe)
+
         logging.info(f"Columns of dataframe: {list(dataframe.columns)}")

         return dataframe


 class DaskDataWriter(DataIO):
+    def __init__(
+        self,
+        *,
+        manifest: Manifest,
+        component_spec: ComponentSpec,
+        output_partition_size: t.Optional[str] = None,
+    ):
+        super().__init__(manifest=manifest, component_spec=component_spec)
+        self.output_partition_size = output_partition_size
+
+    def partition_written_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame:
+        """
+        Function that partitions the written dataframe to smaller partitions based on a given
+        partition size.
+        """
+        if self.output_partition_size and self.output_partition_size != "disable":
+            dataframe = dataframe.repartition(partition_size=self.output_partition_size)
+            logger.info(
+                f"Repartitioning the written data such that the size per partition is approx."
+                f" {self.output_partition_size}",
+            )
+        return dataframe
+
     def write_dataframe(self, dataframe: dd.DataFrame) -> None:
         write_tasks = []
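For reference, the Dask call that partition_written_dataframe relies on behaves as follows; the target size is an approximate goal, not a hard limit:

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"x": range(1_000_000)}), npartitions=1)

    # Repartition so each output partition is roughly 250MB; Dask estimates
    # partition sizes to do this, which triggers some computation.
    ddf = ddf.repartition(partition_size="250MB")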
@@ -159,6 +210,8 @@ def _write_subset(

         schema = {field.name: field.type.value for field in subset_spec.fields.values()}

+        dataframe = self.partition_written_dataframe(dataframe)
+
         return self._create_write_task(dataframe, location=location, schema=schema)

     @staticmethod

[Review comment] Why is this done on a subset level here? I would do it once on the dataframe level.
=== File 5 ===

@@ -39,13 +39,15 @@ def __init__(
         input_manifest_path: t.Union[str, Path],
         output_manifest_path: t.Union[str, Path],
         metadata: t.Dict[str, t.Any],
-        user_arguments: t.Dict[str, Argument],
+        user_arguments: t.Dict[str, t.Any],
+        output_partition_size: t.Optional[str] = "250MB",
     ) -> None:
         self.spec = spec
         self.input_manifest_path = input_manifest_path
         self.output_manifest_path = output_manifest_path
         self.metadata = metadata
         self.user_arguments = user_arguments
+        self.output_partition_size = output_partition_size

     @classmethod
     def from_file(

[Review comment] Shouldn't this be added to the parser as well, so it's extracted by …
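Regarding the parser question above, a minimal sketch of how such a flag could be registered with argparse (hypothetical; the executor's actual parser setup is not shown in this diff):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output_partition_size",
        type=str,
        default="250MB",
        help="Target size per written partition, or 'disable'.",
    )

    args = parser.parse_args(["--output_partition_size", "disable"])
    print(args.output_partition_size)  # disable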
@@ -85,7 +87,7 @@ def from_spec(cls, component_spec: ComponentSpec) -> "Executor":
         input_manifest_path = args_dict.pop("input_manifest_path")
         output_manifest_path = args_dict.pop("output_manifest_path")
         metadata = args_dict.pop("metadata")
-
+        output_partition_size = args_dict.pop("output_partition_size")
         metadata = json.loads(metadata) if metadata else {}

         return cls(

@@ -94,6 +96,7 @@ def from_spec(cls, component_spec: ComponentSpec) -> "Executor":
             output_manifest_path=output_manifest_path,
             metadata=metadata,
             user_arguments=args_dict,
+            output_partition_size=output_partition_size,
         )

     @classmethod
@@ -166,7 +169,12 @@ def _execute_component(

     def _write_data(self, dataframe: dd.DataFrame, *, manifest: Manifest):
         """Create a data writer given a manifest and writes out the index and subsets."""
-        data_writer = DaskDataWriter(manifest=manifest, component_spec=self.spec)
+        data_writer = DaskDataWriter(
+            manifest=manifest,
+            component_spec=self.spec,
+            output_partition_size=self.output_partition_size,
+        )

         data_writer.write_dataframe(dataframe)

     def execute(self, component_cls: t.Type[Component]) -> None:
=== File 6 ===

@@ -12,7 +12,7 @@
 from importlib_resources import files  # type: ignore

 from fondant.component_spec import ComponentSpec
-from fondant.exceptions import InvalidPipelineDefinition
+from fondant.exceptions import InvalidComponentOpDefinition, InvalidPipelineDefinition
 from fondant.import_utils import is_kfp_available
 from fondant.manifest import Manifest

@@ -32,6 +32,8 @@ class ComponentOp:
     Arguments:
         component_dir: The path to the component directory.
         arguments: A dictionary containing the argument name and value for the operation.
+        output_partition_size: the size of the output written dataset. Defaults to 250MB,
+            set to "disable" to disable automatic repartitioning of the output,
         number_of_gpus: The number of gpus to assign to the operation
         node_pool_name: The name of the node pool to which the operation will be assigned.
         p_volumes: Collection of persistent volumes in a Kubernetes cluster. Keys are mount paths,
@@ -57,13 +59,15 @@ def __init__(
         component_dir: t.Union[str, Path],
         *,
         arguments: t.Optional[t.Dict[str, t.Any]] = None,
+        output_partition_size: t.Optional[str] = "250MB",
         number_of_gpus: t.Optional[int] = None,
         node_pool_name: t.Optional[str] = None,
         p_volumes: t.Optional[t.Dict[str, k8s_client.V1Volume]] = None,
         ephemeral_storage_size: t.Optional[str] = None,
     ) -> None:
         self.component_dir = Path(component_dir)
-        self.arguments = arguments or {}
+        self.output_partitioning_size = output_partition_size
+        self.arguments = self._set_arguments(arguments)

         self.component_spec = ComponentSpec.from_file(
             self.component_dir / self.COMPONENT_SPEC_NAME,
@@ -75,6 +79,38 @@ def __init__(
         self.p_volumes = p_volumes
         self.ephemeral_storage_size = ephemeral_storage_size

+    def _set_arguments(
+        self,
+        arguments: t.Optional[t.Dict[str, t.Any]],
+    ) -> t.Dict[str, t.Any]:
+        """Set component arguments based on provided arguments and relevant ComponentOp
+        parameters.
+        """
+
+        def _validate_partition_size_arg(file_size):

[Review comment] Can we add this as a …

+            # Define the regular expression pattern to match file size notations: KB, MB, GB or TB
+            pattern = r"^(?:\d+(?:\.\d+)?(?:KB|MB|GB|TB)|disable)$"
+
+            # Use the re.match() function to check if the provided file_size matches the pattern
+            return bool(re.match(pattern, file_size, re.I))
+
+        arguments = arguments or {}
+
+        if self.output_partitioning_size is not None:
+            if not _validate_partition_size_arg(
+                file_size=str(self.output_partitioning_size),
+            ):
+                msg = (
+                    f"Invalid partition size defined `{self.output_partitioning_size}`,"
+                    " partition size must be a string followed by a file size notation"
+                    " e.g. ('250MB') or 'disable' to disable the automatic partitioning"
+                )
+                raise InvalidComponentOpDefinition(msg)
+
+            arguments["output_partition_size"] = self.output_partitioning_size
+
+        return arguments
+
     @property
     def dockerfile_path(self) -> t.Optional[Path]:
         path = self.component_dir / "Dockerfile"