Generic read write component #214

Merged: 48 commits, Jun 21, 2023. Changes shown are from 42 of the 48 commits.

Commits
f7e5dcb
add writer component
PhilippeMoussalli Jun 9, 2023
654a5a6
Merge branch 'main' into write-component-class
PhilippeMoussalli Jun 12, 2023
17c5c7e
modify component spec schema to accept default arguments
PhilippeMoussalli Jun 12, 2023
a4e44d2
enable adding default arguments
PhilippeMoussalli Jun 12, 2023
6d2d1ab
test adding default arguments
PhilippeMoussalli Jun 12, 2023
af597f8
fix mypy issue
PhilippeMoussalli Jun 12, 2023
c90b53b
correct docs
PhilippeMoussalli Jun 12, 2023
a8e55d8
update component spec
PhilippeMoussalli Jun 12, 2023
00dfa55
update docs
PhilippeMoussalli Jun 12, 2023
0fa694f
add optional field to schema
PhilippeMoussalli Jun 13, 2023
c9f4429
enable defining default arguments
PhilippeMoussalli Jun 13, 2023
5ee3b31
add relevant tests
PhilippeMoussalli Jun 13, 2023
919c21f
Merge branch 'main' into enable-optional-component-arguments
PhilippeMoussalli Jun 13, 2023
66fdb4c
add test file
PhilippeMoussalli Jun 13, 2023
61adc40
bugfix string bool
PhilippeMoussalli Jun 14, 2023
88b4729
change method of defining optionals
PhilippeMoussalli Jun 14, 2023
64614f7
make component spec optional
PhilippeMoussalli Jun 14, 2023
218b8ce
Merge branch 'main' into enable-optional-component-arguments
PhilippeMoussalli Jun 15, 2023
7c7ace7
implement PR feedback
PhilippeMoussalli Jun 15, 2023
3d5b42a
make load component generic
PhilippeMoussalli Jun 15, 2023
7d88861
make load component generic
PhilippeMoussalli Jun 15, 2023
e2445d1
Merge branch 'main' into generic_read_write_component
PhilippeMoussalli Jun 15, 2023
8b5a749
Merge branch 'main' into generic_read_write_component
PhilippeMoussalli Jun 16, 2023
eaa2dfa
Add build for local components
GeorgesLorre Jun 14, 2023
86f89b4
Update starcoder example to use the docker compiler
GeorgesLorre Jun 16, 2023
d160e34
Fix build script and run pipeline locally
GeorgesLorre Jun 16, 2023
49d80a6
Update tests
GeorgesLorre Jun 19, 2023
402866f
Fix bad merge
GeorgesLorre Jun 19, 2023
4a00829
enable passing custom spec path for reusable components
PhilippeMoussalli Jun 19, 2023
3579cef
Add manifest per component
GeorgesLorre Jun 19, 2023
c56a9ed
implement changes to starcoder pipeline
PhilippeMoussalli Jun 19, 2023
7c6d7a4
implement changes to controlnet pipeline
PhilippeMoussalli Jun 19, 2023
c741116
implement change to SD pipeline
PhilippeMoussalli Jun 19, 2023
e21755e
add missing test file
PhilippeMoussalli Jun 19, 2023
c97b405
Merge branch 'main' into generic_read_write_component
PhilippeMoussalli Jun 19, 2023
a2c8717
Merge branch 'feature/local-starcoder' into generic_read_write_component
PhilippeMoussalli Jun 19, 2023
421ef3e
modify compiler to handle lists and dict
PhilippeMoussalli Jun 19, 2023
c21f0ea
add datasets to write to hub req
PhilippeMoussalli Jun 19, 2023
94ecd32
Test pipeline on local runner and add options to load few rows
PhilippeMoussalli Jun 19, 2023
e731744
Test pipeline on local runner and add options to load few rows
PhilippeMoussalli Jun 19, 2023
2120a5c
modify composable component template
PhilippeMoussalli Jun 19, 2023
c1ec93c
add image resolution extraction component
PhilippeMoussalli Jun 20, 2023
316eff3
implement PR feedback
PhilippeMoussalli Jun 20, 2023
a47f42c
add documentation on generic components
PhilippeMoussalli Jun 20, 2023
2184ef6
typo
PhilippeMoussalli Jun 20, 2023
177bed7
unpin fixed version
PhilippeMoussalli Jun 20, 2023
543d673
Merge branch 'main' into generic_read_write_component
PhilippeMoussalli Jun 20, 2023
4ec35b4
address PR feedback
PhilippeMoussalli Jun 20, 2023
6 changes: 3 additions & 3 deletions components/filter_comments/src/main.py
@@ -7,14 +7,14 @@
 import dask.dataframe as dd
 from utils.text_extraction import get_comments_to_code_ratio

-from fondant.component import TransformComponent
+from fondant.component import DaskTransformComponent
 from fondant.logger import configure_logging

 configure_logging()
 logger = logging.getLogger(__name__)


-class FilterCommentsComponent(TransformComponent):
+class FilterCommentsComponent(DaskTransformComponent):
     """Component that filters instances based on code to comments ratio."""

     def transform(
@@ -46,4 +46,4 @@ def transform(

 if __name__ == "__main__":
     component = FilterCommentsComponent.from_args()
-    component.run()
+    component.run()
6 changes: 3 additions & 3 deletions components/filter_line_length/src/main.py
@@ -3,14 +3,14 @@

 import dask.dataframe as dd

-from fondant.component import TransformComponent
+from fondant.component import DaskTransformComponent
 from fondant.logger import configure_logging

 configure_logging()
 logger = logging.getLogger(__name__)


-class FilterLineLengthComponent(TransformComponent):
+class FilterLineLengthComponent(DaskTransformComponent):
     """
     This component filters code based on a set of metadata associated with it:
     average line length, maximum line length and alphanum fraction.
@@ -44,4 +44,4 @@ def transform(

 if __name__ == "__main__":
     component = FilterLineLengthComponent.from_args()
-    component.run()
+    component.run()
@@ -9,7 +9,7 @@ RUN apt-get update && \
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
# Set the working directory to the compoent folder
WORKDIR /component/src

# Copy over src-files
19 changes: 19 additions & 0 deletions components/image_resolution_extraction/fondant_component.yaml
@@ -0,0 +1,19 @@
name: Image resolution extraction
description: Component that extracts image resolution data from the images
image: ghcr.io/ml6team/image_resolution_extraction:latest

consumes:
  images:
    fields:
      data:
        type: binary

produces:
  images:
    fields:
      width:
        type: int16
      height:
        type: int16
      data:
        type: binary
Comment on lines +13 to +19

Contributor:
For components that only add columns, is there a need to specify existing columns in the produces section?

cc @RobbeSneyders

Member:
In theory not, but I'm not sure if it works in practice already if you leave them out.

4 changes: 4 additions & 0 deletions components/image_resolution_extraction/requirements.txt
@@ -0,0 +1,4 @@
fondant
pyarrow>=7.0
gcsfs==2023.4.0
imagesize==1.4.1
55 changes: 55 additions & 0 deletions components/image_resolution_extraction/src/main.py
@@ -0,0 +1,55 @@
"""This component filters images of the dataset based on image size (minimum height and width)."""
Contributor:
Docstring to be updated

import io
import logging
import typing as t

import dask.dataframe as dd
import imagesize
import numpy as np

from fondant.component import DaskTransformComponent
from fondant.logger import configure_logging

configure_logging()
logger = logging.getLogger(__name__)


def extract_dimensions(image_df: dd.DataFrame) -> t.Tuple[np.int16, np.int16]:
    """Extract the width and height of an image.

    Args:
        image_df (dd.DataFrame): input dataframe with images_data column

    Returns:
        np.int16: width of the image
        np.int16: height of the image
    """
    width, height = imagesize.get(io.BytesIO(image_df["images_data"]))

    return np.int16(width), np.int16(height)


class ImageResolutionExtractionComponent(DaskTransformComponent):
    """Component that extracts image dimensions."""

    def transform(self, *, dataframe: dd.DataFrame) -> dd.DataFrame:
        """
        Args:
            dataframe: Dask dataframe
        Returns:
            dataset.
        """
        logger.info("Length of the dataframe before filtering: %s", len(dataframe))

        logger.info("Filtering dataset...")

        dataframe[["images_width", "images_height"]] = \
            dataframe[["images_data"]].apply(extract_dimensions,
                                             axis=1, result_type="expand", meta={0: int, 1: int})

        return dataframe


if __name__ == "__main__":
    component = ImageResolutionExtractionComponent.from_args()
    component.run()
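
As a side note, the apply(..., result_type="expand") pattern used above can be tried out at the pandas level. The snippet below is only an illustration and not part of the PR: it builds two tiny PNGs with Pillow so the example is self-contained, and derives the width and height columns the same way the component does. In the component the identical call runs on a Dask dataframe, where the extra meta={0: int, 1: int} hint declares the dtypes of the two resulting columns up front.

# Illustrative only: pandas-level equivalent of the component's expand-apply.
import io

import imagesize
import pandas as pd
from PIL import Image


def make_png(width: int, height: int) -> bytes:
    """Generate a small in-memory PNG so the example needs no external files."""
    buffer = io.BytesIO()
    Image.new("RGB", (width, height)).save(buffer, format="PNG")
    return buffer.getvalue()


def extract_dimensions(row: pd.Series) -> tuple:
    # imagesize only reads the header bytes, so the image is never fully decoded.
    width, height = imagesize.get(io.BytesIO(row["images_data"]))
    return width, height


df = pd.DataFrame({"images_data": [make_png(640, 480), make_png(32, 64)]})

# result_type="expand" turns each returned tuple into two new columns.
df[["images_width", "images_height"]] = df[["images_data"]].apply(
    extract_dimensions, axis=1, result_type="expand",
)
print(df[["images_width", "images_height"]])  # (640, 480) and (32, 64)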
24 changes: 14 additions & 10 deletions components/load_from_hf_hub/fondant_component.yaml
@@ -2,21 +2,25 @@ name: Load from hub
description: Component that loads a dataset from the hub
image: ghcr.io/ml6team/load_from_hf_hub:latest

produces:
images:
consumes:
dummy_variable: #TODO: fill in here
fields:
data:
type: binary
width:
type: int16
height:
type: int16
captions:
fields:
data:
type: string

args:
dataset_name:
description: Name of dataset on the hub
type: str
column_name_mapping:
description: Mapping of the consumed hub dataset to fondant column names
type: dict
image_column_names:
description: A list containing the original hub image column names. Used to format the image
from HF hub format to a byte string
Contributor (@NielsRogge, Jun 20, 2023):
Suggested change:
-    description: A list containing the original hub image column names. Used to format the image
-      from HF hub format to a byte string
+    description: Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string.

type: list
default: None
nb_rows_to_load:
description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
type: int
default: None
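
Because the consumes section and several arguments above are intentionally generic, each pipeline is expected to fill them in for its own dataset. Below is a rough, hypothetical sketch of that wiring; the ComponentOp / Pipeline API and its parameter names are assumptions for illustration and are not shown in this diff, and the dataset name and column mapping are placeholders.

# Hypothetical usage sketch; the pipeline API shown here is assumed, not taken from this PR.
from fondant.pipeline import ComponentOp, Pipeline

pipeline = Pipeline(pipeline_name="example_pipeline", base_path="./artifacts")

load_from_hub_op = ComponentOp(
    component_spec_path="components/load_from_hf_hub/fondant_component.yaml",
    arguments={
        "dataset_name": "some-user/some-dataset",   # placeholder hub dataset
        "column_name_mapping": {                    # hub column -> fondant "{subset}_{field}" column
            "image": "images_data",
            "text": "captions_data",
        },
        "image_column_names": ["image"],            # columns stored in HF image format
        "nb_rows_to_load": 1000,                    # load a small slice for test runs
    },
)

pipeline.add_op(load_from_hub_op)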
63 changes: 26 additions & 37 deletions components/load_from_hf_hub/src/main.py
@@ -1,10 +1,8 @@
"""This component loads a seed dataset from the hub."""
import io
import logging
import typing as t

import dask.dataframe as dd
import numpy as np
from PIL import Image

from fondant.component import LoadComponent
from fondant.logger import configure_logging
@@ -13,52 +11,43 @@
logger = logging.getLogger(__name__)


def extract_width(image_bytes):
# Decode image bytes to PIL Image object
pil_image = Image.open(io.BytesIO(image_bytes))
width = pil_image.size[0]

return np.int16(width)


def extract_height(image_bytes):
# Decode image bytes to PIL Image object
pil_image = Image.open(io.BytesIO(image_bytes))
height = pil_image.size[1]

return np.int16(height)


class LoadFromHubComponent(LoadComponent):
def load(self, *, dataset_name: str) -> dd.DataFrame:
def load(self,
*,
dataset_name: str,
Comment on lines +16 to +17

Member:
Suggested change:
-        *,
-        dataset_name: str,
+        dataset_name: str,
+        *,

The dataset_name is the main argument and the subject to load, so it makes sense to include this as a positional argument.

This makes sense:

    component.load("my_dataset")

This doesn't:

    component.load("my_dataset", {"original_name": "new_name"}, ["original_name"], 5)

column_name_mapping: dict,
image_column_names: t.Optional[list],
nb_rows_to_load: t.Optional[int]) -> dd.DataFrame:
"""
Args:
dataset_name: name of the dataset to load.

column_name_mapping: Mapping of the consumed hub dataset to fondant column names
image_column_names: A list containing the original hub image column names. Used to
format the image from HF hub format to a byte string
nb_rows_to_load: optional argument that defines the number of rows to load. Useful for
testing pipeline runs on a small scale
Returns:
Dataset: HF dataset
Dataset: HF dataset.
"""
# 1) Load data, read as Dask dataframe
logger.info("Loading dataset from the hub...")
dask_df = dd.read_parquet(f"hf://datasets/{dataset_name}")

# 2) Rename columns
dask_df = dask_df.rename(
columns={"image": "images_data", "text": "captions_data"}
)
# 2) Make sure images are bytes instead of dicts
if image_column_names:
Contributor:
Suggested change:
-        if image_column_names:
+        if image_column_names is not None:

This is usually clearer

for image_column_name in image_column_names:
dask_df[image_column_name] = dask_df[image_column_name].map(
lambda x: x["bytes"], meta=("bytes", bytes)
)

# 3) Rename columns
dask_df = dask_df.rename(columns=column_name_mapping)
Contributor:
This doesn't create hierarchical columns right?

Is this necessary given that we now use them?

Member:
No, the columns are still stored as {subset}_{field} in parquet. They are only transformed to hierarchical columns in the Pandas component.


# 3) Make sure images are bytes instead of dicts
dask_df["images_data"] = dask_df["images_data"].map(
lambda x: x["bytes"], meta=("bytes", bytes)
)
# 4) Optional: only return specific amount of rows

# 4) Add width and height columns
dask_df["images_width"] = dask_df["images_data"].map(
extract_width, meta=("images_width", int)
)
dask_df["images_height"] = dask_df["images_data"].map(
extract_height, meta=("images_height", int)
)
Comment on lines -56 to -61

Member:
This is something we could still do for image columns right? Although then it needs to match the provided component spec as well.

Contributor (Author):
I'm not completely sure about that one, it might be that the original dataset has this metadata and we're assuming that the user requires it.

I was thinking about implementing another component that generates image metadata to get around this; it could be based on conditional arguments (e.g. estimate_width: True/False, ...).

Member:
Yes, the downside is that it requires loading the image data into the component. We might be able to do this by only loading the first x bytes, but I'm not sure how this works with different image formats and whether it can be done performantly with parquet.

Contributor (Author):
Found this library: https://github.com/shibukawa/imagesize_py
Tested it out and it seems to work pretty well.

if nb_rows_to_load:
dask_df = dask_df.head(nb_rows_to_load)
dask_df = dd.from_pandas(dask_df, npartitions=1)
Member:
Calling .head() and setting npartitions=1 only makes sense for small nb_rows_to_load, which might not be the case. What if I want to run on 1M rows for testing instead of 5B? 🙂

On the other hand, I'm having trouble finding a good way to do this using Dask without knowing the index labels. What is possible is sampling, or selecting a number of partitions, but this might be less usable for the user.

Contributor (Author):
Hmm, good point. I was considering small scale for end-to-end testing, but this might not always be the case. Maybe best to tackle this separately; created a ticket for it.


return dask_df

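As an aside on the nb_rows_to_load thread above: below is a minimal sketch of the sampling and partition-selection alternatives the reviewer mentions, under the assumption that an approximate row count is acceptable for test runs. This is not what the PR implements (the .head() approach was kept and a follow-up ticket created), and the helper name is illustrative.

import dask.dataframe as dd


def sample_rows(dask_df: dd.DataFrame, nb_rows_to_load: int) -> dd.DataFrame:
    """Approximate row limiting that also behaves sensibly for large nb_rows_to_load."""
    total_rows = len(dask_df)  # triggers a count over the dataset
    if total_rows <= nb_rows_to_load:
        return dask_df
    # Sampling spreads the selection over all partitions instead of collapsing
    # everything into a single pandas partition the way .head() does.
    return dask_df.sample(frac=nb_rows_to_load / total_rows, random_state=42)


# Selecting whole partitions is the other option mentioned above, e.g.
# dask_df.partitions[:n] keeps roughly n * rows-per-partition rows.
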
4 changes: 2 additions & 2 deletions components/pii_redaction/src/main.py
@@ -7,14 +7,14 @@
 from pii_detection import scan_pii
 from pii_redaction import redact_pii

-from fondant.component import TransformComponent
+from fondant.component import DaskTransformComponent
 from fondant.logger import configure_logging

 configure_logging()
 logger = logging.getLogger(__name__)


-class RemovePIIComponent(TransformComponent):
+class RemovePIIComponent(DaskTransformComponent):
     """Component that detects and redacts PII from code."""

     def transform(
Empty file.
28 changes: 28 additions & 0 deletions components/write_to_hf_hub/fondant_component.yaml
@@ -0,0 +1,28 @@
name: Write to hub
description: Component that writes a dataset to the hub
image: ghcr.io/ml6team/write_to_hf_hub:0.1.3

consumes:
  dummy_variable: #TODO: fill in here
    fields:
      data:
        type: binary

args:
  hf_token:
    description: The hugging face token used to write to the hub
    type: str
  username:
    description: The username under which to upload the dataset
    type: str
  dataset_name:
    description: The name of the dataset to upload
    type: str
  image_column_names:
    description: A list containing the image column names. Used to format to image to HF hub format
    type: list
    default: None
  column_name_mapping:
    description: Mapping of the consumed fondant column names to the written hub column names
    type: dict
    default: None
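
The write component's src/main.py is not included in this excerpt. Purely as a sketch of what the arguments above could drive, assuming the datasets and huggingface_hub packages from the component's requirements, a write step might look roughly like the following; the function name, the order of renaming versus image wrapping, and the repo id format are assumptions rather than the PR's actual implementation.

# Hypothetical sketch; not the PR's actual write_to_hf_hub implementation.
import dask.dataframe as dd
import datasets
import huggingface_hub


def write_dataframe_to_hub(dataframe: dd.DataFrame,
                           hf_token: str,
                           username: str,
                           dataset_name: str,
                           image_column_names: list,
                           column_name_mapping: dict) -> None:
    huggingface_hub.login(token=hf_token)

    # Rename fondant "{subset}_{field}" columns to the desired hub column names
    # and collect the result into pandas before handing it to `datasets`.
    pandas_df = dataframe.rename(columns=column_name_mapping or {}).compute()

    # Wrap raw image bytes in the dict layout the HF Image feature expects.
    for column_name in image_column_names or []:
        pandas_df[column_name] = pandas_df[column_name].map(
            lambda value: {"bytes": value, "path": None}
        )

    dataset = datasets.Dataset.from_pandas(pandas_df)
    for column_name in image_column_names or []:
        dataset = dataset.cast_column(column_name, datasets.Image())

    dataset.push_to_hub(f"{username}/{dataset_name}")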
@@ -1,5 +1,6 @@
huggingface_hub==0.14.1
fondant
datasets==2.10.1
fondant==0.1.3
pyarrow>=7.0
Pillow==9.4.0
gcsfs==2023.4.0