ml6team · RobbeSneyders · Aug 24, 2023 · Aug 23, 2023 · Aug 23, 2023 · Aug 23, 2023
diff --git a/examples/pipelines/datacomp/components/clean_captions/Dockerfile b/examples/pipelines/datacomp/components/clean_captions/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+## System dependencies
+# git is needed because Fondant is pip-installed from its GitHub repository below.
+# The apt package lists are deleted in the same layer so they do not end up in the image.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y && \
+    rm -rf /var/lib/apt/lists/*
+
+# install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Install Fondant
+# This is split from other requirements to leverage caching
+# FONDANT_VERSION is a git ref (branch, tag or commit); override it with --build-arg.
+ARG FONDANT_VERSION=main
+# The requirement is quoted so the shell never treats "[aws,azure,gcp]" as a glob pattern.
+RUN pip3 install --no-cache-dir "fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}"
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+
+# Runs the component; "main" refers to the main.py copied into the working directory.
+ENTRYPOINT ["fondant", "execute", "main"]
diff --git a/examples/pipelines/datacomp/components/clean_captions/fondant_component.yaml b/examples/pipelines/datacomp/components/clean_captions/fondant_component.yaml
@@ -0,0 +1,15 @@
+# Component specification for the caption-cleaning component.
+name: Filter Caption
+description: Component that filters out bad captions (Empty captions, Captions with weird characters, Captions that are dates)
+# NOTE(review): the image is called "filter_captions" while the component directory is
+# "clean_captions" - confirm this is intended.
+image: ghcr.io/ml6team/filter_captions:dev
+
+# Input read by the component; src/main.py accesses it as dataframe["text"]["data"].
+consumes:
+  text:
+    fields:
+      data:
+        type: string
+
+# Same subset and field as the input: the component only drops rows.
+produces:
+  text:
+    fields:
+      data:
+        type: string
diff --git a/examples/pipelines/datacomp/components/clean_captions/requirements.txt b/examples/pipelines/datacomp/components/clean_captions/requirements.txt
diff --git a/examples/pipelines/datacomp/components/clean_captions/src/main.py b/examples/pipelines/datacomp/components/clean_captions/src/main.py
@@ -0,0 +1,65 @@
+import logging
+
+import pandas as pd
+
+from fondant.component import PandasTransformComponent
+from dateutil.parser import parse
+
+logger = logging.getLogger(__name__)
+
+
+def isNonEnglish(s):
+    """Tell whether `s` contains at least one non-ASCII character.
+
+    Despite its name this is purely a character-set check: the string is
+    encoded to UTF-8 and the bytes are then decoded as ASCII.
+
+    Returns True when the ASCII decoding fails, False otherwise (including
+    for the empty string).
+    """
+    utf8_bytes = s.encode(encoding="utf-8")
+    try:
+        utf8_bytes.decode("ascii")
+    except UnicodeDecodeError:
+        # A byte outside the ASCII range was found.
+        return True
+    return False
+
+def get_num_nonenglish_characters(text):
+    """Count the characters of `text` that `isNonEnglish` flags."""
+    count = 0
+    for char in text:
+        if isNonEnglish(char):
+            count += 1
+    return count
+
+def has_too_much_weird_characters(text, max_ratio=0.5):
+    """Tell whether the share of flagged characters in `text` exceeds `max_ratio`.
+
+    Args:
+        text: the caption to inspect.
+        max_ratio: highest tolerated ratio of characters flagged by
+            `get_num_nonenglish_characters` to the total length of `text`.
+
+    Returns:
+        True when the ratio is strictly greater than `max_ratio`. An empty
+        `text` has no characters at all, so it returns False.
+    """
+    if not text:
+        # Avoid dividing by a length of zero.
+        return False
+    return (get_num_nonenglish_characters(text) / len(text)) > max_ratio
+
+def is_valid_date(date_string):
+    """Tell whether dateutil's `parse` accepts `date_string`.
+
+    Returns False when `parse` raises ValueError or OverflowError, and True
+    when it completes without raising. Any other exception propagates.
+    """
+    try:
+        parse(date_string)
+    except (ValueError, OverflowError):
+        return False
+    else:
+        return True
+
+def is_empty(text):
+    """Return True when `text` is empty or consists only of whitespace."""
+    # The comparison used to be `!=`, which reported non-empty text as empty.
+    return text.strip() == ""
+
+
+class FilterTextComplexity(PandasTransformComponent):
+    """Component that filters out bad captions in image-text pairs:
+    - Empty captions
+    - Captions with weird characters
+    - Captions that are dates
+    """
+
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        """Accept and ignore any arguments; this component has no configuration."""
+
+    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        """Drop the rows whose caption is empty, mostly weird characters, or a date.
+
+        The caption is read from dataframe["text"]["data"]. The filters run one
+        after the other, each on the rows that survived the previous one, so a
+        later check never sees a caption that was already rejected.
+
+        Args:
+            dataframe: the data to filter.
+
+        Returns:
+            The rows of `dataframe` that passed all three filters.
+        """
+        # (log message, predicate that is True for a caption to drop)
+        filters = (
+            ("Filtering on empty captions...", is_empty),
+            ("Filtering on weird character captions...", has_too_much_weird_characters),
+            ("Filtering on captions that look like dates...", is_valid_date),
+        )
+
+        for message, is_bad in filters:
+            logger.info(message)
+            # Re-read the captions so the mask index always matches `dataframe`.
+            texts = dataframe["text"]["data"]
+            # An explicit bool dtype keeps an empty mask a boolean row indexer;
+            # an empty object-dtype key would be treated as a column selection.
+            keep = pd.Series(
+                [not is_bad(text) for text in texts],
+                index=texts.index,
+                dtype=bool,
+            )
+            dataframe = dataframe[keep]
+
+        return dataframe
+
diff --git a/examples/pipelines/datacomp/components/filter_clip_score/Dockerfile b/examples/pipelines/datacomp/components/filter_clip_score/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+## System dependencies
+# git is needed because Fondant is pip-installed from its GitHub repository below.
+# The apt package lists are deleted in the same layer so they do not end up in the image.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y && \
+    rm -rf /var/lib/apt/lists/*
+
+# install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Install Fondant
+# This is split from other requirements to leverage caching
+# FONDANT_VERSION is a git ref (branch, tag or commit); override it with --build-arg.
+ARG FONDANT_VERSION=main
+# The requirement is quoted so the shell never treats "[aws,azure,gcp]" as a glob pattern.
+RUN pip3 install --no-cache-dir "fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}"
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+
+# Runs the component; "main" refers to the main.py copied into the working directory.
+ENTRYPOINT ["fondant", "execute", "main"]
diff --git a/examples/pipelines/datacomp/components/filter_clip_score/fondant_component.yaml b/examples/pipelines/datacomp/components/filter_clip_score/fondant_component.yaml
@@ -0,0 +1,20 @@
+# Component specification for the clip-score filtering component.
+name: Filter CLIP score
+description: Component that filters out image-text pairs with a low CLIP similarity score
+image: ghcr.io/ml6team/filter_clip_score:dev
+
+# Input read by the component; src/main.py accesses it as
+# dataframe["image_text"]["clip_l14_similarity_score"].
+consumes:
+  image_text:
+    fields:
+      clip_l14_similarity_score:
+        type: float32
+
+# Same subset and field as the input: the component only drops rows.
+produces:
+  image_text:
+    fields:
+      clip_l14_similarity_score:
+        type: float32
+
+args:
+  # The key must match the `pct_threshold` parameter of the component's __init__ in src/main.py.
+  pct_threshold:
+    type: float
+    description: "Fraction (between 0 and 1) of the highest-scoring image-text pairs to keep"
diff --git a/examples/pipelines/datacomp/components/filter_clip_score/requirements.txt b/examples/pipelines/datacomp/components/filter_clip_score/requirements.txt
diff --git a/examples/pipelines/datacomp/components/filter_clip_score/src/main.py b/examples/pipelines/datacomp/components/filter_clip_score/src/main.py
@@ -0,0 +1,32 @@
+import logging
+import pandas as pd
+from fondant.component import PandasTransformComponent
+from fondant.executor import PandasTransformExecutor
+
+logger = logging.getLogger(__name__)
+
+class FilterTextComplexity(PandasTransformComponent):
+    """
+    Component that filters rows based on clip scores
+
+    The scores are ranked from high to low; the score found at position
+    int(number_of_rows * pct_threshold) of that ranking is the cut-off, and only
+    rows scoring strictly above it are kept.
+
+    NOTE(review): the class name looks copied from the text-complexity component;
+    it is left unchanged here so existing references keep working.
+    """
+
+    def __init__(self, pct_threshold: float, *args, **kwargs):
+        """Store the threshold; any extra arguments are accepted and ignored.
+
+        Args:
+            pct_threshold: fraction of the ranked rows that determines the cut-off
+                position (0 keeps no rows, 1 or more keeps every row).
+        """
+        self.pct_threshold = pct_threshold
+
+    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        """Keep the rows whose clip score lies strictly above the cut-off score.
+
+        Args:
+            dataframe: data holding the score in
+                dataframe["image_text"]["clip_l14_similarity_score"].
+
+        Returns:
+            The filtered rows. An empty input is returned unchanged.
+        """
+        logger.info("Filtering on clip scores...")
+        num_rows = len(dataframe)
+        logger.info(f"Initial length: {num_rows}")
+
+        if num_rows == 0:
+            # Nothing to rank; also avoids dividing by zero in the final log line.
+            logger.info("Final length: 0")
+            return dataframe
+
+        clip_scores = dataframe["image_text"]["clip_l14_similarity_score"]
+        # Clamp at 0 so a negative threshold cannot wrap around to the end of the ranking.
+        threshold_idx = max(int(num_rows * self.pct_threshold), 0)
+
+        if threshold_idx >= num_rows:
+            # The cut-off position lies past the last row: every row is kept
+            # (indexing the ranking here used to raise an IndexError).
+            logger.info("Clip score Threshold: none (all rows kept)")
+            filtered_dataframe = dataframe
+        else:
+            sorted_clip_scores = clip_scores.sort_values(ascending=False)
+            threshold = sorted_clip_scores.iloc[threshold_idx]
+            logger.info(f"Clip score Threshold: {threshold}")
+            filtered_dataframe = dataframe[clip_scores > threshold]
+
+        logger.info(f"Final length: {len(filtered_dataframe)} ({len(filtered_dataframe) / num_rows:.2f})")
+
+        return filtered_dataframe
+
+
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
@@ -68,6 +68,7 @@
 pipeline.add_op(load_from_hub_op)
 pipeline.add_op(filter_complexity_op, dependencies=download_images_op)
 pipeline.add_op(download_images_op, dependencies=load_from_hub_op)
+
 # TODO add more ops