[DataComp pipeline] Add first 2 components (#223)
This PR adds the first two components of the DataComp pipeline. It reuses the `load_from_hf_hub` component, but with a different `fondant_component.yaml` file.

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
1 parent: 3e47ef7
Commit: 50a2796
Showing 10 changed files with 311 additions and 3 deletions.
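The reuse mentioned in the description comes down to pulling the generic `load_from_hf_hub` image from the registry while pointing it at a DataComp-specific component spec. A minimal sketch of that pattern, based on the `pipeline.py` added in this commit (the column mapping here is abbreviated for illustration; the full mapping appears in the diff below):

from fondant.pipeline import ComponentOp

# Generic registry component, paired with the DataComp-specific spec added in this commit.
load_from_hub_op = ComponentOp.from_registry(
    name="load_from_hf_hub",
    component_spec_path="components/load_from_hf_hub/fondant_component.yaml",
    arguments={
        "dataset_name": "mlfoundations/datacomp_small",
        "column_name_mapping": {"url": "image_url"},  # abbreviated for illustration
        "n_rows_to_load": 100,
    },
)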
@@ -83,4 +83,7 @@ MANIFEST
*.lock

# kubeflow artifacts
*.tgz

# docker artifacts
examples/pipelines/*/docker-compose.yml
@@ -0,0 +1,57 @@
#!/bin/bash

function usage {
  echo "Usage: $0 [options]"
  echo "Options:"
  echo "  -c, --component <value>  Set the component name. Pass the component folder name to build a specific component, or 'all' to build all components in the current directory (required)"
  echo "  -n, --namespace <value>  Set the namespace (default: ml6team)"
  echo "  -r, --repo <value>       Set the repo (default: fondant)"
  echo "  -t, --tag <value>        Set the tag (default: latest)"
  echo "  -h, --help               Display this help message"
}
# Parse the arguments
while [[ "$#" -gt 0 ]]; do case $1 in
  -n|--namespace) namespace="$2"; shift;;
  -r|--repo) repo="$2"; shift;;
  -t|--tag) tag="$2"; shift;;
  -c|--component) component="$2"; shift;;
  -h|--help) usage; exit;;
  *) echo "Unknown parameter passed: $1"; exit 1;;
esac; shift; done

# Check for required argument
if [ -z "${component}" ]; then
  echo "Error: component parameter is required"
  usage
  exit 1
fi

# Set default values for optional arguments if not passed
[ -n "${namespace-}" ] || namespace="ml6team"
[ -n "${repo-}" ] || repo="fondant"
[ -n "${tag-}" ] || tag="latest"

# Get the component directory
component_dir=$(pwd)/"components"

# Loop through all subdirectories
for dir in $component_dir/*/; do
  cd "$dir"
  BASENAME=${dir%/}
  BASENAME=${BASENAME##*/}
  # Build all images or one image depending on the passed argument
  if [[ "$BASENAME" == "${component}" ]] || [[ "${component}" == "all" ]]; then
    full_image_name=ghcr.io/${namespace}/${BASENAME}:${tag}
    echo "$full_image_name"
    docker build -t "$full_image_name" \
      --build-arg COMMIT_SHA=$(git rev-parse HEAD) \
      --build-arg GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD) \
      --build-arg BUILD_TIMESTAMP=$(date '+%F_%H:%M:%S') \
      --label org.opencontainers.image.source=https://github.com/${namespace}/${repo} \
      --platform=linux/arm64 \
      .
    docker push "$full_image_name"
  fi
  cd "$component_dir"
done
19 changes: 19 additions & 0 deletions
examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile
@@ -0,0 +1,19 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt
RUN python -m spacy download en_core_web_sm

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["python", "main.py"]
23 changes: 23 additions & 0 deletions
examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml
@@ -0,0 +1,23 @@
name: Filter text complexity
description: Component that filters text based on its dependency parse complexity and number of actions
image: ghcr.io/ml6team/filter_text_complexity:latest

consumes:
  text:
    fields:
      data:
        type: string

args:
  spacy_pipeline:
    description: spaCy pipeline to use, e.g. "en_core_web_sm"
    type: str
  batch_size:
    description: Batch size to use when parsing text with spaCy
    type: int
  min_complexity:
    description: Minimum complexity to filter text on.
    type: int
  min_num_actions:
    description: Minimum number of actions a text should contain.
    type: int
4 changes: 4 additions & 0 deletions
examples/pipelines/datacomp/components/filter_text_complexity/requirements.txt
@@ -0,0 +1,4 @@
git+https://github.com/ml6team/fondant.git@main
pyarrow>=7.0
gcsfs==2023.4.0
spacy==3.5.3
82 changes: 82 additions & 0 deletions
examples/pipelines/datacomp/components/filter_text_complexity/src/main.py
@@ -0,0 +1,82 @@
"""This component filters text based on:
- complexity of the dependency parse tree
- number of actions.

As proposed in [Radenovic et al., 2023](https://arxiv.org/abs/2301.02280).
"""
import logging

import pandas as pd
import spacy
from spacy.symbols import nsubj, VERB

from fondant.component import PandasTransformComponent
from fondant.logger import configure_logging

configure_logging()
logger = logging.getLogger(__name__)


def get_text_complexity(doc: spacy.tokens.doc.Doc):
    complexity = 0
    for token in doc:
        num_children = len([child for child in token.children])
        if num_children > complexity:
            complexity = num_children

    return complexity


def get_num_actions(doc: spacy.tokens.doc.Doc):
    verbs = set()
    for possible_subject in doc:
        if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
            verbs.add(possible_subject.head)

    return len(verbs)


class FilterTextComplexity(PandasTransformComponent):
    """Component that filters text based on:
    - complexity of the dependency parse tree
    - number of actions"""

    def setup(
        self,
        *,
        spacy_pipeline,
        batch_size: int,
        min_complexity: int,
        min_num_actions: int
    ) -> None:
        self.nlp = spacy.load(spacy_pipeline, exclude=["ner"])
        self.batch_size = batch_size
        self.min_complexity = min_complexity
        self.min_num_actions = min_num_actions

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        texts = dataframe["text"]["data"]

        docs = list(self.nlp.pipe(texts, batch_size=self.batch_size))
        docs = pd.Series(docs)

        caption_complexity = docs.apply(lambda doc: get_text_complexity(doc))
        num_actions = docs.apply(lambda doc: get_num_actions(doc))

        mask = (caption_complexity >= self.min_complexity) & (
            num_actions >= self.min_num_actions
        )
        mask = mask.to_numpy()

        dataframe = dataframe[mask]

        dataframe = dataframe.drop(("text", "data"), axis=1)

        return dataframe


if __name__ == "__main__":
    component = FilterTextComplexity.from_args()
    component.run()
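To make the two heuristics concrete, here is a small, self-contained sketch (not part of the commit) of what they compute for a single caption. It assumes `en_core_web_sm` is installed (the Dockerfile above downloads it) and restates the helpers inline rather than importing them from the component:

import spacy
from spacy.symbols import nsubj, VERB

nlp = spacy.load("en_core_web_sm", exclude=["ner"])
doc = nlp("A dog chases a ball while a child laughs.")

# Parse-tree complexity: the maximum number of children any single token has.
complexity = max(len(list(token.children)) for token in doc)

# Number of actions: distinct verbs that govern a nominal subject.
actions = {token.head for token in doc if token.dep == nsubj and token.head.pos == VERB}

# With min_complexity=1 and min_num_actions=1 (the values used in pipeline.py
# below), a caption like this one would pass the filter.
print(complexity, len(actions))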
50 changes: 50 additions & 0 deletions
examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -0,0 +1,50 @@
name: Load from hub
description: Component that loads the DataComp dataset from the hub
image: ghcr.io/ml6team/load_from_hf_hub:latest

produces:
  image:
    fields:
      url:
        type: string
      original_width:
        type: int16
      original_height:
        type: int16
      face_bboxes:
        type: array
        items:
          type: array
          items:
            type: float32
      sha256:
        type: utf8

  text:
    fields:
      data:
        type: string

  image_text:
    fields:
      clip_b32_similarity_score:
        type: float32
      clip_l14_similarity_score:
        type: float32

args:
  dataset_name:
    description: Name of dataset on the hub
    type: str
  column_name_mapping:
    description: Mapping of the consumed hub dataset to fondant column names
    type: dict
  image_column_names:
    description: Optional argument, a list containing the original image column names in case the
      dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
    type: list
    default: None
  n_rows_to_load:
    description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
    type: int
    default: None
@@ -0,0 +1,70 @@
"""Pipeline used to filter the dataset of the Datacomp competition."""

import logging
import sys

sys.path.append("../")

from pipeline_configs import PipelineConfigs

from fondant.compiler import DockerCompiler
from fondant.pipeline import ComponentOp, Pipeline, Client
from fondant.logger import configure_logging

configure_logging()
logger = logging.getLogger(__name__)

# Initialize pipeline and client
pipeline = Pipeline(
    pipeline_name="Datacomp filtering pipeline",
    pipeline_description="A pipeline for filtering the Datacomp dataset",
    # base_path=PipelineConfigs.BASE_PATH,
    base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
)
client = Client(host=PipelineConfigs.HOST)

# define ops
load_component_column_mapping = {
    "url": "image_url",
    "original_width": "image_original_width",
    "original_height": "image_original_height",
    "face_bboxes": "image_face_bboxes",
    "sha256": "image_sha256",
    "text": "text_data",
    "clip_b32_similarity_score": "image_text_clip_b32_similarity_score",
    "clip_l14_similarity_score": "image_text_clip_l14_similarity_score",
}

load_from_hub_op = ComponentOp.from_registry(
    name="load_from_hf_hub",
    component_spec_path="components/load_from_hf_hub/fondant_component.yaml",
    arguments={
        "dataset_name": "mlfoundations/datacomp_small",
        "column_name_mapping": load_component_column_mapping,
        "n_rows_to_load": 100,
    },
)
filter_complexity_op = ComponentOp(
    component_spec_path="components/filter_text_complexity/fondant_component.yaml",
    arguments={
        "spacy_pipeline": "en_core_web_sm",
        "batch_size": 1000,
        "min_complexity": 1,
        "min_num_actions": 1,
    },
)

# add ops to pipeline
pipeline.add_op(load_from_hub_op)
pipeline.add_op(filter_complexity_op, dependencies=load_from_hub_op)
# TODO add more ops

# compile
if __name__ == "__main__":
    compiler = DockerCompiler()
    # mount the gcloud credentials to the container
    extra_volumes = [
        "$HOME/.config/gcloud/application_default_credentials.json:/root/.config/gcloud/application_default_credentials.json:ro"
    ]
    compiler.compile(pipeline=pipeline, extra_volumes=extra_volumes)
    logger.info("Run `docker compose up` to run the pipeline.")