Skip to content

Commit

Permalink
[Datacomp] Add clean_captions and filter_clip_score components (#381)
Browse files Browse the repository at this point in the history
Co-authored-by: Niels Rogge <niels.rogge1@gmail.com>
Co-authored-by: Robbe Sneyders <robbe.sneyders@gmail.com>
  • Loading branch information
3 people authored Aug 24, 2023
1 parent b9f3dea commit 293aa41
Show file tree
Hide file tree
Showing 11 changed files with 197 additions and 15 deletions.
23 changes: 23 additions & 0 deletions examples/pipelines/datacomp/components/clean_captions/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Image that installs the component's requirements and Fondant, then runs
# `fondant execute main` from the copied src/ folder.
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
# git is required to pip-install Fondant from its GitHub repository below.
# The apt package lists are removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
ARG FONDANT_VERSION=main
# The requirement is quoted so the shell never interprets the extras brackets as a glob pattern.
RUN pip3 install --no-cache-dir "fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}"

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["fondant", "execute", "main"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: Clean captions
description: Component that filters out bad captions (Empty captions, Captions with weird characters, Captions that are dates)
image: ghcr.io/ml6team/clean_captions:50f3a97878ac81670ebe624039ff0fcec0542e4f

consumes:
text:
fields:
data:
type: string

produces:
text:
fields:
data:
type: string
Empty file.
65 changes: 65 additions & 0 deletions examples/pipelines/datacomp/components/clean_captions/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import logging

import pandas as pd

from fondant.component import PandasTransformComponent
from dateutil.parser import parse

logger = logging.getLogger(__name__)


def isNonEnglish(s):
    """Return True when ``s`` contains at least one non-ASCII character.

    "Non-English" here only means "outside the ASCII range"; no language
    detection is performed.

    Args:
        s: The string (typically a single character) to inspect.

    Returns:
        ``True`` if any character of ``s`` is not ASCII, ``False`` otherwise
        (including for the empty string).
    """
    # str.isascii() avoids the encode/decode round trip, which raised an
    # uncaught UnicodeEncodeError on lone surrogate code points.
    return not s.isascii()


def get_num_nonenglish_characters(text):
    """Count the characters of ``text`` for which ``isNonEnglish`` is true.

    Args:
        text: The string whose characters are inspected one by one.

    Returns:
        The number of characters flagged by ``isNonEnglish`` (0 for an empty string).
    """
    count = 0
    for character in text:
        if isNonEnglish(character):
            count += 1
    return count


def has_too_much_weird_characters(text, max_ratio=0.5):
    """Tell whether the share of non-English characters in ``text`` exceeds ``max_ratio``.

    Args:
        text: The caption to inspect.
        max_ratio: Highest tolerated ratio of non-English characters to the
            total number of characters. The comparison is strict, so a ratio
            exactly equal to ``max_ratio`` is still accepted.

    Returns:
        ``True`` if the ratio is strictly greater than ``max_ratio``,
        ``False`` otherwise. Empty text has no characters at all and is
        therefore never reported as having too many weird characters.
    """
    # Guard against dividing by zero: an empty caption has no ratio to compute.
    if not text:
        return False
    return (get_num_nonenglish_characters(text) / len(text)) > max_ratio


def is_valid_date(date_string):
    """Report whether ``parse`` accepts ``date_string``.

    Args:
        date_string: The text handed to ``parse``.

    Returns:
        ``True`` when ``parse`` returns without raising, ``False`` when it
        raises ``ValueError`` or ``OverflowError``.
    """
    try:
        parse(date_string)
    except (ValueError, OverflowError):
        # The parser rejected the text, so it is not considered a date.
        return False
    else:
        return True


def is_empty(text):
    """Return True when ``text`` is empty or consists only of whitespace."""
    # A string that strips down to nothing is falsy.
    return not text.strip()


class FilterTextComplexity(PandasTransformComponent):
    """Component that filters out bad captions in image-text pairs:
    - Empty captions
    - Captions with weird characters
    - Captions that are dates
    """

    def __init__(self, *args) -> None:
        # This component takes no configuration; positional arguments are ignored.
        pass

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Drop the rows whose caption is empty, mostly non-English, or a date.

        The filters run one after another. Each one is evaluated on the
        captions that survived the previous filters, so a later check never
        sees a caption that an earlier check already rejected (in particular,
        the character-ratio check never receives an empty caption).

        Args:
            dataframe: Frame holding the captions under ``["text"]["data"]``.

        Returns:
            The rows of ``dataframe`` whose caption passed every filter.
        """
        # (log message, predicate that is True for a caption to be removed)
        checks = (
            ("Filtering on empty captions...", is_empty),
            (
                "Filtering on weird character captions...",
                has_too_much_weird_characters,
            ),
            ("Filtering on captions that look like dates...", is_valid_date),
        )

        for message, is_bad_caption in checks:
            logger.info(message)
            # Re-read the captions so the mask lines up with the rows still present.
            texts = dataframe["text"]["data"]
            # astype(bool) keeps the mask boolean even when no rows are left;
            # an empty object-dtype mask would not be treated as a row filter.
            keep = ~texts.apply(is_bad_caption).astype(bool)
            dataframe = dataframe[keep]

        return dataframe
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Image that installs the component's requirements and Fondant, then runs
# `fondant execute main` from the copied src/ folder.
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
# git is required to pip-install Fondant from its GitHub repository below.
# The apt package lists are removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
ARG FONDANT_VERSION=main
# The requirement is quoted so the shell never interprets the extras brackets as a glob pattern.
RUN pip3 install --no-cache-dir "fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}"

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["fondant", "execute", "main"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
name: Filter CLIP score
description: Component that filters rows based on their CLIP score, keeping only the highest-scoring fraction given by pct_threshold
image: ghcr.io/ml6team/filter_clip_score:50f3a97878ac81670ebe624039ff0fcec0542e4f

consumes:
imagetext:
fields:
clipl14score:
type: float32

args:
pct_threshold:
type: float
description: "Fraction (between 0 and 1) of the rows with the highest CLIP scores to keep"
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import logging
import pandas as pd
from fondant.component import PandasTransformComponent

logger = logging.getLogger(__name__)


class FilterTextComplexity(PandasTransformComponent):
    """
    Component that filters rows based on clip scores
    """

    def __init__(self, *args, pct_threshold: float, **kwargs):
        """Store the fraction of highest-scoring rows that ``transform`` keeps.

        Args:
            pct_threshold: Fraction of the rows to keep, e.g. ``0.3`` keeps
                roughly the 30% of rows with the highest score.
        """
        self.pct_threshold = pct_threshold

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Keep only the rows whose CLIP score is above the computed threshold.

        The threshold is the score found at position
        ``int(len(dataframe) * pct_threshold)`` of the scores sorted from high
        to low; rows with a score strictly greater than it are kept.

        Args:
            dataframe: Frame holding the scores under
                ``["imagetext"]["clipl14score"]``.

        Returns:
            The filtered frame. An empty frame is returned unchanged, and all
            rows are kept when ``pct_threshold`` selects every row (>= 1).
        """
        logger.info("Filtering on clip scores...")
        logger.info(f"Initial length: {len(dataframe)}")

        n_rows = len(dataframe)
        if n_rows == 0:
            # Nothing to rank; avoids indexing into an empty series and a
            # division by zero in the summary log below.
            return dataframe

        clip_scores = dataframe["imagetext"]["clipl14score"]
        sorted_clip_scores = clip_scores.sort_values(ascending=False)
        threshold_idx = int(n_rows * self.pct_threshold)

        if threshold_idx >= n_rows:
            # The requested fraction covers every row, so there is no score
            # to cut at; indexing here used to raise an IndexError.
            filtered_dataframe = dataframe
        else:
            threshold = sorted_clip_scores.iloc[threshold_idx]
            logger.info(f"Clip score Threshold: {threshold}")
            mask = clip_scores > threshold
            filtered_dataframe = dataframe[mask]

        logger.info(
            f"Final length: {len(filtered_dataframe)} ({len(filtered_dataframe) / len(dataframe):.2f})"
        )

        return filtered_dataframe
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,11 @@ produces:
data:
type: string

image_text:
imagetext:
fields:
uid:
type: string
clip_b32_similarity_score:
clipb32score:
type: float32
clip_l14_similarity_score:
clipl14score:
type: float32

args:
Expand Down
1 change: 1 addition & 0 deletions examples/pipelines/datacomp/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
pipeline.add_op(load_from_hub_op)
pipeline.add_op(filter_complexity_op, dependencies=download_images_op)
pipeline.add_op(download_images_op, dependencies=load_from_hub_op)

# TODO add more ops


Expand Down
31 changes: 21 additions & 10 deletions examples/pipelines/datacomp/simple_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from pipeline_configs import PipelineConfigs

from fondant.pipeline import ComponentOp, Pipeline, Client
from fondant.pipeline import ComponentOp, Pipeline

logger = logging.getLogger(__name__)

Expand All @@ -17,7 +17,6 @@
pipeline_description="A pipeline for filtering the Datacomp dataset",
base_path=PipelineConfigs.BASE_PATH,
)
client = Client(host=PipelineConfigs.HOST)

# define ops
load_component_column_mapping = {
Expand All @@ -27,16 +26,17 @@
"face_bboxes": "images_face_bboxes",
"sha256": "images_sha256",
"text": "text_data",
"uid": "image_text_uid",
"clip_b32_similarity_score": "image_text_clip_b32_similarity_score",
"clip_l14_similarity_score": "image_text_clip_l14_similarity_score",
"clip_b32_similarity_score": "imagetext_clipb32score",
"clip_l14_similarity_score": "imagetext_clipl14score",
}

load_from_hub_op = ComponentOp(
component_dir="components/load_from_hf_hub",
arguments={
"dataset_name": "nielsr/datacomp-small-with-embeddings",
"dataset_name": "mlfoundations/datacomp_small",
"column_name_mapping": load_component_column_mapping,
"n_rows_to_load": 100,
"index_column": "uid",
},
node_pool_label="node_pool",
node_pool_name="n2-standard-128-pool",
Expand All @@ -57,13 +57,24 @@
node_pool_label="node_pool",
node_pool_name="n2-standard-128-pool",
)
clean_captions_op = ComponentOp(
component_dir="components/clean_captions",
node_pool_label="node_pool",
node_pool_name="n2-standard-128-pool",
)
filter_clip_score_op = ComponentOp(
component_dir="components/filter_clip_score",
arguments={
"pct_threshold": 0.3,
},
node_pool_label="node_pool",
node_pool_name="n2-standard-128-pool",
)

# add ops to pipeline
pipeline.add_op(load_from_hub_op)
pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
pipeline.add_op(clean_captions_op, dependencies=filter_complexity_op)
pipeline.add_op(filter_clip_score_op, dependencies=clean_captions_op)
# TODO add more ops


if __name__ == "__main__":
client.compile_and_run(pipeline=pipeline)

0 comments on commit 293aa41

Please sign in to comment.