Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Datacomp] Add clean_captions and filter_clip_score components #381

Merged
merged 7 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions examples/pipelines/datacomp/components/clean_captions/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Image for the clean_captions component, built on the slim Python 3.8 base
# and pinned to the linux/amd64 platform.
FROM --platform=linux/amd64 python:3.8-slim

# System dependencies
# git is installed because Fondant is pip-installed from a git+https URL below.
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y

# Install the component's Python requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
# FONDANT_VERSION is a build argument (default: main) selecting the git ref to install.
ARG FONDANT_VERSION=main
RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

# Start the component through the Fondant CLI; "main" presumably refers to
# the main.py module copied from src/ above.
ENTRYPOINT ["fondant", "execute", "main"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: Clean captions
description: Component that filters out bad captions (Empty captions, Captions with weird characters, Captions that are dates)
image: ghcr.io/ml6team/clean_captions:50f3a97878ac81670ebe624039ff0fcec0542e4f

consumes:
text:
fields:
data:
type: string

produces:
text:
fields:
data:
type: string
Empty file.
65 changes: 65 additions & 0 deletions examples/pipelines/datacomp/components/clean_captions/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import logging

import pandas as pd

from fondant.component import PandasTransformComponent
from dateutil.parser import parse

logger = logging.getLogger(__name__)


def isNonEnglish(s):
    """Return True when `s` contains at least one non-ASCII character.

    The string is encoded to UTF-8 and then decoded as ASCII; a decoding
    failure means some character falls outside the ASCII range.
    """
    utf8_bytes = s.encode(encoding="utf-8")
    try:
        utf8_bytes.decode("ascii")
    except UnicodeDecodeError:
        return True
    return False


def get_num_nonenglish_characters(text):
    """Count the characters of `text` that `isNonEnglish` flags as non-ASCII."""
    count = 0
    for char in text:
        if isNonEnglish(char):
            count += 1
    return count


def has_too_much_weird_characters(text, max_ratio=0.5):
    """Tell whether the share of non-ASCII characters in `text` exceeds `max_ratio`.

    Args:
        text: caption to inspect.
        max_ratio: maximum allowed fraction of non-ASCII characters
            (strictly-greater comparison).

    Returns:
        True if the non-ASCII fraction is strictly greater than `max_ratio`.
        An empty string has no characters at all, hence no weird ones, and
        yields False.
    """
    # Guard against dividing by zero: an empty caption has no ratio to compute.
    if not text:
        return False
    return (get_num_nonenglish_characters(text) / len(text)) > max_ratio


def is_valid_date(date_string):
    """Return True if `parse` accepts `date_string`, False otherwise.

    A ValueError or OverflowError raised by `parse` is taken to mean the
    string is not a date; any other exception propagates to the caller.
    """
    try:
        parse(date_string)
    except (ValueError, OverflowError):
        return False
    else:
        return True


def is_empty(text):
    """Return True when `text` is empty or consists only of whitespace."""
    stripped = text.strip()
    return not stripped


class FilterTextComplexity(PandasTransformComponent):
    """Component that filters out bad captions in image-text pairs:
    - Empty captions
    - Captions with weird characters
    - Captions that are dates

    The caption is read from the ``data`` field of the ``text`` subset.
    """

    def __init__(self, *args, **kwargs) -> None:
        # The component takes no arguments of its own; positional and keyword
        # arguments are accepted and ignored, matching the signature style of
        # the sibling filter component.
        pass

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Drop every row whose caption fails one of the caption checks.

        The checks run in order (empty, weird characters, date-like) and each
        one only sees the captions that survived the previous checks. This
        matters because the weird-character ratio is undefined for an empty
        caption, so empty captions must be gone before that check runs.

        Args:
            dataframe: frame holding the captions under ``["text"]["data"]``.

        Returns:
            The rows of `dataframe` whose caption passed all checks, in their
            original order.
        """
        checks = (
            ("Filtering on empty captions...", is_empty),
            (
                "Filtering on weird character captions...",
                has_too_much_weird_characters,
            ),
            ("Filtering on captions that look like dates...", is_valid_date),
        )

        for message, is_bad_caption in checks:
            logger.info(message)
            # Re-read the captions from the already-filtered frame so the mask
            # is aligned with it and never covers previously removed rows.
            texts = dataframe["text"]["data"]
            # astype(bool) keeps the mask boolean even when no rows are left,
            # so it is always interpreted as a row filter.
            is_bad = texts.apply(is_bad_caption).astype(bool)
            dataframe = dataframe[~is_bad]

        return dataframe
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Image for the filter_clip_score component, built on the slim Python 3.8 base
# and pinned to the linux/amd64 platform.
FROM --platform=linux/amd64 python:3.8-slim

# System dependencies
# git is installed because Fondant is pip-installed from a git+https URL below.
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y

# Install the component's Python requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
# FONDANT_VERSION is a build argument (default: main) selecting the git ref to install.
ARG FONDANT_VERSION=main
RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

# Start the component through the Fondant CLI; "main" presumably refers to
# the main.py module copied from src/ above.
ENTRYPOINT ["fondant", "execute", "main"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
name: Filter CLIP score
description: Component that filters out image-text pairs based on their CLIP score
image: ghcr.io/ml6team/filter_clip_score:50f3a97878ac81670ebe624039ff0fcec0542e4f

consumes:
imagetext:
fields:
clipl14score:
type: float32

args:
pct_threshold:
type: float
description: "Percentage threshold to filter out captions: fraction (0-1) of the rows with the highest CLIP scores to keep"
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import logging
import pandas as pd
from fondant.component import PandasTransformComponent

logger = logging.getLogger(__name__)


class FilterTextComplexity(PandasTransformComponent):
    """
    Component that filters rows based on clip scores.

    Rows are ranked by ``["imagetext"]["clipl14score"]`` in descending order
    and only rows scoring strictly above the score found at rank
    ``int(len(dataframe) * pct_threshold)`` are kept, i.e. roughly the top
    ``pct_threshold`` fraction of the frame passed to `transform`.

    NOTE(review): the threshold is derived from the frame passed to
    `transform`; if the framework calls it once per partition, the cut-off is
    per partition rather than global - confirm this is intended.
    """

    def __init__(self, *args, pct_threshold: float, **kwargs):
        """Store the filter setting.

        Args:
            pct_threshold: fraction of the highest-scoring rows to keep
                (the pipeline in this example passes 0.3).
        """
        self.pct_threshold = pct_threshold

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Keep the rows whose CLIP score lies above the computed threshold.

        Args:
            dataframe: frame holding the scores under
                ``["imagetext"]["clipl14score"]``.

        Returns:
            The rows scoring strictly above the threshold. An empty frame is
            returned unchanged, and the whole frame is returned when the
            threshold rank falls beyond the last row (e.g. pct_threshold=1.0).
        """
        logger.info("Filtering on clip scores...")
        n_rows = len(dataframe)
        logger.info(f"Initial length: {n_rows}")

        # Nothing to rank; also avoids indexing into an empty Series and
        # dividing by zero in the final log message.
        if n_rows == 0:
            return dataframe

        clip_scores = dataframe["imagetext"]["clipl14score"]
        threshold_idx = int(n_rows * self.pct_threshold)

        if threshold_idx >= n_rows:
            # The requested rank is past the last row: every row is kept.
            filtered_dataframe = dataframe
        else:
            sorted_clip_scores = clip_scores.sort_values(ascending=False)
            threshold = sorted_clip_scores.iloc[threshold_idx]
            logger.info(f"Clip score Threshold: {threshold}")

            # Strict comparison: rows tied with the threshold score are dropped.
            mask = clip_scores > threshold
            filtered_dataframe = dataframe[mask]

        logger.info(
            f"Final length: {len(filtered_dataframe)} ({len(filtered_dataframe) / len(dataframe):.2f})"
        )

        return filtered_dataframe
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,11 @@ produces:
data:
type: string

image_text:
imagetext:
fields:
uid:
type: string
clip_b32_similarity_score:
clipb32score:
type: float32
clip_l14_similarity_score:
clipl14score:
type: float32

args:
Expand Down
1 change: 1 addition & 0 deletions examples/pipelines/datacomp/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
pipeline.add_op(load_from_hub_op)
pipeline.add_op(filter_complexity_op, dependencies=download_images_op)
pipeline.add_op(download_images_op, dependencies=load_from_hub_op)

# TODO add more ops


Expand Down
31 changes: 21 additions & 10 deletions examples/pipelines/datacomp/simple_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from pipeline_configs import PipelineConfigs

from fondant.pipeline import ComponentOp, Pipeline, Client
from fondant.pipeline import ComponentOp, Pipeline

logger = logging.getLogger(__name__)

Expand All @@ -17,7 +17,6 @@
pipeline_description="A pipeline for filtering the Datacomp dataset",
base_path=PipelineConfigs.BASE_PATH,
)
client = Client(host=PipelineConfigs.HOST)

# define ops
load_component_column_mapping = {
Expand All @@ -27,16 +26,17 @@
"face_bboxes": "images_face_bboxes",
"sha256": "images_sha256",
"text": "text_data",
"uid": "image_text_uid",
"clip_b32_similarity_score": "image_text_clip_b32_similarity_score",
"clip_l14_similarity_score": "image_text_clip_l14_similarity_score",
"clip_b32_similarity_score": "imagetext_clipb32score",
"clip_l14_similarity_score": "imagetext_clipl14score",
}

load_from_hub_op = ComponentOp(
component_dir="components/load_from_hf_hub",
arguments={
"dataset_name": "nielsr/datacomp-small-with-embeddings",
"dataset_name": "mlfoundations/datacomp_small",
"column_name_mapping": load_component_column_mapping,
"n_rows_to_load": 100,
"index_column": "uid",
},
node_pool_label="node_pool",
node_pool_name="n2-standard-128-pool",
Expand All @@ -57,13 +57,24 @@
node_pool_label="node_pool",
node_pool_name="n2-standard-128-pool",
)
clean_captions_op = ComponentOp(
component_dir="components/clean_captions",
node_pool_label="node_pool",
node_pool_name="n2-standard-128-pool",
)
filter_clip_score_op = ComponentOp(
component_dir="components/filter_clip_score",
arguments={
"pct_threshold": 0.3,
},
node_pool_label="node_pool",
node_pool_name="n2-standard-128-pool",
)

# add ops to pipeline
pipeline.add_op(load_from_hub_op)
pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
pipeline.add_op(clean_captions_op, dependencies=filter_complexity_op)
pipeline.add_op(filter_clip_score_op, dependencies=clean_captions_op)
# TODO add more ops


if __name__ == "__main__":
client.compile_and_run(pipeline=pipeline)