Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Datacomp] Add clean_captions and filter_clip_score components #381

Merged
merged 7 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions examples/pipelines/datacomp/components/clean_captions/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
# git is required because Fondant is pip-installed from a git URL further down.
# The apt package lists are deleted in the same layer so they do not bloat the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
ARG FONDANT_VERSION=main
# The requirement is quoted so the shell never interprets "[...]" as a glob pattern.
RUN pip3 install --no-cache-dir "fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}"

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

# Launch the component through the Fondant CLI, pointing it at the "main" module.
ENTRYPOINT ["fondant", "execute", "main"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: Filter Caption
description: Component that filters out bad captions (Empty captions, Captions with weird characters, Captions that are dates)
image: ghcr.io/ml6team/filter_captions:dev

# The component reads the caption from the `data` field of the `text` subset.
consumes:
  text:
    fields:
      data:
        type: string

# Rows that survive the filters keep the same `text.data` field.
produces:
  text:
    fields:
      data:
        type: string
Empty file.
65 changes: 65 additions & 0 deletions examples/pipelines/datacomp/components/clean_captions/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import logging

import pandas as pd

from fondant.component import PandasTransformComponent
from dateutil.parser import parse

logger = logging.getLogger(__name__)


def isNonEnglish(s):
    """Return True when ``s`` contains at least one non-ASCII character.

    "Non-English" is approximated as "not representable in ASCII", so
    accented letters, CJK characters, emoji, etc. all count.

    Args:
        s: The string (typically a single character) to inspect.

    Returns:
        True if any code point in ``s`` lies outside the ASCII range,
        False otherwise (including for the empty string).
    """
    # str.isascii() never raises on unencodable code points such as lone
    # surrogates, unlike an encode/decode round trip.
    return not s.isascii()

def get_num_nonenglish_characters(text):
    """Count the characters of ``text`` that ``isNonEnglish`` flags."""
    return sum(1 for char in text if isNonEnglish(char))

def has_too_much_weird_characters(text, max_ratio=0.5):
    """Tell whether the share of non-English characters in ``text`` is too high.

    Args:
        text: The caption to inspect.
        max_ratio: Largest tolerated fraction of non-English characters;
            the caption is flagged only when the fraction is strictly greater.

    Returns:
        True if more than ``max_ratio`` of the characters are non-English,
        False otherwise. Empty text has no characters to flag, so it yields
        False.
    """
    # Guard against division by zero on empty captions.
    if not text:
        return False
    return get_num_nonenglish_characters(text) / len(text) > max_ratio

def is_valid_date(date_string):
    """Report whether ``parse`` accepts ``date_string``.

    Returns True when the parser succeeds, and False when it raises
    ``ValueError`` or ``OverflowError``.
    """
    try:
        parse(date_string)
    except (ValueError, OverflowError):
        return False
    return True

def is_empty(text):
    """Return True when ``text`` is empty or consists only of whitespace.

    Args:
        text: The caption to inspect.

    Returns:
        True if nothing is left after stripping surrounding whitespace,
        False otherwise.
    """
    # The comparison must be `==`: callers drop rows for which this is True.
    return text.strip() == ""


class FilterTextComplexity(PandasTransformComponent):
    """Component that filters out bad captions in image-text pairs:
    - Empty captions
    - Captions with weird characters
    - Captions that are dates
    """

    def __init__(self, *args, **kwargs) -> None:
        # This component has no configuration of its own; whatever arguments
        # are supplied at construction time are accepted and ignored.
        pass

    @staticmethod
    def _drop_matching(dataframe: pd.DataFrame, is_bad) -> pd.DataFrame:
        """Return ``dataframe`` without the rows whose caption satisfies ``is_bad``."""
        # Read the captions from the frame being filtered right now, so the
        # mask always shares its index and rows removed by an earlier filter
        # are not evaluated again.
        texts = dataframe["text"]["data"]
        # astype(bool) keeps the mask boolean even when the frame is empty.
        bad = texts.apply(is_bad).astype(bool)
        return dataframe[~bad]

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Drop rows whose caption is empty, mostly non-English, or a date.

        The filters run one after another, each on the rows that survived the
        previous one.

        Args:
            dataframe: Input frame; captions are read from
                ``dataframe["text"]["data"]``.

        Returns:
            The rows of ``dataframe`` whose caption passed every filter.
        """
        logger.info("Filtering on empty captions...")
        dataframe = self._drop_matching(dataframe, is_empty)

        logger.info("Filtering on weird character captions...")
        dataframe = self._drop_matching(dataframe, has_too_much_weird_characters)

        logger.info("Filtering on captions that look like dates...")
        dataframe = self._drop_matching(dataframe, is_valid_date)

        return dataframe

Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
# git is required because Fondant is pip-installed from a git URL further down.
# The apt package lists are deleted in the same layer so they do not bloat the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
ARG FONDANT_VERSION=main
# The requirement is quoted so the shell never interprets "[...]" as a glob pattern.
RUN pip3 install --no-cache-dir "fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}"

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

# Launch the component through the Fondant CLI, pointing it at the "main" module.
ENTRYPOINT ["fondant", "execute", "main"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
name: Filter CLIP score
description: Component that filters out image-text pairs based on their CLIP similarity score
image: ghcr.io/ml6team/filter_clip_score:dev

consumes:
  image_text:
    fields:
      clip_l14_similarity_score:
        type: float32

produces:
  image_text:
    fields:
      clip_l14_similarity_score:
        type: float32

args:
  # The key must match the `pct_threshold` parameter of the component's __init__.
  pct_threshold:
    type: float
    description: "Fraction (between 0 and 1) of the highest-scoring rows to keep"
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import logging
import pandas as pd
from fondant.component import PandasTransformComponent
from fondant.executor import PandasTransformExecutor

logger = logging.getLogger(__name__)

class FilterTextComplexity(PandasTransformComponent):
    """
    Component that filters rows based on clip scores

    Only the rows whose ``image_text.clip_l14_similarity_score`` ranks in the
    top ``pct_threshold`` fraction of the processed dataframe are kept.
    """

    def __init__(self, pct_threshold: float, *args, **kwargs):
        """
        Args:
            pct_threshold: Fraction of the highest-scoring rows to keep,
                between 0 (keep nothing) and 1 (keep everything).

        Raises:
            ValueError: If ``pct_threshold`` is outside ``[0, 1]``.
        """
        # Values above 1 would index past the end of the scores and negative
        # values would silently wrap around, so reject both up front.
        if not 0 <= pct_threshold <= 1:
            raise ValueError(
                f"pct_threshold must be between 0 and 1, got {pct_threshold!r}"
            )
        self.pct_threshold = pct_threshold

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Keep the rows whose clip score is strictly above the cut-off.

        The cut-off is the score found at position
        ``int(len(dataframe) * pct_threshold)`` once the scores are sorted
        from highest to lowest.

        Args:
            dataframe: Input frame; scores are read from
                ``dataframe["image_text"]["clip_l14_similarity_score"]``.

        Returns:
            The filtered frame. An empty input is returned unchanged, and
            every row is kept when the cut-off position falls past the last
            row (``pct_threshold == 1``).
        """
        logger.info("Filtering on clip scores...")
        initial_length = len(dataframe)
        logger.info("Initial length: %s", initial_length)

        # Nothing to rank; also avoids indexing and dividing by zero below.
        if initial_length == 0:
            return dataframe

        clip_scores = dataframe["image_text"]["clip_l14_similarity_score"]
        threshold_idx = int(initial_length * self.pct_threshold)

        if threshold_idx >= initial_length:
            # The requested fraction covers every row, so there is no cut-off.
            filtered_dataframe = dataframe
        else:
            sorted_clip_scores = clip_scores.sort_values(ascending=False)
            threshold = sorted_clip_scores.iloc[threshold_idx]
            logger.info("Clip score Threshold: %s", threshold)
            filtered_dataframe = dataframe[clip_scores > threshold]

        final_length = len(filtered_dataframe)
        logger.info(
            "Final length: %s (%.2f)",
            final_length,
            final_length / initial_length,
        )

        return filtered_dataframe


1 change: 1 addition & 0 deletions examples/pipelines/datacomp/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
# Register the operations on the pipeline together with their upstream dependency.
pipeline.add_op(load_from_hub_op)
# NOTE(review): filter_complexity_op names download_images_op as its dependency,
# but download_images_op is only registered on the following line — confirm that
# add_op accepts a dependency that has not been added yet, or swap the two calls.
pipeline.add_op(filter_complexity_op, dependencies=download_images_op)
pipeline.add_op(download_images_op, dependencies=load_from_hub_op)

# TODO add more ops


Expand Down