Generic read write component #214
New component spec for the image resolution extraction component (`@@ -0,0 +1,19 @@`):

```yaml
name: Image resolution extraction
description: Component that extracts image resolution data from the images
image: ghcr.io/ml6team/image_resolution_extraction:latest

consumes:
  images:
    fields:
      data:
        type: binary

produces:
  images:
    fields:
      width:
        type: int16
      height:
        type: int16
      data:
        type: binary
```
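For orientation: the subset fields declared above reach the component code as flat dataframe columns named `<subset>_<field>` (hence `images_data`, `images_width`, and `images_height` in the implementation below). A minimal sketch of that convention, with made-up sample values:

```python
import pandas as pd

# Hypothetical rows as a component would see them: the "images" subset's
# fields arrive as flat "<subset>_<field>" columns, not nested structures.
df = pd.DataFrame({
    "images_data": [b"\x89PNG\r\n..."],  # fields.data  (binary)
    "images_width": [640],               # fields.width (int16)
    "images_height": [480],              # fields.height (int16)
})
print(list(df.columns))  # ['images_data', 'images_width', 'images_height']
```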
New requirements file for the component (`@@ -0,0 +1,4 @@`):

```
fondant
pyarrow>=7.0
gcsfs==2023.4.0
imagesize==1.4.1
```
New component implementation (`@@ -0,0 +1,55 @@`):

```python
"""This component extracts the resolution (width and height) of each image in the dataset."""
import io
import logging
import typing as t

import dask.dataframe as dd
import imagesize
import numpy as np
import pandas as pd

from fondant.component import DaskTransformComponent
from fondant.logger import configure_logging

configure_logging()
logger = logging.getLogger(__name__)


def extract_dimensions(row: pd.Series) -> t.Tuple[np.int16, np.int16]:
    """Extract the width and height of an image.

    Args:
        row: dataframe row with an images_data column.

    Returns:
        np.int16: width of the image
        np.int16: height of the image
    """
    width, height = imagesize.get(io.BytesIO(row["images_data"]))

    return np.int16(width), np.int16(height)


class ImageResolutionExtractionComponent(DaskTransformComponent):
    """Component that extracts image dimensions."""

    def transform(self, *, dataframe: dd.DataFrame) -> dd.DataFrame:
        """
        Args:
            dataframe: Dask dataframe with an images_data column.
        Returns:
            Dask dataframe with images_width and images_height columns added.
        """
        logger.info("Extracting image resolutions...")

        dataframe[["images_width", "images_height"]] = \
            dataframe[["images_data"]].apply(extract_dimensions,
                                             axis=1, result_type="expand", meta={0: int, 1: int})

        return dataframe


if __name__ == "__main__":
    component = ImageResolutionExtractionComponent.from_args()
    component.run()
```
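A quick way to sanity-check `extract_dimensions` outside a pipeline run; a minimal sketch, assuming Pillow is available to fabricate test bytes (it is not a dependency of this component):

```python
import io

import pandas as pd
from PIL import Image

# Fabricate a small in-memory PNG purely as test input.
buffer = io.BytesIO()
Image.new("RGB", (640, 480)).save(buffer, format="PNG")

row = pd.Series({"images_data": buffer.getvalue()})
print(extract_dimensions(row))  # (640, 480)
```

Since imagesize only parses the image header, this stays cheap even for large images, which is why the review thread further down settled on it instead of decoding full images with PIL.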
Updated component spec for the generic load component (`@@ -2,21 +2,25 @@`):

```diff
@@ -2,21 +2,25 @@ name: Load from hub
 description: Component that loads a dataset from the hub
 image: ghcr.io/ml6team/load_from_hf_hub:latest
 
-produces:
-  images:
+consumes:
+  dummy_variable: #TODO: fill in here
     fields:
       data:
         type: binary
-      width:
-        type: int16
-      height:
-        type: int16
-  captions:
-    fields:
-      data:
-        type: string
 
 args:
   dataset_name:
     description: Name of dataset on the hub
     type: str
+  column_name_mapping:
+    description: Mapping of the consumed hub dataset to fondant column names
+    type: dict
+  image_column_names:
+    description: A list containing the original hub image column names. Used to format the image
+      from HF hub format to a byte string
+    type: list
+    default: None
+  nb_rows_to_load:
+    description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
+    type: int
+    default: None
```
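The `column_name_mapping` argument replaces the renaming that was previously hardcoded in the component (visible in the implementation diff below). As an illustration, hypothetical arguments for this component, with mapping values mirroring the old hardcoded rename:

```python
# Hypothetical pipeline-side arguments for the generic load component.
load_args = {
    "dataset_name": "my_org/my_dataset",  # placeholder hub dataset name
    "column_name_mapping": {
        "image": "images_data",    # hub column -> fondant "<subset>_<field>" column
        "text": "captions_data",
    },
    "image_column_names": ["image"],  # hub columns holding HF-style image dicts
    "nb_rows_to_load": 100,           # small subset for test runs
}
```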
Updated implementation of the load component (`@@ -1,10 +1,8 @@` and `@@ -13,52 +11,43 @@`):

```diff
@@ -1,10 +1,8 @@
 """This component loads a seed dataset from the hub."""
-import io
 import logging
+import typing as t
 
 import dask.dataframe as dd
-import numpy as np
-from PIL import Image
 
 from fondant.component import LoadComponent
 from fondant.logger import configure_logging
@@ -13,52 +11,43 @@
 logger = logging.getLogger(__name__)
 
 
-def extract_width(image_bytes):
-    # Decode image bytes to PIL Image object
-    pil_image = Image.open(io.BytesIO(image_bytes))
-    width = pil_image.size[0]
-
-    return np.int16(width)
-
-
-def extract_height(image_bytes):
-    # Decode image bytes to PIL Image object
-    pil_image = Image.open(io.BytesIO(image_bytes))
-    height = pil_image.size[1]
-
-    return np.int16(height)
-
-
 class LoadFromHubComponent(LoadComponent):
-    def load(self, *, dataset_name: str) -> dd.DataFrame:
+    def load(self,
+             *,
+             dataset_name: str,
+             column_name_mapping: dict,
+             image_column_names: t.Optional[list],
+             nb_rows_to_load: t.Optional[int]) -> dd.DataFrame:
         """
         Args:
             dataset_name: name of the dataset to load.
+            column_name_mapping: Mapping of the consumed hub dataset to fondant column names
+            image_column_names: A list containing the original hub image column names. Used to
+                format the image from HF hub format to a byte string
+            nb_rows_to_load: optional argument that defines the number of rows to load. Useful
+                for testing pipeline runs on a small scale
         Returns:
-            Dataset: HF dataset
+            Dataset: HF dataset.
         """
         # 1) Load data, read as Dask dataframe
         logger.info("Loading dataset from the hub...")
         dask_df = dd.read_parquet(f"hf://datasets/{dataset_name}")
 
-        # 2) Rename columns
-        dask_df = dask_df.rename(
-            columns={"image": "images_data", "text": "captions_data"}
-        )
+        # 2) Make sure images are bytes instead of dicts
+        if image_column_names:
+            for image_column_name in image_column_names:
+                dask_df[image_column_name] = dask_df[image_column_name].map(
+                    lambda x: x["bytes"], meta=("bytes", bytes)
+                )
 
-        # 3) Make sure images are bytes instead of dicts
-        dask_df["images_data"] = dask_df["images_data"].map(
-            lambda x: x["bytes"], meta=("bytes", bytes)
-        )
+        # 3) Rename columns
+        dask_df = dask_df.rename(columns=column_name_mapping)
 
-        # 4) Add width and height columns
-        dask_df["images_width"] = dask_df["images_data"].map(
-            extract_width, meta=("images_width", int)
-        )
-        dask_df["images_height"] = dask_df["images_data"].map(
-            extract_height, meta=("images_height", int)
-        )
+        # 4) Optional: only return specific amount of rows
+        if nb_rows_to_load:
+            dask_df = dask_df.head(nb_rows_to_load)
+            dask_df = dd.from_pandas(dask_df, npartitions=1)
 
         return dask_df
```

**Review comment** on lines +16 to +17, with a suggested change to the signature: The `*` forces all of these to be passed as keyword arguments, which is worth it with this many parameters. This makes sense:

```python
component.load("my_dataset")
```

This doesn't:

```python
component.load("my_dataset", {"original_name": "new_name"}, ["original_name"], 5)
```

**Review comment** on `if image_column_names:`, with a suggested change: This is usually clearer.

**Review comment** on the `rename` call: This doesn't create hierarchical columns, right? Is this necessary given that we now use them?

**Reply:** No, the columns are still stored as …

**Review comment** on removed lines -56 to -61 (the width/height extraction): This is something we could still do for image columns, right? Although then it needs to match the provided component spec as well.

**Reply:** I'm not completely sure about that one; it might be that the original dataset already has this metadata and we're assuming the user requires it. I was thinking about implementing another component that generates image metadata to get around this, based on conditional arguments (e.g. …).

**Reply:** Yes, the downside is that it requires loading the image data into the component. We might be able to do this by only loading the first x bytes, but I'm not sure how this works with different image formats and whether it can be done performantly with Parquet.

**Reply:** Found this library: https://github.com/shibukawa/imagesize_py

**Review comment** on the `head` call: Calling `head` collects the result into a single pandas dataframe, which won't work for large datasets. On the other hand, I'm having trouble finding a good way to do this using Dask without knowing the index labels. What is possible is sampling, or selecting a number of partitions, but this might be less usable for the user.

**Reply:** Hmm, good point. I was considering small scale for end-to-end testing, but this might not always be the case. Maybe best to tackle this separately. Created a ticket for it.
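For reference, a minimal sketch of the alternatives mentioned in that last thread, not what the component currently does:

```python
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": range(1000)}), npartitions=10)

# Lazy alternatives discussed above: neither collects into pandas.
first_partitions = ddf.partitions[:2]  # row count depends on partitioning
sampled = ddf.sample(frac=0.01)        # approximate fraction of rows

# What the diff does: .head() computes eagerly and returns a pandas
# DataFrame, so it must be wrapped back into a single-partition Dask frame.
subset = dd.from_pandas(ddf.head(100), npartitions=1)
```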
New component spec for the generic write component (`@@ -0,0 +1,28 @@`):

```yaml
name: Write to hub
description: Component that writes a dataset to the hub
image: ghcr.io/ml6team/write_to_hf_hub:0.1.3

consumes:
  dummy_variable: #TODO: fill in here
    fields:
      data:
        type: binary

args:
  hf_token:
    description: The Hugging Face token used to write to the hub
    type: str
  username:
    description: The username under which to upload the dataset
    type: str
  dataset_name:
    description: The name of the dataset to upload
    type: str
  image_column_names:
    description: A list containing the image column names. Used to format the image to HF hub format
    type: list
    default: None
  column_name_mapping:
    description: Mapping of the consumed fondant column names to the written hub column names
    type: dict
    default: None
```
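Note the direction of the mapping: the load component maps hub names to fondant names, while this component maps fondant names back to hub names. To round-trip a dataset, the write-side mapping can simply invert the load-side one; a sketch using the hypothetical mapping from earlier:

```python
# Hypothetical: reuse the load-side mapping and invert it for writing.
load_mapping = {"image": "images_data", "text": "captions_data"}
write_mapping = {fondant: hub for hub, fondant in load_mapping.items()}
print(write_mapping)  # {'images_data': 'image', 'captions_data': 'text'}
```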
Updated requirements for the write component (`@@ -1,5 +1,6 @@`):

```diff
@@ -1,5 +1,6 @@
 huggingface_hub==0.14.1
-fondant
 datasets==2.10.1
+fondant==0.1.3
+pyarrow>=7.0
 Pillow==9.4.0
 gcsfs==2023.4.0
```
**Review comment:** For components that only add columns, is there a need to specify existing columns in the `produces` section? cc @RobbeSneyders

**Reply:** In theory not, but I'm not sure if it works in practice already if you leave them out.