Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run ruff on components #209

Merged
merged 1 commit into from
Jun 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ repos:
- id: ruff
files: |
(?x)^(
components/.*|
fondant/.*|
tests/.*|
)$
Expand Down
22 changes: 8 additions & 14 deletions components/caption_images/src/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
"""
This component that captions images using a model from the Hugging Face hub.
"""
"""This component captions images using a model from the Hugging Face hub."""
import io
import logging
import typing as t
Expand All @@ -10,7 +8,7 @@
import pandas as pd
import torch
from PIL import Image
from transformers import BatchEncoding, BlipProcessor, BlipForConditionalGeneration
from transformers import BatchEncoding, BlipForConditionalGeneration, BlipProcessor

from fondant.component import DaskTransformComponent
from fondant.logger import configure_logging
Expand All @@ -29,14 +27,12 @@ def process_image(image: bytes, *, processor: BlipProcessor, device: str) -> tor
device: The device to move the transformed image to.
"""
def load(img: bytes) -> Image:
"""Load the bytestring as an image"""
"""Load the bytestring as an image."""
bytes_ = io.BytesIO(img)
return Image.open(bytes_).convert("RGB")

def transform(img: Image) -> BatchEncoding:
"""
Transform the image to a tensor using a processor and move it to the specified device.
"""
"""Transform the image to a tensor using a processor and move it to the specified device."""
return processor(images=img, return_tensors="pt").to(device)

return transform(load(image))["pixel_values"]
Expand All @@ -49,7 +45,7 @@ def caption_image_batch(
processor: BlipProcessor,
max_new_tokens: int
) -> pd.Series:
"""Caption a batch of images"""
"""Caption a batch of images."""
input_batch = torch.cat(image_batch.tolist())
output_batch = model.generate(pixel_values=input_batch, max_new_tokens=max_new_tokens)
captions_batch = processor.batch_decode(output_batch, skip_special_tokens=True)
Expand All @@ -66,7 +62,7 @@ def caption_images(
max_new_tokens: int,
device: str,
) -> pd.DataFrame:
"""Caption a pandas series of images"""
"""Caption a pandas series of images."""
images = images.apply(process_image, processor=processor, device=device)
results: t.List[pd.Series] = []
for batch in np.split(images, np.arange(batch_size, len(images), batch_size)):
Expand All @@ -83,9 +79,7 @@ def caption_images(


class CaptionImagesComponent(DaskTransformComponent):
"""
Component that captions images using a model from the Hugging Face hub.
"""
"""Component that captions images using a model from the Hugging Face hub."""

def transform(
self,
Expand All @@ -99,7 +93,7 @@ def transform(
dataframe: Dask dataframe
model_id: id of the model on the Hugging Face hub
batch_size: batch size to use
max_new_tokens: maximum token length of each caption
max_new_tokens: maximum token length of each caption.

Returns:
Dask dataframe
Expand Down
28 changes: 15 additions & 13 deletions components/download_images/src/main.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
"""
This component downloads images based on URLs, and resizes them based on various settings like minimum image size and aspect ratio.
This component downloads images based on URLs, and resizes them based on various settings like
minimum image size and aspect ratio.

Some functions here are directly taken from https://github.com/rom1504/img2dataset/blob/main/img2dataset/downloader.py.
Some functions here are directly taken from
https://github.com/rom1504/img2dataset/blob/main/img2dataset/downloader.py.
"""
import logging
import io
import logging
import traceback
import urllib

import dask.dataframe as dd

from resizer import Resizer

from fondant.component import DaskTransformComponent
Expand All @@ -20,32 +21,34 @@


def is_disallowed(headers, user_agent_token, disallowed_header_directives):
"""Check if HTTP headers contain an X-Robots-Tag directive disallowing usage"""
"""Check if HTTP headers contain an X-Robots-Tag directive disallowing usage."""
for values in headers.get_all("X-Robots-Tag", []):
try:
uatoken_directives = values.split(":", 1)
directives = [x.strip().lower() for x in uatoken_directives[-1].split(",")]
ua_token = (
uatoken_directives[0].lower() if len(uatoken_directives) == 2 else None
uatoken_directives[0].lower() if len(uatoken_directives) == 2 # noqa : PLR2004
else None
)
if (ua_token is None or ua_token == user_agent_token) and any(
x in disallowed_header_directives for x in directives
):
return True
except Exception as err: # pylint: disable=broad-except
except Exception as err:
traceback.print_exc()
print(f"Failed to parse X-Robots-Tag: {values}: {err}")
return False


def download_image(url, timeout, user_agent_token, disallowed_header_directives):
"""Download an image with urllib"""
"""Download an image with urllib."""
img_stream = None
user_agent_string = (
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
)
if user_agent_token:
user_agent_string += f" (compatible; {user_agent_token}; +https://github.com/rom1504/img2dataset)"
user_agent_string += f" (compatible; {user_agent_token}; " \
f"+https://github.com/rom1504/img2dataset)"
try:
request = urllib.request.Request(
url, data=None, headers={"User-Agent": user_agent_string}
Expand All @@ -59,13 +62,14 @@ def download_image(url, timeout, user_agent_token, disallowed_header_directives)
return None
img_stream = io.BytesIO(r.read())
return img_stream
except Exception as err: # pylint: disable=broad-except
except Exception:
if img_stream is not None:
img_stream.close()
return None


def download_image_with_retry(
*,
url,
timeout,
retries,
Expand All @@ -84,9 +88,7 @@ def download_image_with_retry(


class DownloadImagesComponent(DaskTransformComponent):
"""
Component that downloads images based on URLs.
"""
"""Component that downloads images based on URLs."""

def transform(
self,
Expand Down
1 change: 1 addition & 0 deletions components/download_images/src/resizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

Copyright (c) 2021 Romain Beaumont
"""
# ruff: noqa

import albumentations as A
import cv2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Copyright (c) 2021 Romain Beaumont
"""

# ruff: noqa
# mypy: ignore-errors

import base64
Expand Down
23 changes: 7 additions & 16 deletions components/embedding_based_laion_retrieval/src/main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
"""
This component retrieves image URLs from LAION-5B based on a set of CLIP embeddings.
"""
"""This component retrieves image URLs from LAION-5B based on a set of CLIP embeddings."""
import asyncio
import concurrent.futures
import logging
import typing as t

import pandas as pd

from clip_client import ClipClient, Modality

from fondant.component import PandasTransformComponent
Expand All @@ -18,9 +15,7 @@


class LAIONRetrievalComponent(PandasTransformComponent):
"""
Component that retrieves image URLs from LAION-5B based on a set of CLIP embeddings.
"""
"""Component that retrieves image URLs from LAION-5B based on a set of CLIP embeddings."""

def setup(
self,
Expand All @@ -33,8 +28,10 @@ def setup(

Args:
num_images: number of images to retrieve for each prompt
aesthetic_score: ranking score for aesthetic embedding, higher is prettier, between 0 and 9.
aesthetic_weight: weight of the aesthetic embedding to add to the query, between 0 and 1.
aesthetic_score: ranking score for aesthetic embedding, higher is prettier,
between 0 and 9.
aesthetic_weight: weight of the aesthetic embedding to add to the query,
between 0 and 1.
"""
self.client = ClipClient(
url="https://knn.laion.ai/knn-service",
Expand All @@ -49,13 +46,7 @@ def transform(
self,
dataframe: pd.DataFrame,
) -> pd.DataFrame:
"""
Args:
dataframe: Pandas dataframe

Returns:
Dask dataframe
"""
"""Asynchronously retrieve image URLs and ids for embeddings in the provided dataframe."""
results: t.List[t.Tuple[str]] = []
loop = asyncio.new_event_loop()

Expand Down
11 changes: 4 additions & 7 deletions components/filter_comments/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,17 @@
import logging

import dask.dataframe as dd
from utils.text_extraction import get_comments_to_code_ratio

from fondant.component import TransformComponent
from fondant.logger import configure_logging

from utils.text_extraction import get_comments_to_code_ratio

configure_logging()
logger = logging.getLogger(__name__)


class FilterCommentsComponent(TransformComponent):
"""
Component that filters instances based on code to comments ratio.
"""
"""Component that filters instances based on comments to code ratio."""

def transform(
self,
Expand All @@ -32,9 +30,8 @@ def transform(
min_comments_ratio: The minimum comments to code ratio
max_comments_ratio: The maximum comments to code ratio
Returns:
Filtered dask dataframe
Filtered dask dataframe.
"""

# Apply the function to the desired column and filter the DataFrame
filtered_df = dataframe[
dataframe["code_content"].map_partitions(
Expand Down
38 changes: 21 additions & 17 deletions components/filter_comments/src/utils/text_extraction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
""" This code is adapted from BigScience PII detection
https://github.com/bigcode-project/bigcode-dataset/blob/main/preprocessing/filtering.py
"""This code is adapted from BigScience PII detection
https://github.com/bigcode-project/bigcode-dataset/blob/main/preprocessing/filtering.py.

MST BigScience PII Code
Original colab that is a source of this file is located at
Expand All @@ -17,11 +17,11 @@
limitations under the License.
"""

import typing as t
import tokenize
import warnings
import ast
import io
import tokenize
import typing as t
import warnings
from itertools import groupby

NODE_TYPES = {
Expand Down Expand Up @@ -56,13 +56,14 @@ def parse_docstrings(source):
# comment extraction
def get_comments(source: str) -> str:
"""
Returns a string including all comments in python code
Returns a string including all comments in python code.

Args:
source: the code to parse

Returns:
The script comments
The script comments.
"""

comments = []
g = tokenize.generate_tokens(io.StringIO(source).readline)
for toknum, tokval, _, _, _ in g:
Expand All @@ -76,10 +77,12 @@ def get_comments(source: str) -> str:
def get_docstrings(source: str) -> t.List[str]:
"""
Parse Python source code from file or string and return its docstrings.

Args:
source: the code to parse

Returns:
A list containing the script docstrings
A list containing the script docstrings.
"""
if hasattr(source, "read"):
source = source.read()
Expand All @@ -99,16 +102,18 @@ def get_docstrings(source: str) -> t.List[str]:

def get_text_python(source: str, extract_comments: bool = True) -> str:
"""Extract all natural text in source: comments + docstrings
the extraction fails in case of syntax errors in the file
the extraction fails in case of syntax errors in the file.

Args:
source: the code to parse
comments: if True extract comments too
Returns:
A string with concatenated docstrings and comments"""
extract_comments: if True extract comments too

Returns:
A string with concatenated docstrings and comments.
"""
try:
docstrings = "\n".join(get_docstrings(source))
except:
except Exception:
docstrings = ""
warnings.warn(
"code couldn't be parsed due to compilation failure, no docstring is extracted"
Expand All @@ -117,7 +122,7 @@ def get_text_python(source: str, extract_comments: bool = True) -> str:
if extract_comments:
try:
comments = get_comments(source)
except:
except Exception:
comments = ""
warnings.warn("tokenization error, no comments are extracted")
else:
Expand All @@ -133,9 +138,8 @@ def get_comments_to_code_ratio(text: str) -> float:
Args:
text: the string source code
Returns:
The comments to code ratio
The comments to code ratio.
"""

comments = get_text_python(text)

return len(comments) / len(text)
7 changes: 2 additions & 5 deletions components/filter_line_length/src/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
"""
This component filters code based on a set of metadata associated with it.
"""
"""This component filters code based on a set of metadata associated with it."""
import logging

import dask.dataframe as dd
Expand Down Expand Up @@ -33,9 +31,8 @@ def transform(
max_line_length_threshold: Threshold for max line length to filter on
alphanum_fraction_threshold: Alphanum fraction to filter on
Returns:
Filtered dask dataframe
Filtered dask dataframe.
"""

filtered_df = dataframe[
(dataframe["code_avg_line_length"] > avg_line_length_threshold)
& (dataframe["code_max_line_length"] > max_line_length_threshold)
Expand Down
Loading