Run ruff on components
RobbeSneyders committed Jun 14, 2023
1 parent dcd9d74 · commit 9a8e597
Showing 22 changed files with 139 additions and 178 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -8,6 +8,7 @@ repos:
   - id: ruff
     files: |
       (?x)^(
+        components/.*|
         fondant/.*|
         tests/.*|
       )$
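Note: the `files` value above is a verbose-mode regular expression that pre-commit matches against staged file paths; this commit extends it so ruff now also lints `components/`. A minimal sketch of how the pattern behaves (the example paths are illustrative, not taken from the repository):

```python
import re

# (?x) enables re.VERBOSE, so whitespace and newlines inside the pattern
# are ignored; the trailing "|" adds an empty alternative, which can only
# ever match an empty path and is therefore harmless here.
pattern = re.compile(
    r"""(?x)^(
        components/.*|
        fondant/.*|
        tests/.*|
    )$"""
)

assert pattern.match("components/caption_images/src/main.py")
assert not pattern.match("docs/getting_started.md")  # outside the linted dirs
```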
22 changes: 8 additions & 14 deletions components/caption_images/src/main.py
@@ -1,6 +1,4 @@
-"""
-This component that captions images using a model from the Hugging Face hub.
-"""
+"""This component that captions images using a model from the Hugging Face hub."""
 import io
 import logging
 import typing as t
@@ -10,7 +8,7 @@
 import pandas as pd
 import torch
 from PIL import Image
-from transformers import BatchEncoding, BlipProcessor, BlipForConditionalGeneration
+from transformers import BatchEncoding, BlipForConditionalGeneration, BlipProcessor
 
 from fondant.component import DaskTransformComponent
 from fondant.logger import configure_logging
@@ -29,14 +27,12 @@ def process_image(image: bytes, *, processor: BlipProcessor, device: str) -> torch.Tensor:
         device: The device to move the transformed image to.
     """
     def load(img: bytes) -> Image:
-        """Load the bytestring as an image"""
+        """Load the bytestring as an image."""
         bytes_ = io.BytesIO(img)
         return Image.open(bytes_).convert("RGB")
 
     def transform(img: Image) -> BatchEncoding:
-        """
-        Transform the image to a tensor using a processor and move it to the specified device.
-        """
+        """Transform the image to a tensor using a processor and move it to the specified device."""
         return processor(images=img, return_tensors="pt").to(device)
 
     return transform(load(image))["pixel_values"]
@@ -49,7 +45,7 @@ def caption_image_batch(
     processor: BlipProcessor,
     max_new_tokens: int
 ) -> pd.Series:
-    """Caption a batch of images"""
+    """Caption a batch of images."""
     input_batch = torch.cat(image_batch.tolist())
     output_batch = model.generate(pixel_values=input_batch, max_new_tokens=max_new_tokens)
     captions_batch = processor.batch_decode(output_batch, skip_special_tokens=True)
Expand All @@ -66,7 +62,7 @@ def caption_images(
max_new_tokens: int,
device: str,
) -> pd.DataFrame:
"""Caption a pandas series of images"""
"""Caption a pandas series of images."""
images = images.apply(process_image, processor=processor, device=device)
results: t.List[pd.Series] = []
for batch in np.split(images, np.arange(batch_size, len(images), batch_size)):
@@ -83,9 +79,7 @@ def caption_images(
 
 
 class CaptionImagesComponent(DaskTransformComponent):
-    """
-    Component that captions images using a model from the Hugging Face hub.
-    """
+    """Component that captions images using a model from the Hugging Face hub."""
 
     def transform(
         self,
@@ -99,7 +93,7 @@ def transform(
             dataframe: Dask dataframe
             model_id: id of the model on the Hugging Face hub
             batch_size: batch size to use
-            max_new_tokens: maximum token length of each caption
+            max_new_tokens: maximum token length of each caption.
         Returns:
             Dask dataframe
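Note: the `np.split` call in `caption_images` is a compact batching idiom: `np.arange(batch_size, len(images), batch_size)` yields the cut points, so the series is split into chunks of at most `batch_size` rows. A small self-contained sketch (the values are illustrative):

```python
import numpy as np
import pandas as pd

images = pd.Series(range(10))  # stand-in for the series of image tensors
batch_size = 4

# Cut points fall at 4 and 8, producing chunks of sizes 4, 4, and 2.
batches = np.split(images, np.arange(batch_size, len(images), batch_size))
print([len(batch) for batch in batches])  # -> [4, 4, 2]
```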
28 changes: 15 additions & 13 deletions components/download_images/src/main.py
@@ -1,15 +1,16 @@
 """
-This component downloads images based on URLs, and resizes them based on various settings like minimum image size and aspect ratio.
+This component downloads images based on URLs, and resizes them based on various settings like
+minimum image size and aspect ratio.
 
-Some functions here are directly taken from https://github.com/rom1504/img2dataset/blob/main/img2dataset/downloader.py.
+Some functions here are directly taken from
+https://github.com/rom1504/img2dataset/blob/main/img2dataset/downloader.py.
 """
-import logging
 import io
+import logging
 import traceback
 import urllib
 
 import dask.dataframe as dd
-
 from resizer import Resizer
 
 from fondant.component import DaskTransformComponent
@@ -20,32 +21,34 @@
 
 
 def is_disallowed(headers, user_agent_token, disallowed_header_directives):
-    """Check if HTTP headers contain an X-Robots-Tag directive disallowing usage"""
+    """Check if HTTP headers contain an X-Robots-Tag directive disallowing usage."""
     for values in headers.get_all("X-Robots-Tag", []):
         try:
             uatoken_directives = values.split(":", 1)
             directives = [x.strip().lower() for x in uatoken_directives[-1].split(",")]
             ua_token = (
-                uatoken_directives[0].lower() if len(uatoken_directives) == 2 else None
+                uatoken_directives[0].lower() if len(uatoken_directives) == 2  # noqa : PLR2004
+                else None
             )
             if (ua_token is None or ua_token == user_agent_token) and any(
                 x in disallowed_header_directives for x in directives
             ):
                 return True
-        except Exception as err:  # pylint: disable=broad-except
+        except Exception as err:
             traceback.print_exc()
             print(f"Failed to parse X-Robots-Tag: {values}: {err}")
     return False
 
 
 def download_image(url, timeout, user_agent_token, disallowed_header_directives):
-    """Download an image with urllib"""
+    """Download an image with urllib."""
     img_stream = None
     user_agent_string = (
         "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
     )
     if user_agent_token:
-        user_agent_string += f" (compatible; {user_agent_token}; +https://github.com/rom1504/img2dataset)"
+        user_agent_string += f" (compatible; {user_agent_token}; " \
+            f"+https://github.com/rom1504/img2dataset)"
     try:
         request = urllib.request.Request(
             url, data=None, headers={"User-Agent": user_agent_string}
@@ -59,13 +62,14 @@ def download_image(url, timeout, user_agent_token, disallowed_header_directives)
             return None
         img_stream = io.BytesIO(r.read())
         return img_stream
-    except Exception as err:  # pylint: disable=broad-except
+    except Exception:
         if img_stream is not None:
             img_stream.close()
         return None
 
 
 def download_image_with_retry(
+    *,
     url,
     timeout,
     retries,
@@ -84,9 +88,7 @@ def download_image_with_retry(
 
 
 class DownloadImagesComponent(DaskTransformComponent):
-    """
-    Component that downloads images based on URLs.
-    """
+    """Component that downloads images based on URLs."""
 
     def transform(
         self,
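Note: the bare `*` added to the signature of `download_image_with_retry` makes every parameter after it keyword-only, which protects call sites of a function with several similarly-typed arguments. A minimal sketch of the effect; the body below is a stand-in, not the component's code:

```python
def download_with_retry(*, url, timeout, retries):
    # Everything after the bare * must be passed by name.
    return f"GET {url} (timeout={timeout}s, retries={retries})"

download_with_retry(url="https://example.com/img.jpg", timeout=10, retries=2)  # OK
# download_with_retry("https://example.com/img.jpg", 10, 2)  # raises TypeError
```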
1 change: 1 addition & 0 deletions components/download_images/src/resizer.py
@@ -6,6 +6,7 @@
 Copyright (c) 2021 Romain Beaumont
 """
+# ruff: noqa
 
 import albumentations as A
 import cv2
@@ -4,6 +4,7 @@
 Copyright (c) 2021 Romain Beaumont
 """
 
+# ruff: noqa
 # mypy: ignore-errors
 
 import base64
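Note: both vendored modules above (adapted from Romain Beaumont's img2dataset code) get file-level suppressions instead of per-line fixes, keeping the upstream code untouched. A sketch of how such directives behave, assuming ruff's and mypy's documented file-level forms; the file below is hypothetical:

```python
# ruff: noqa
# mypy: ignore-errors

# With the two directives above, ruff skips every lint rule and mypy skips
# type checking for this entire file.
import os, sys  # would otherwise trip a rule such as E401 (multiple imports on one line)
```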
23 changes: 7 additions & 16 deletions components/embedding_based_laion_retrieval/src/main.py
@@ -1,13 +1,10 @@
-"""
-This component retrieves image URLs from LAION-5B based on a set of CLIP embeddings.
-"""
+"""This component retrieves image URLs from LAION-5B based on a set of CLIP embeddings."""
 import asyncio
 import concurrent.futures
 import logging
 import typing as t
 
 import pandas as pd
-
 from clip_client import ClipClient, Modality
 
 from fondant.component import PandasTransformComponent
@@ -18,9 +15,7 @@
 
 
 class LAIONRetrievalComponent(PandasTransformComponent):
-    """
-    Component that retrieves image URLs from LAION-5B based on a set of CLIP embeddings.
-    """
+    """Component that retrieves image URLs from LAION-5B based on a set of CLIP embeddings."""
 
     def setup(
         self,
@@ -33,8 +28,10 @@ def setup(
         Args:
             num_images: number of images to retrieve for each prompt
-            aesthetic_score: ranking score for aesthetic embedding, higher is prettier, between 0 and 9.
-            aesthetic_weight: weight of the aesthetic embedding to add to the query, between 0 and 1.
+            aesthetic_score: ranking score for aesthetic embedding, higher is prettier,
+                between 0 and 9.
+            aesthetic_weight: weight of the aesthetic embedding to add to the query,
+                between 0 and 1.
         """
         self.client = ClipClient(
             url="https://knn.laion.ai/knn-service",
@@ -49,13 +46,7 @@ def transform(
         self,
         dataframe: pd.DataFrame,
     ) -> pd.DataFrame:
-        """
-        Args:
-            dataframe: Pandas dataframe
-        Returns:
-            Dask dataframe
-        """
-
+        """Asynchronously retrieve image URLs and ids based on prompts in the provided dataframe."""
         results: t.List[t.Tuple[str]] = []
         loop = asyncio.new_event_loop()
 
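Note: the rewritten `transform` docstring now states what the method does. As a hedged sketch of the async fan-out pattern it describes, where `query` is a stand-in for `ClipClient.query` rather than the component's actual implementation:

```python
import asyncio
import concurrent.futures

def query(embedding):
    # Stand-in for ClipClient.query; returns a mock LAION match.
    return [{"id": "0001", "url": f"https://example.com/{embedding}.jpg"}]

async def retrieve_all(embeddings):
    loop = asyncio.get_running_loop()
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Run the blocking client calls in worker threads and gather them on the loop.
        futures = [loop.run_in_executor(pool, query, e) for e in embeddings]
        return await asyncio.gather(*futures)

results = asyncio.run(retrieve_all(["emb_a", "emb_b"]))
print(len(results))  # -> 2
```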
11 changes: 4 additions & 7 deletions components/filter_comments/src/main.py
@@ -5,19 +5,17 @@
 import logging
 
 import dask.dataframe as dd
+from utils.text_extraction import get_comments_to_code_ratio
 
 from fondant.component import TransformComponent
 from fondant.logger import configure_logging
 
-from utils.text_extraction import get_comments_to_code_ratio
-
 configure_logging()
 logger = logging.getLogger(__name__)
 
 
 class FilterCommentsComponent(TransformComponent):
-    """
-    Component that filters instances based on code to comments ratio.
-    """
+    """Component that filters instances based on code to comments ratio."""
 
     def transform(
         self,
@@ -32,9 +30,8 @@ def transform(
             min_comments_ratio: The minimum code to comment ratio
             max_comments_ratio: The maximum code to comment ratio
         Returns:
-            Filtered dask dataframe
+            Filtered dask dataframe.
         """
-
         # Apply the function to the desired column and filter the DataFrame
         filtered_df = dataframe[
             dataframe["code_content"].map_partitions(
38 changes: 21 additions & 17 deletions components/filter_comments/src/utils/text_extraction.py
@@ -1,5 +1,5 @@
-""" This code is adapted from BigScience PII detection
-https://github.com/bigcode-project/bigcode-dataset/blob/main/preprocessing/filtering.py
+"""This code is adapted from BigScience PII detection
+https://github.com/bigcode-project/bigcode-dataset/blob/main/preprocessing/filtering.py.
 
 MST BigScience PII Code
 Original colab that is a source of this file is located at
@@ -17,11 +17,11 @@
 limitations under the License.
 """
 
-import typing as t
-import tokenize
-import warnings
 import ast
 import io
+import tokenize
+import typing as t
+import warnings
 from itertools import groupby
 
 NODE_TYPES = {
@@ -56,13 +56,14 @@ def parse_docstrings(source):
 # comment extraction
 def get_comments(source: str) -> str:
     """
-    Returns a string including all comments in python code
+    Returns a string including all comments in python code.
+
     Args:
         source: the code to parse
     Returns:
-        The script comments
+        The script comments.
     """
 
     comments = []
     g = tokenize.generate_tokens(io.StringIO(source).readline)
     for toknum, tokval, _, _, _ in g:
@@ -76,10 +77,12 @@ def get_comments(source: str) -> str:
 def get_docstrings(source: str) -> t.List[str]:
     """
     Parse Python source code from file or string and print docstrings.
+
     Args:
         source: the code to parse
+
     Returns:
-        A list containing the script docstrings
+        A list containing the script docstrings.
     """
     if hasattr(source, "read"):
         source = source.read()
@@ -99,16 +102,18 @@ def get_docstrings(source: str) -> t.List[str]:
 
 def get_text_python(source: str, extract_comments: bool = True) -> str:
     """Extract all natural text in source: comments + docstrings
-    the extraction fails in case of syntax errors in the file
+    the extraction fails in case of syntax errors in the file.
+
     Args:
         source: the code to parse
-        comments: if True extract comments too
-    Returns:
-        A string with concatenated docstrings and comments"""
+        extract_comments: if True extract comments too
+    Returns:
+        A string with concatenated docstrings and comments.
+    """
     try:
         docstrings = "\n".join(get_docstrings(source))
-    except:
+    except Exception:
         docstrings = ""
         warnings.warn(
             "code couldn't be parsed due to compilation failure, no docstring is extracted"
@@ -117,7 +122,7 @@ def get_text_python(source: str, extract_comments: bool = True) -> str:
     if extract_comments:
         try:
             comments = get_comments(source)
-        except:
+        except Exception:
             comments = ""
             warnings.warn("tokenization error, no comments are extracted")
     else:
@@ -133,9 +138,8 @@ def get_comments_to_code_ratio(text: str) -> float:
     Args:
         text: the string source code
     Returns:
-        The comments to code ratio
+        The comments to code ratio.
     """
-
     comments = get_text_python(text)
 
     return len(comments) / len(text)
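Note: the two `except:` to `except Exception:` changes in this file address ruff's bare-except rule (E722). A bare `except:` also swallows `SystemExit` and `KeyboardInterrupt`, while `except Exception:` still catches parse and tokenization failures but lets those control-flow exceptions propagate. A minimal sketch, not the module's code:

```python
import ast
import warnings

def module_docstring(source: str) -> str:
    try:
        return ast.get_docstring(ast.parse(source)) or ""
    except Exception:  # was a bare `except:` before the fix
        warnings.warn("code couldn't be parsed, no docstring is extracted")
        return ""

print(module_docstring('"""Hello."""\nx = 1'))  # -> Hello.
module_docstring("def broken(:")  # warns and returns ""
```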
7 changes: 2 additions & 5 deletions components/filter_line_length/src/main.py
@@ -1,6 +1,4 @@
-"""
-This component filters code based on a set of metadata associated with it.
-"""
+"""This component filters code based on a set of metadata associated with it."""
 import logging
 
 import dask.dataframe as dd
@@ -33,9 +31,8 @@ def transform(
             max_line_length_threshold: Threshold for max line length to filter on
             alphanum_fraction_threshold: Alphanum fraction to filter on
         Returns:
-            Filtered dask dataframe
+            Filtered dask dataframe.
         """
-
         filtered_df = dataframe[
             (dataframe["code_avg_line_length"] > avg_line_length_threshold)
             & (dataframe["code_max_line_length"] > max_line_length_threshold)