Enable more Ruff rules (#231)
My IDE was highlighting additional warnings not covered by our Ruff configuration, so I added some more rules. About 90% of the changes were autofixed; the rest were straightforward and useful. I even found an issue with a test because of it.
RobbeSneyders committed Jun 23, 2023
1 parent c0b497e commit 309a886
Showing 42 changed files with 428 additions and 321 deletions.
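Most of the hunks below make the same mechanical change: adding a trailing comma after the last element of a multi-line call, signature, or literal. This matches Ruff's flake8-commas (COM) rules; the exact rule selection lives in the project's Ruff configuration, which isn't part of this diff, so take the rule names in these notes as educated guesses. A minimal sketch of the style in plain Python:

# Before: no comma after the last argument, so adding another
# argument later forces a two-line diff.
def caption_image_batch(
    model,
    processor,
    max_new_tokens
):
    ...

# After: the trailing comma keeps future diffs to a single line and
# signals that the one-argument-per-line layout is intentional.
def caption_image_batch(
    model,
    processor,
    max_new_tokens,
):
    ...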
6 changes: 5 additions & 1 deletion .pre-commit-config.yaml
@@ -12,7 +12,11 @@ repos:
  fondant/.*|
  tests/.*|
  )$
- args: [--fix, --exit-non-zero-on-fix]
+ args: [
+   "--target-version=py38",
+   "--fix",
+   "--exit-non-zero-on-fix",
+ ]


  - repo: https://github.com/PyCQA/bandit
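The new --target-version=py38 argument tells Ruff which Python version's syntax its fixes may assume. For example, with pyupgrade-style (UP) rules enabled and the project still supporting Python 3.8, Ruff must keep typing-module generics rather than rewriting them to the 3.9+ builtin forms. A hypothetical sketch:

import typing as t

# Kept as-is under --target-version=py38:
def image_sizes(images: t.List[bytes]) -> t.Dict[str, int]:
    return {f"image_{i}": len(data) for i, data in enumerate(images)}

# Only with --target-version=py39 or higher would Ruff's UP006 rule
# suggest the builtin generics instead:
# def image_sizes(images: list[bytes]) -> dict[str, int]: ...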
8 changes: 4 additions & 4 deletions components/caption_images/src/main.py
@@ -40,7 +40,7 @@ def caption_image_batch(
  *,
  model: BlipForConditionalGeneration,
  processor: BlipProcessor,
- max_new_tokens: int
+ max_new_tokens: int,
  ) -> pd.Series:
  """Caption a batch of images."""
  input_batch = torch.cat(image_batch.tolist())
@@ -67,7 +67,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
  images = dataframe["images"]["data"].apply(
  process_image,
  processor=self.processor,
- device=self.device
+ device=self.device,
  )

  results: t.List[pd.Series] = []
@@ -78,8 +78,8 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
  batch,
  model=self.model,
  processor=self.processor,
- max_new_tokens=self.max_new_tokens
- ).T
+ max_new_tokens=self.max_new_tokens,
+ ).T,
  )

  return pd.concat(results).to_frame(name=("captions", "text"))
6 changes: 3 additions & 3 deletions components/download_images/src/main.py
@@ -49,7 +49,7 @@ def download_image(url, timeout, user_agent_token, disallowed_header_directives)
  f"+https://github.com/rom1504/img2dataset)"
  try:
  request = urllib.request.Request(
- url, data=None, headers={"User-Agent": user_agent_string}
+ url, data=None, headers={"User-Agent": user_agent_string},
  )
  with urllib.request.urlopen(request, timeout=timeout) as r:
  if disallowed_header_directives and is_disallowed(
@@ -77,7 +77,7 @@ def download_image_with_retry(
  ):
  for _ in range(retries + 1):
  img_stream = download_image(
- url, timeout, user_agent_token, disallowed_header_directives
+ url, timeout, user_agent_token, disallowed_header_directives,
  )
  if img_stream is not None:
  # resize the image
@@ -114,7 +114,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
  dataframe[[
  ("images", "data"),
  ("images", "width"),
- ("images", "height")
+ ("images", "height"),
  ]] = dataframe.apply(
  lambda example: download_image_with_retry(
  url=example["images"]["url"],
6 changes: 3 additions & 3 deletions components/embedding_based_laion_retrieval/src/main.py
@@ -21,7 +21,7 @@ def setup(
  *,
  num_images: int,
  aesthetic_score: int,
- aesthetic_weight: float
+ aesthetic_weight: float,
  ) -> None:
  """
@@ -54,7 +54,7 @@ async def async_query():
  futures = [
  loop.run_in_executor(
  executor,
- functools.partial(self.client.query, embedding_input=embedding.tolist())
+ functools.partial(self.client.query, embedding_input=embedding.tolist()),
  )
  for embedding in dataframe["embeddings"]["data"]
  ]
@@ -64,7 +64,7 @@ async def async_query():
  loop.run_until_complete(async_query())

  results_df = pd.DataFrame(results)[["id", "url"]]
- results_df.set_index("id", inplace=True)
+ results_df = results_df.set_index("id")
  results_df.columns = [["images"], ["url"]]

  return results_df
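The set_index change in this file is more than comma shuffling: it replaces an inplace=True mutation with reassignment, the fix pattern of pandas-vet's PD002 rule (again assuming that rule is among those enabled). inplace methods return None, so they can't be chained, and the mutation is easy to miss when reading. A minimal sketch:

import pandas as pd

results_df = pd.DataFrame({"id": [1, 2], "url": ["a", "b"]})

# Discouraged (PD002): mutates in place and returns None.
# results_df.set_index("id", inplace=True)

# Preferred: reassign the returned frame.
results_df = results_df.set_index("id")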
10 changes: 4 additions & 6 deletions components/filter_comments/src/main.py
@@ -20,7 +20,7 @@ def transform(
  *,
  dataframe: dd.DataFrame,
  min_comments_ratio: float,
- max_comments_ratio: float
+ max_comments_ratio: float,
  ) -> dd.DataFrame:
  """
  Args:
@@ -31,16 +31,14 @@ def transform(
  Filtered dask dataframe.
  """
  # Apply the function to the desired column and filter the DataFrame
- filtered_df = dataframe[
+ return dataframe[
  dataframe["code_content"].map_partitions(
  lambda example: example.map(get_comments_to_code_ratio).between(
- min_comments_ratio, max_comments_ratio
- )
+ min_comments_ratio, max_comments_ratio,
+ ),
  )
  ]

- return filtered_df


  if __name__ == "__main__":
  component = FilterCommentsComponent.from_args()
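This hunk also folds filtered_df = dataframe[...] followed by return filtered_df into a single return, the autofix for flake8-return's RET504 (unnecessary assignment before return), assuming that rule is part of the new selection; the same pattern recurs in filter_line_length and pii_redaction below. A minimal sketch:

def get_ratio(comments: str, code: str) -> float:
    # RET504 flags the intermediate name:
    # ratio = len(comments) / len(code)
    # return ratio

    # Autofixed form returns the expression directly:
    return len(comments) / len(code)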
12 changes: 5 additions & 7 deletions components/filter_comments/src/utils/text_extraction.py
@@ -69,9 +69,7 @@ def get_comments(source: str) -> str:
  for toknum, tokval, _, _, _ in g:
  if toknum == tokenize.COMMENT:
  comments.append((toknum, tokval))
- result = tokenize.untokenize(comments).replace("#", "")
-
- return result
+ return tokenize.untokenize(comments).replace("#", "")


  def get_docstrings(source: str) -> t.List[str]:
@@ -88,13 +86,13 @@ def get_docstrings(source: str) -> t.List[str]:
  source = source.read()

  docstrings = sorted(
- parse_docstrings(source), key=lambda x: (NODE_TYPES.get(type(x[0])), x[1])
+ parse_docstrings(source), key=lambda x: (NODE_TYPES.get(type(x[0])), x[1]),
  )

  grouped = groupby(docstrings, key=lambda x: NODE_TYPES.get(type(x[0])))
  results = []
  for _, group in grouped:
- for _, name, docstring in group:
+ for _, _name, docstring in group:
  if docstring:
  results.append(docstring)
  return results
@@ -116,7 +114,7 @@ def get_text_python(source: str, extract_comments: bool = True) -> str:
  except Exception:
  docstrings = ""
  warnings.warn(
- "code couldn't be parsed due to compilation failure, no docstring is extracted"
+ "code couldn't be parsed due to compilation failure, no docstring is extracted",
  )

  if extract_comments:
@@ -142,4 +140,4 @@ def get_comments_to_code_ratio(text: str) -> float:
  """
  comments = get_text_python(text)

- return len(comments) / len(text)
\ No newline at end of file
+ return len(comments) / len(text)
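Besides the direct-return fix, this file renames the unused loop variable name to _name; the underscore prefix is the conventional way to mark a binding as intentionally unused, which satisfies Ruff's unused-loop-variable check (B007, assuming flake8-bugbear rules are enabled). A minimal sketch:

records = [
    ("function", "load", "Load the dataset."),
    ("function", "save", None),
]

docstrings = []
for _kind, _name, docstring in records:  # _-prefixed names: intentionally unused
    if docstring:
        docstrings.append(docstring)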
6 changes: 2 additions & 4 deletions components/filter_line_length/src/main.py
@@ -20,7 +20,7 @@ def transform(
  dataframe: dd.DataFrame,
  avg_line_length_threshold: int,
  max_line_length_threshold: int,
- alphanum_fraction_threshold: float
+ alphanum_fraction_threshold: float,
  ) -> dd.DataFrame:
  """
  Args:
@@ -31,14 +31,12 @@ def transform(
  Returns:
  Filtered dask dataframe.
  """
- filtered_df = dataframe[
+ return dataframe[
  (dataframe["code_avg_line_length"] > avg_line_length_threshold)
  & (dataframe["code_max_line_length"] > max_line_length_threshold)
  & (dataframe["code_alphanum_fraction"] > alphanum_fraction_threshold)
  ]

- return filtered_df


  if __name__ == "__main__":
  component = FilterLineLengthComponent.from_args()
6 changes: 3 additions & 3 deletions components/image_cropping/src/image_crop.py
@@ -47,7 +47,7 @@ def get_image_borders(image: Image.Image) -> t.Tuple:


  def remove_borders(
- image_bytes: bytes, cropping_threshold: int = -30, padding: int = 10
+ image_bytes: bytes, cropping_threshold: int = -30, padding: int = 10,
  ) -> bytes:
  """This method removes borders by checking the overlap between
  a color and the original image. By subtracting these two
@@ -89,12 +89,12 @@ def remove_borders(
  if image_crop.size[0] > image_crop.size[1]:
  padding = int((image_crop.size[0] - image_crop.size[1]) / 2)
  image_crop = ImageOps.expand(
- image_crop, border=(0, padding), fill=color_common
+ image_crop, border=(0, padding), fill=color_common,
  )
  else:
  padding = int((image_crop.size[1] - image_crop.size[0]) / 2)
  image_crop = ImageOps.expand(
- image_crop, border=(padding, 0), fill=color_common
+ image_crop, border=(padding, 0), fill=color_common,
  )

  # serialize image to JPEG
2 changes: 1 addition & 1 deletion components/image_cropping/src/main.py
@@ -36,7 +36,7 @@ def transform(
  *,
  dataframe: dd.DataFrame,
  cropping_threshold: int = -30,
- padding: int = 10
+ padding: int = 10,
  ) -> dd.DataFrame:
  """
  Args:
2 changes: 1 addition & 1 deletion components/image_embedding/src/main.py
@@ -75,7 +75,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
  images = dataframe["images"]["data"].apply(
  process_image,
  processor=self.processor,
- device=self.device
+ device=self.device,
  )
  results: t.List[pd.Series] = []
  for batch in np.split(images, np.arange(self.batch_size, len(images), self.batch_size)):
4 changes: 2 additions & 2 deletions components/image_resolution_filtering/src/main.py
@@ -12,7 +12,7 @@ class ImageFilterComponent(DaskTransformComponent):
  """Component that filters images based on height and width."""

  def transform(
- self, *, dataframe: dd.DataFrame, min_width: int, min_height: int
+ self, *, dataframe: dd.DataFrame, min_width: int, min_height: int,
  ) -> dd.DataFrame:
  """
  Args:
@@ -38,4 +38,4 @@ def transform(

  if __name__ == "__main__":
  component = ImageFilterComponent.from_args()
- component.run()
\ No newline at end of file
+ component.run()
2 changes: 1 addition & 1 deletion components/load_from_hf_hub/src/main.py
@@ -35,7 +35,7 @@ def load(self,
  if image_column_names is not None:
  for image_column_name in image_column_names:
  dask_df[image_column_name] = dask_df[image_column_name].map(
- lambda x: x["bytes"], meta=("bytes", bytes)
+ lambda x: x["bytes"], meta=("bytes", bytes),
  )

  # 3) Rename columns
4 changes: 2 additions & 2 deletions components/pii_redaction/src/main.py
@@ -40,7 +40,7 @@ def transform(

  # redact PII
  # we use random replacements by default
- with open("replacements.json", "r") as f:
+ with open("replacements.json") as f:
  replacements = json.load(f)

  dataframe["code_content"] = dataframe.apply(
@@ -54,7 +54,7 @@ def transform(
  meta=(None, "str"),
  )
  dataframe = dataframe.drop(
- ["code_secrets", "code_has_secrets", "code_number_secrets"], axis=1
+ ["code_secrets", "code_has_secrets", "code_number_secrets"], axis=1,
  )

  return dataframe
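Dropping the explicit "r" from open() is pyupgrade's UP015 (redundant open mode): reading text is already open()'s default, so spelling it out is noise. A minimal sketch (assuming a replacements.json file exists next to the script):

import json

# Before (UP015): the "r" mode argument is redundant.
# with open("replacements.json", "r") as f:

# After: identical behavior, default mode.
with open("replacements.json") as f:
    replacements = json.load(f)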
4 changes: 2 additions & 2 deletions components/pii_redaction/src/pii_detection.py
@@ -26,12 +26,12 @@ def scan_pii(text, key_detector="other"):
  if key_detector == "regex":
  # use a regex to detect keys + emails + ips
  secrets = secrets + detect_email_addresses(
- text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"}
+ text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"},
  )
  else:
  # detect emails and ip addresses with regexes
  secrets = secrets + detect_email_addresses(
- text, tag_types={"EMAIL", "IP_ADDRESS"}
+ text, tag_types={"EMAIL", "IP_ADDRESS"},
  )
  # for keys use detect-secrets tool
  secrets = secrets + detect_keys(text)
10 changes: 4 additions & 6 deletions components/pii_redaction/src/pii_redaction.py
@@ -21,7 +21,6 @@
  ],
  }

- # providergs = ["google", "cloudfare", "alternate-dns", "quad9","open-dns", "comodo", "adguard"]
  POPULAR_DNS_SERVERS = [
  "8.8.8.8",
  "8.8.4.4",
@@ -113,7 +112,7 @@ def redact_pii_text(text, secrets, replacements, add_references=False):
  last_text = text
  for secret in secrets:
  # skip secret if it's an IP address for private networks or popular DNS servers
- if secret["tag"] == "IP_ADDRESS":
+ if secret["tag"] == "IP_ADDRESS":  # ruff: noqa: SIM102
  # if secret value in popular DNS servers, skip it
  if is_private_ip(secret["value"]) or (
  secret["value"] in POPULAR_DNS_SERVERS
@@ -146,10 +145,9 @@ def redact_pii_text(text, secrets, replacements, add_references=False):
  else:
  new_text = text
  references = ""
- result = (
+ return (
  (new_text, references, modified) if add_references else (new_text, modified)
  )
- return result


  def redact_pii(text, secrets, has_secrets, replacements):
@@ -160,5 +158,5 @@ def redact_pii(text, secrets, has_secrets, replacements):
  if has_secrets:
  new_text, _ = redact_pii_text(text, secrets, replacements)
  return new_text
- else:
- return text
+
+ return text
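The one suppression in the commit is the noqa on the nested IP_ADDRESS check above. SIM102 (collapsible if) would otherwise merge the two ifs into a single condition, but the nested form keeps the explanatory comments attached to each check, which is presumably why it was silenced rather than autofixed. A sketch with hypothetical stand-ins for the module's helpers:

import ipaddress

POPULAR_DNS_SERVERS = ["8.8.8.8", "8.8.4.4"]  # abbreviated from the module


def is_private_ip(value: str) -> bool:
    # Hypothetical stand-in for the module's helper.
    return ipaddress.ip_address(value).is_private


def should_skip(secret: dict) -> bool:
    # Nested form as kept in the commit; the noqa silences SIM102.
    if secret["tag"] == "IP_ADDRESS":  # noqa: SIM102
        if is_private_ip(secret["value"]) or secret["value"] in POPULAR_DNS_SERVERS:
            return True
    return False


# SIM102's suggested merge, for comparison:
# if secret["tag"] == "IP_ADDRESS" and (
#     is_private_ip(secret["value"]) or secret["value"] in POPULAR_DNS_SERVERS
# ):
#     return True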