
Add Starcoder example pipeline + base components #175

Merged
merged 12 commits into from Jun 14, 2023
18 changes: 18 additions & 0 deletions components/filter_comments/Dockerfile
@@ -0,0 +1,18 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y

# install requirements
COPY requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["python", "main.py"]
18 changes: 18 additions & 0 deletions components/filter_comments/fondant_component.yaml
@@ -0,0 +1,18 @@
name: Filter comments
description: Component that filters code based on the comment-to-code ratio
image: ghcr.io/ml6team/filter_comments:latest

consumes:
code:
fields:
content:
type: string


args:
min_comments_ratio:
    description: The minimum comment-to-code ratio
type: float
max_comments_ratio:
    description: The maximum comment-to-code ratio
type: float
3 changes: 3 additions & 0 deletions components/filter_comments/requirements.txt
@@ -0,0 +1,3 @@
fondant==0.1.0
pyarrow>=7.0
gcsfs==2023.4.00
52 changes: 52 additions & 0 deletions components/filter_comments/src/main.py
@@ -0,0 +1,52 @@
"""
This component estimates the code to comments ratio and filters instances between two chosen
minimum and maximum values.
"""
import logging

import dask.dataframe as dd
from fondant.component import TransformComponent
from fondant.logger import configure_logging

from utils.text_extraction import get_comments_to_code_ratio

configure_logging()
logger = logging.getLogger(__name__)


class FilterCommentsComponent(TransformComponent):
"""
Component that filters instances based on code to comments ratio.
"""

def transform(
self,
*,
dataframe: dd.DataFrame,
min_comments_ratio: float,
max_comments_ratio: float
) -> dd.DataFrame:
"""
Args:
dataframe: Dask dataframe
            min_comments_ratio: The minimum comment-to-code ratio
            max_comments_ratio: The maximum comment-to-code ratio
Returns:
Filtered dask dataframe
"""

        # Compute the comment-to-code ratio per row and keep rows within the requested range
filtered_df = dataframe[
dataframe["code_content"].map_partitions(
lambda example: example.map(get_comments_to_code_ratio).between(
min_comments_ratio, max_comments_ratio
)
)
]

return filtered_df


if __name__ == "__main__":
component = FilterCommentsComponent.from_args()
component.run()
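
For reference, a minimal sketch of what the partition-wise filter above does on a single pandas partition (the sample rows and ratio bounds are illustrative; `get_comments_to_code_ratio` is the helper added in `src/utils/text_extraction.py` below):

```python
import pandas as pd

from utils.text_extraction import get_comments_to_code_ratio

# One partition of the Dask dataframe is just a pandas dataframe.
partition = pd.DataFrame(
    {"code_content": ["# add two numbers\ndef add(a, b):\n    return a + b\n", "x = 1\n"]}
)

# Compute the ratio per row and keep rows whose ratio falls inside the chosen band.
ratios = partition["code_content"].map(get_comments_to_code_ratio)
filtered_partition = partition[ratios.between(0.1, 0.8)]
print(filtered_partition)  # only the commented snippet survives
```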
141 changes: 141 additions & 0 deletions components/filter_comments/src/utils/text_extraction.py
@@ -0,0 +1,141 @@
""" This code is adapted from BigScience PII detection
https://github.com/bigcode-project/bigcode-dataset/blob/main/preprocessing/filtering.py

MST BigScience PII Code
Original colab that is a source of this file is located at
https://colab.research.google.com/drive/1086H3-LGMz3gX0pGy9ECgr8KflosSKso
# License
Copyright 2022 Authors of this Notebook
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import typing as t
import tokenize
import warnings
import ast
import io
from itertools import groupby

NODE_TYPES = {
ast.ClassDef: "Class",
ast.FunctionDef: "Function/Method",
ast.Module: "Module",
}


# Note: sometimes this can miss examples with decorators over classes
# ast parsing, source: https://gist.github.com/SpotlightKid/1548cb6c97f2a844f72d
def parse_docstrings(source):
"""
Parse Python source code and yield a tuple of ast node instance, name,
and docstring for each function/method, class and module.

Args:
source: The Python source code to parse.

Yields:
A tuple containing ast node instance, name, and docstring.
"""
tree = ast.parse(source)

for node in ast.walk(tree):
if isinstance(node, tuple(NODE_TYPES)):
docstring = ast.get_docstring(node)

yield node, getattr(node, "name", None), docstring


# comment extraction
def get_comments(source: str) -> str:
"""
    Return a string containing all comments found in the given Python code.
Args:
source: the code to parse
Returns:
The script comments
"""

comments = []
g = tokenize.generate_tokens(io.StringIO(source).readline)
for toknum, tokval, _, _, _ in g:
if toknum == tokenize.COMMENT:
comments.append((toknum, tokval))
result = tokenize.untokenize(comments).replace("#", "")

return result


def get_docstrings(source: str) -> t.List[str]:
"""
    Parse Python source code from a file object or string and return its docstrings.
Args:
source: the code to parse
Returns:
A list containing the script docstrings
"""
if hasattr(source, "read"):
source = source.read()

docstrings = sorted(
parse_docstrings(source), key=lambda x: (NODE_TYPES.get(type(x[0])), x[1])
)

grouped = groupby(docstrings, key=lambda x: NODE_TYPES.get(type(x[0])))
results = []
for _, group in grouped:
for _, name, docstring in group:
if docstring:
results.append(docstring)
return results


def get_text_python(source: str, extract_comments: bool = True) -> str:
"""Extract all natural text in source: comments + docstrings
the extraction fails in case of syntax errors in the file
Args:
source: the code to parse
comments: if True extract comments too
Returns:
A string with concatenated docstrings and comments"""

try:
docstrings = "\n".join(get_docstrings(source))
    except Exception:
docstrings = ""
warnings.warn(
"code couldn't be parsed due to compilation failure, no docstring is extracted"
)

if extract_comments:
try:
comments = get_comments(source)
        except Exception:
comments = ""
warnings.warn("tokenization error, no comments are extracted")
else:
comments = ""

output = docstrings + "\n" + comments
return output.strip()


def get_comments_to_code_ratio(text: str) -> float:
"""
    Get the ratio of comments and docstrings to total source length in a program
Args:
text: the string source code
Returns:
The comments to code ratio
"""

comments = get_text_python(text)

return len(comments) / len(text)
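
A quick usage sketch of the helpers above, assuming the module is importable as `utils.text_extraction` (the sample snippet is illustrative):

```python
from utils.text_extraction import get_comments_to_code_ratio, get_text_python

sample = '"""Add two numbers."""\n\n\ndef add(a, b):\n    # return the sum\n    return a + b\n'

# Docstrings and comments concatenated into one natural-text string.
print(get_text_python(sample))

# Fraction of the source made up of that natural text.
print(get_comments_to_code_ratio(sample))
```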
18 changes: 18 additions & 0 deletions components/filter_line_length/Dockerfile
@@ -0,0 +1,18 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y

# install requirements
COPY requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["python", "main.py"]
24 changes: 24 additions & 0 deletions components/filter_line_length/fondant_component.yaml
@@ -0,0 +1,24 @@
name: Filter line length
description: Component that filters code based on line-length statistics and alphanumeric fraction
image: ghcr.io/ml6team/filter_line_length:latest

consumes:
code:
fields:
avg_line_length:
type: float64
max_line_length:
type: int32
alphanum_fraction:
type: float64

args:
avg_line_length_threshold:
description: Threshold for average line length to filter on
type: int
max_line_length_threshold:
description: Threshold for maximum line length to filter on
type: int
alphanum_fraction_threshold:
    description: Threshold for the alphanumeric fraction to filter on
type: float
3 changes: 3 additions & 0 deletions components/filter_line_length/requirements.txt
@@ -0,0 +1,3 @@
fondant==0.1.0
pyarrow>=7.0
gcsfs==2023.4.00
50 changes: 50 additions & 0 deletions components/filter_line_length/src/main.py
@@ -0,0 +1,50 @@
"""
This component filters code based on a set of metadata associated with it.
"""
import logging

import dask.dataframe as dd

from fondant.component import TransformComponent
from fondant.logger import configure_logging

configure_logging()
logger = logging.getLogger(__name__)


class FilterLineLengthComponent(TransformComponent):
"""
This component filters code based on a set of metadata associated with it:
    average line length, maximum line length, and alphanumeric fraction.
"""

def transform(
self,
*,
dataframe: dd.DataFrame,
avg_line_length_threshold: int,
max_line_length_threshold: int,
alphanum_fraction_threshold: float
) -> dd.DataFrame:
"""
Args:
dataframe: Dask dataframe
avg_line_length_threshold: Threshold for average line length to filter on
max_line_length_threshold: Threshold for max line length to filter on
            alphanum_fraction_threshold: Threshold for the alphanumeric fraction to filter on
Returns:
Filtered dask dataframe
"""

filtered_df = dataframe[
(dataframe["code_avg_line_length"] > avg_line_length_threshold)
& (dataframe["code_max_line_length"] > max_line_length_threshold)
& (dataframe["code_alphanum_fraction"] > alphanum_fraction_threshold)
]

return filtered_df


if __name__ == "__main__":
component = FilterLineLengthComponent.from_args()
component.run()
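
A small sketch of the same boolean filter applied to an in-memory Dask dataframe (the column names follow the `code_` prefix used above; the rows and thresholds are illustrative):

```python
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame(
    {
        "code_avg_line_length": [12.0, 45.0],
        "code_max_line_length": [80, 400],
        "code_alphanum_fraction": [0.4, 0.9],
    }
)
ddf = dd.from_pandas(pdf, npartitions=1)

# Keep rows that exceed every threshold, mirroring the transform above.
filtered = ddf[
    (ddf["code_avg_line_length"] > 10)
    & (ddf["code_max_line_length"] > 100)
    & (ddf["code_alphanum_fraction"] > 0.5)
]
print(filtered.compute())  # only the second row survives
```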
18 changes: 18 additions & 0 deletions components/pii_redaction/Dockerfile
@@ -0,0 +1,18 @@
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["python", "main.py"]
13 changes: 13 additions & 0 deletions components/pii_redaction/README.md
@@ -0,0 +1,13 @@
## PII detection and redaction

This component detects and redacts Personally Identifiable Information (PII) from code. Redaction means that sensitive data is replaced by random data.

The code is based on the PII removal code used as part of the [BigCode project](https://github.com/bigcode-project/bigcode-dataset/tree/main/pii).

### PII detection

The component detects emails, IP addresses and API/SSH keys in text datasets (in particular datasets of source code). Regexes are used for emails and IP addresses (they are adapted from the [BigScience PII pipeline](https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/02_pii)). The [`detect-secrets`](https://github.com/Yelp/detect-secrets) package is used for finding secret keys. Additionally, filters are implemented on top to reduce the number of false positives, using the [gibberish-detector](https://github.com/domanchi/gibberish-detector) package.

### PII redaction

PII is replaced by random data which is stored in the `replacements.json` file.
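
As a rough illustration of the detect-then-replace flow (the regex and the replacement pool below are simplified stand-ins, not the component's actual patterns or the contents of `replacements.json`):

```python
import random
import re

# Simplified stand-in for an email-detection regex; the component's real
# patterns are adapted from the BigScience PII pipeline.
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")

# Hypothetical replacement pool; the component loads its values from replacements.json.
REPLACEMENTS = {"EMAIL": ["firstname.lastname@example.com", "contact@example.org"]}


def redact_emails(source: str) -> str:
    """Replace every detected email address with a randomly chosen dummy address."""
    return EMAIL_RE.sub(lambda _: random.choice(REPLACEMENTS["EMAIL"]), source)


print(redact_emails('AUTHOR = "jane.doe@example.com"  # maintainer contact'))
```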
17 changes: 17 additions & 0 deletions components/pii_redaction/fondant_component.yaml
@@ -0,0 +1,17 @@
name: PII redaction
description: A component that detects and redacts Personally Identifiable Information (PII) from code.
image: ghcr.io/ml6team/pii_redaction:latest

consumes:
code:
fields:
content:
type: string

produces:
code:
fields:
content:
type: string
additionalFields: False
additionalSubsets: False