Add Butterfinger, change the API

This was intended to add the Butterfinger error type, which it does. It also completely changes how ErrorMechanism and ErrorType are configured. The idea is that the abstract base classes ErrorMechanism and ErrorType become stateful and are configured upon initialization. I made this change because I figured that different ErrorTypes need very different configurations. Having a dataclass that stores that configuration allows us to control all parameters in one place, which I believe will be handy when adding more ErrorTypes in the future. However, this does change how error_generation can be used; check samples.ipynb for reference.
calgo-lab · May 14, 2024 · aabd16b · aabd16b
1 parent 6c9e801
commit aabd16b
Show file tree

Hide file tree

Showing 10 changed files with 500 additions and 207 deletions.
diff --git a/error_generation/api/low_level.py b/error_generation/api/low_level.py
@@ -2,31 +2,28 @@
 
 from typing import TYPE_CHECKING
 
-from error_generation.utils import ErrorConfig, set_column
+from error_generation.utils import set_column
 
 if TYPE_CHECKING:
     import pandas as pd
 
+    from error_generation.error_mechanism import ErrorMechanism
+    from error_generation.error_type import ErrorType
 
-def create_errors(
-    table: pd.DataFrame,
-    column: str | int,
-    error_config: ErrorConfig | dict,
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+
+def create_errors(table: pd.DataFrame, column: str | int, error_mechanism: ErrorMechanism, error_type: ErrorType) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Creates errors in a given column of a pandas DataFrame.
 
     Args:
         table: The pandas DataFrame to create errors in.
         column: The column to create errors in.
-        error_config: The error configuration to use.
+        error_mechanism: The mechanism, controls the error distribution.
+        error_type: The type of the error that will be distributed.
 
     Returns:
         A tuple of the original DataFrame and the error mask.
     """
-    if isinstance(error_config, dict):
-        error_config = ErrorConfig(**error_config)
-
-    error_mask = error_config.mechanism.sample(table, error_config.error_rate, error_config.condition_to_column, seed=None)
-    series = error_config.error_type.apply(table, error_mask, column)
+    error_mask = error_mechanism.sample(table, seed=None)
+    series = error_type.apply(table, error_mask, column)
     set_column(table, column, series)
     return table, error_mask
diff --git a/error_generation/error_mechanism/_base.py b/error_generation/error_mechanism/_base.py
@@ -5,29 +5,32 @@
 
 import pandas as pd
 
+from error_generation.utils import ErrorMechanismConfig
+
 if TYPE_CHECKING:
     from pandas._typing import Dtype
 
 
-class NotInstantiableError(Exception):
-    def __init__(self: NotInstantiableError) -> None:
-        super().__init__("This class is not meant to be instantiated.")
-
-
 class ErrorMechanism(ABC):
-    def __init__(self: ErrorMechanism) -> None:
-        raise NotInstantiableError
+    def __init__(self: ErrorMechanism, config: ErrorMechanismConfig | dict) -> None:
+        if isinstance(config, dict):
+            self.config = ErrorMechanismConfig(**config)
+        elif isinstance(config, ErrorMechanismConfig):
+            self.config = config
+        elif config is None:
+            msg = "'config' need to be ErrorMechanismConfig or dict."
+            raise TypeError(msg)
+        else:
+            msg = "Invalid config type."
+            raise TypeError(msg)
 
-    @classmethod
     def sample(
-        cls: type[ErrorMechanism],
+        self: ErrorMechanism,
         data: pd.DataFrame,
-        error_rate: float,
-        condition_to_column: Dtype | None = None,
         seed: int | None = None,
     ) -> pd.DataFrame:
         error_rate_msg = "'error_rate' need to be float: 0 <= error_rate <= 1."
-        if error_rate < 0 or error_rate > 1:
+        if self.config.error_rate < 0 or self.config.error_rate > 1:
             raise ValueError(error_rate_msg)
 
         if not (isinstance(seed, int) or seed is None):
@@ -42,11 +45,11 @@ def sample(
             raise ValueError(data_msg)
 
         # At least two columns are necessary if we condition to another
-        if condition_to_column is not None and len(data.columns) < 2:  # noqa: PLR2004
+        if self.config.condition_to_column is not None and len(data.columns) < 2:  # noqa: PLR2004
             msg = "'data' need at least 2 columns if 'condition_to_column' is given."
             raise ValueError(msg)
 
-        return cls._sample(data=data, error_rate=error_rate, condition_to_column=condition_to_column, seed=seed)
+        return self._sample(data=data, error_rate=self.config.error_rate, condition_to_column=self.config.condition_to_column, seed=seed)
 
     @staticmethod
     @abstractmethod

diff --git a/error_generation/error_type/__init__.py b/error_generation/error_type/__init__.py
@@ -1,2 +1,3 @@
 from ._base import ErrorType
+from .butterfinger import Butterfinger
 from .mojibake import Mojibake
diff --git a/error_generation/error_type/_base.py b/error_generation/error_type/_base.py
@@ -3,30 +3,34 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
 
+from error_generation.utils import ErrorTypeConfig
+
 if TYPE_CHECKING:
     import pandas as pd
 
 
-class NotInstantiableError(Exception):
-    def __init__(self: NotInstantiableError) -> None:
-        super().__init__("This class is not meant to be instantiated.")
-
-
 class ErrorType(ABC):
-    def __init__(self: ErrorType) -> None:
-        raise NotInstantiableError
+    def __init__(self: ErrorType, config: ErrorTypeConfig | dict | None = None) -> None:
+        if config is None:
+            self.config = ErrorTypeConfig()
+        elif isinstance(config, dict):
+            self.config = ErrorTypeConfig(**config)
+        elif isinstance(config, ErrorTypeConfig):
+            self.config = config
+        else:
+            msg = "config must be of type ErrorTypeConfig or dict"
+            raise TypeError(msg)
 
-    @classmethod
     # TODO (seja): def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, preserve_dtypes: bool = True)
     # -> tuple[pd.DataFrame, pd.DataFrame]:
     # 1. prüft parameters, sodass table.shape == error_mask.shape
     # 2. kopiert 'table'
     # 3. ruft '_get_valid_columns' auf um mögliche Spalten zu bekommen
     # 4. ruft '_apply' mit 'table[valid_columns]' auf um geänderte 'table' zu bekommen
     # 5. gibt gänderte 'table' und maske zurück, die anzeigt welche Zellen verändert wurden
-    def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
-        cls._check_type(table, column)
-        return cls._apply(table, error_mask, column)
+    def apply(self: ErrorType, table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
+        self._check_type(table, column)
+        return self._apply(table, error_mask, column)
 
     @staticmethod
     @abstractmethod
@@ -35,10 +39,9 @@ def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, c
     def _check_type(table: pd.DataFrame, column: str | int) -> None:
         pass
 
-    @staticmethod
     @abstractmethod
     # TODO (seja): def _apply(table: pd.DataFrame, error_mask: pd.DataFrame) -> pd.DataFrame:
     # erwartet, dass 'table' ausschließlich valide columns hat. Wendet fehler stumpf auf alle Zellen an, wenn 'error_mask' True ist
     # Gibt geänderte 'table' zurück.
-    def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
+    def _apply(self: ErrorType, table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
         pass
diff --git a/error_generation/error_type/butterfinger.py b/error_generation/error_type/butterfinger.py
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+import random
+from typing import TYPE_CHECKING
+
+from pandas.api.types import is_string_dtype
+
+from error_generation.error_type import ErrorType
+from error_generation.utils import get_column
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+
+class Butterfinger(ErrorType):
+    """Inserts realistic typos into a column containing strings.
+
+    Butterfinger imitates a typist who misses the correct key. For a given keyboard-layout and key, Butterfinger maps
+    all keys that physically border the given key on the given layout. It assumes that all bordering keys are equally
+    likely to be hit by the typist.
+
+    Butterfinger assumes that words are separated by whitespaces. Applied to a cell, the period with which Butterfinger
+    will corrupt words in that cell is controlled by the config value `error_period` (the `butterfinger` function itself
+    defaults to every 10th word). For a cell that contains at least one word, at least one word is always picked.
+    """
+
+    @staticmethod
+    def _check_type(table: pd.DataFrame, column: int | str) -> None:
+        series = get_column(table, column)
+
+        if not is_string_dtype(series):
+            msg = f"Column {column} does not contain values of the string dtype. Cannot apply Butterfingers."
+            raise TypeError(msg)
+
+    def _apply(self: Butterfinger, table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
+        """Apply butterfinger to the masked cells of one column and return the resulting series.
+
+        table: the pandas dataframe to-be-corrupted
+        error_mask: binary mask that marks the error positions
+        column: column into which errors shall be inserted
+        The error period and the keyboard layout are read from `self.config` (`error_period`, `keyboard_layout`).
+        """
+        series = get_column(table, column).copy()  # work on a copy of the selected column
+        series_mask = get_column(error_mask, column)
+
+        def butterfn(x: str) -> str:
+            return butterfinger(x, self.config.error_period, self.config.keyboard_layout)
+
+        series.loc[series_mask] = series.loc[series_mask].apply(butterfn)  # only cells selected by the mask are rewritten
+        return series
+
+
+def butterfinger(input_text: str, error_period: int = 10, layout: str = "ansi-qwerty") -> str:
+    """Inserts realistic typos into a string.
+
+    Butterfinger imitates a typist who misses the correct key. For a given keyboard-layout and key, Butterfinger maps
+    all keys that physically border the given key on the given layout. It assumes that all bordering keys are equally
+    likely to be hit by the typist.
+
+    Butterfinger assumes that words are separated by space characters. It will corrupt words in the input text with a
+    period controlled by the parameter `error_period`. By default, Butterfinger will insert a typo into every 10th word.
+    At least one word is always picked; text without any word (empty or only spaces) is returned unchanged.
+
+    Args:
+        input_text: the string to be corrupted
+        error_period: one word out of every `error_period` words is corrupted; must be at least 1.
+        layout: the keyboard layout to be used for the corruption. Currently, only "ansi-qwerty" is supported.
+
+    Returns:
+        the corrupted string
+
+    Raises:
+        ValueError: if `layout` is not supported or `error_period` is smaller than 1.
+    """
+    if layout == "ansi-qwerty":
+        neighbors = {
+            "q": "12wa",
+            "w": "q23esa",
+            "e": "34rdsw",
+            "r": "e45tfd",
+            "t": "56ygfr",
+            "y": "t67uhg",
+            "u": "y78ijh",
+            "i": "u89okj",
+            "o": "i90plk",
+            "p": "o0-[;l",
+            "a": "qwsz",
+            "s": "awedxz",
+            "d": "serfcx",
+            "f": "drtgvc",
+            "g": "ftyhbv",
+            "h": "gyujnb",
+            "j": "huikmn",
+            "k": "jiol,m",
+            "l": "kop;.,",
+            "z": "asx",
+            "x": "sdcz",
+            "c": "dfvx",
+            "v": "cfgb",
+            "b": "vghn",
+            "n": "bhjm",
+            "m": "njk,",
+            "1": "2q`",
+            "2": "13wq",
+            "3": "24ew",
+            "4": "35re",
+            "5": "46tr",
+            "6": "57yt",
+            "7": "68uy",
+            "8": "79iu",
+            "9": "80oi",
+            "0": "9-po",
+            "-": "0=[p",
+            "=": "-][",
+            "[": "-=]';p",
+            "]": "[=\\'",
+            ";": "lp['/.",
+            "'": ";[]/",
+            ",": "mkl.",
+            ".": ",l;/",
+            "/": ".;'",
+            "\\": "]",
+        }
+    else:
+        message = f"Unsupported keyboard layout {layout}."
+        raise ValueError(message)
+
+    if error_period < 1:
+        message = "error_period smaller than 1 is invalid, as multiple errors per word are not supported."
+        raise ValueError(message)
+
+    splits = input_text.split(" ")
+    # draw only from splits that have a content
+    valid_positions = [i for i, w in enumerate(splits) if w]
+    if not valid_positions:
+        # empty or all-space text: random.sample would raise on an empty population
+        return input_text
+
+    n_draws = max(len(valid_positions) // error_period, 1)
+    for p in random.sample(valid_positions, n_draws):
+        word = splits[p]  # select the to-be-corrupted word
+        char_position = random.randrange(len(word))
+        char = word[char_position]
+        new_char = random.choice(neighbors.get(char.lower(), [char.lower()]))  # unmapped characters stay as they are
+        new_char = new_char.upper() if char.isupper() else new_char
+        splits[p] = word[:char_position] + new_char + word[char_position + 1 :]
+
+    return " ".join(splits)
diff --git a/error_generation/error_type/mojibake.py b/error_generation/error_type/mojibake.py
@@ -23,8 +23,7 @@ def _check_type(table: pd.DataFrame, column: int | str) -> None:
             msg = f"Column {column} does not contain values of the string dtype. Cannot insert Mojibake."
             raise TypeError(msg)
 
-    @staticmethod
-    def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
+    def _apply(self: Mojibake, table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
         # Top 10 most used encodings on the internet
         # https://w3techs.com/technologies/overview/character_encoding
         top10 = {"utf_8", "iso-8859-1", "windows-1252", "windows-1251", "shift_jis", "euc_jp", "gb2312", "euc_kr", "windows-1250", "iso-8859-2"}
@@ -44,8 +43,12 @@ def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) ->
         }
 
         series = get_column(table, column).copy()
-        encoding_sender = random.choice(list(top10))
-        encoding_receiver = random.choice(list(encodings[encoding_sender]))
+        encoding_sender = self.config.encoding_sender
+        encoding_receiver = self.config.encoding_receiver
+
+        if encoding_sender is None or encoding_receiver is None:
+            encoding_sender = random.choice(list(top10))
+            encoding_receiver = random.choice(list(encodings[encoding_sender]))
 
         series_mask = get_column(error_mask, column)
         series.loc[series_mask] = (