Skip to content

Commit

Permalink
Add Butterfinger, change the API
Browse files Browse the repository at this point in the history
This was intended to add the Butterfinger error type, which it does. It
also completely changes how ErrorMechanism and ErrorType are configured.

The idea is that the abstract base classes ErrorMechanism and ErrorType
become stateful, and that they are configured upon initialization.

I made this change because I figured that different ErrorTypes need
very different configurations. Having a dataclass that stores that
configuration allows us to control all parameters in one place, which
I believe will be handy when adding more ErrorTypes in the future.

This does, however, change how error_generation can be used; check
samples.ipynb for reference.
  • Loading branch information
Philipp Jung authored and philipp-jung committed May 14, 2024
1 parent 6c9e801 commit aabd16b
Show file tree
Hide file tree
Showing 10 changed files with 500 additions and 207 deletions.
21 changes: 9 additions & 12 deletions error_generation/api/low_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,28 @@

from typing import TYPE_CHECKING

from error_generation.utils import ErrorConfig, set_column
from error_generation.utils import set_column

if TYPE_CHECKING:
import pandas as pd

from error_generation.error_mechanism import ErrorMechanism
from error_generation.error_type import ErrorType

def create_errors(
table: pd.DataFrame,
column: str | int,
error_config: ErrorConfig | dict,
) -> tuple[pd.DataFrame, pd.DataFrame]:

def create_errors(table: pd.DataFrame, column: str | int, error_mechanism: ErrorMechanism, error_type: ErrorType) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Creates errors in a given column of a pandas DataFrame.
Args:
table: The pandas DataFrame to create errors in.
column: The column to create errors in.
error_config: The error configuration to use.
error_mechanism: The mechanism, controls the error distribution.
error_type: The type of the error that will be distributed.
Returns:
A tuple of the original DataFrame and the error mask.
"""
if isinstance(error_config, dict):
error_config = ErrorConfig(**error_config)

error_mask = error_config.mechanism.sample(table, error_config.error_rate, error_config.condition_to_column, seed=None)
series = error_config.error_type.apply(table, error_mask, column)
error_mask = error_mechanism.sample(table, seed=None)
series = error_type.apply(table, error_mask, column)
set_column(table, column, series)
return table, error_mask
31 changes: 17 additions & 14 deletions error_generation/error_mechanism/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,32 @@

import pandas as pd

from error_generation.utils import ErrorMechanismConfig

if TYPE_CHECKING:
from pandas._typing import Dtype


class NotInstantiableError(Exception):
def __init__(self: NotInstantiableError) -> None:
super().__init__("This class is not meant to be instantiated.")


class ErrorMechanism(ABC):
def __init__(self: ErrorMechanism) -> None:
raise NotInstantiableError
def __init__(self: ErrorMechanism, config: ErrorMechanismConfig | dict) -> None:
if isinstance(config, dict):
self.config = ErrorMechanismConfig(**config)
elif isinstance(config, ErrorMechanismConfig):
self.config = config
elif config is None:
msg = "'config' need to be ErrorMechanismConfig or dict."
raise TypeError(msg)
else:
msg = "Invalid config type."
raise TypeError(msg)

@classmethod
def sample(
cls: type[ErrorMechanism],
self: ErrorMechanism,
data: pd.DataFrame,
error_rate: float,
condition_to_column: Dtype | None = None,
seed: int | None = None,
) -> pd.DataFrame:
error_rate_msg = "'error_rate' need to be float: 0 <= error_rate <= 1."
if error_rate < 0 or error_rate > 1:
if self.config.error_rate < 0 or self.config.error_rate > 1:
raise ValueError(error_rate_msg)

if not (isinstance(seed, int) or seed is None):
Expand All @@ -42,11 +45,11 @@ def sample(
raise ValueError(data_msg)

# At least two columns are necessary if we condition to another
if condition_to_column is not None and len(data.columns) < 2: # noqa: PLR2004
if self.config.condition_to_column is not None and len(data.columns) < 2: # noqa: PLR2004
msg = "'data' need at least 2 columns if 'condition_to_column' is given."
raise ValueError(msg)

return cls._sample(data=data, error_rate=error_rate, condition_to_column=condition_to_column, seed=seed)
return self._sample(data=data, error_rate=self.config.error_rate, condition_to_column=self.config.condition_to_column, seed=seed)

@staticmethod
@abstractmethod
Expand Down
1 change: 1 addition & 0 deletions error_generation/error_type/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from ._base import ErrorType
from .butterfinger import Butterfinger
from .mojibake import Mojibake
29 changes: 16 additions & 13 deletions error_generation/error_type/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,34 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from error_generation.utils import ErrorTypeConfig

if TYPE_CHECKING:
import pandas as pd


class NotInstantiableError(Exception):
def __init__(self: NotInstantiableError) -> None:
super().__init__("This class is not meant to be instantiated.")


class ErrorType(ABC):
def __init__(self: ErrorType) -> None:
raise NotInstantiableError
def __init__(self: ErrorType, config: ErrorTypeConfig | dict | None = None) -> None:
if config is None:
self.config = ErrorTypeConfig()
elif isinstance(config, dict):
self.config = ErrorTypeConfig(**config)
elif isinstance(config, ErrorTypeConfig):
self.config = config
else:
msg = "config must be of type ErrorTypeConfig or dict"
raise TypeError(msg)

@classmethod
# TODO (seja): def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, preserve_dtypes: bool = True)
# -> tuple[pd.DataFrame, pd.DataFrame]:
# 1. prüft parameters, sodass table.shape == error_mask.shape
# 2. kopiert 'table'
# 3. ruft '_get_valid_columns' auf um mögliche Spalten zu bekommen
# 4. ruft '_apply' mit 'table[valid_columns]' auf um geänderte 'table' zu bekommen
# 5. gibt gänderte 'table' und maske zurück, die anzeigt welche Zellen verändert wurden
def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
cls._check_type(table, column)
return cls._apply(table, error_mask, column)
def apply(self: ErrorType, table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
self._check_type(table, column)
return self._apply(table, error_mask, column)

@staticmethod
@abstractmethod
Expand All @@ -35,10 +39,9 @@ def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, c
def _check_type(table: pd.DataFrame, column: str | int) -> None:
pass

@staticmethod
@abstractmethod
# TODO (seja): def _apply(table: pd.DataFrame, error_mask: pd.DataFrame) -> pd.DataFrame:
# erwartet, dass 'table' ausschließlich valide columns hat. Wendet fehler stumpf auf alle Zellen an, wenn 'error_mask' True ist
# Gibt geänderte 'table' zurück.
def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
def _apply(self: ErrorType, table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
pass
148 changes: 148 additions & 0 deletions error_generation/error_type/butterfinger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
from __future__ import annotations

import random
from typing import TYPE_CHECKING

from pandas.api.types import is_string_dtype

from error_generation.error_type import ErrorType
from error_generation.utils import get_column

if TYPE_CHECKING:
import pandas as pd


class Butterfinger(ErrorType):
    """Error type that injects keyboard typos into a string column.

    The corruption itself is delegated to the module-level `butterfinger` function, which models a typist hitting a
    key that physically borders the intended one on a given keyboard layout.

    Two values are read from `self.config` when the error is applied:
        error_period: how often words inside an affected cell are corrupted (roughly one typo per `error_period`
            whitespace-separated words).
        keyboard_layout: name of the keyboard layout used to look up bordering keys.
    """

    @staticmethod
    def _check_type(table: pd.DataFrame, column: int | str) -> None:
        """Raise a TypeError unless `column` of `table` has a string dtype."""
        if is_string_dtype(get_column(table, column)):
            return

        msg = f"Column {column} does not contain values of the string dtype. Cannot apply Butterfingers."
        raise TypeError(msg)

    def _apply(self: Butterfinger, table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
        """Return a corrupted copy of `column`.

        Args:
            table: the pandas dataframe to-be-corrupted; it is not modified, the column is copied first.
            error_mask: boolean mask that marks the error positions; only its `column` is used.
            column: column into which errors shall be inserted.

        Returns:
            A copy of the column in which every masked cell was passed through `butterfinger`.
        """
        corrupted = get_column(table, column).copy()
        affected = get_column(error_mask, column)

        # Only the masked cells are rewritten; all other cells keep their original value.
        corrupted.loc[affected] = corrupted.loc[affected].apply(
            lambda text: butterfinger(text, self.config.error_period, self.config.keyboard_layout),
        )
        return corrupted


# Keys that physically border each key on an ANSI QWERTY keyboard, indexed by the lower-case character.
# Kept at module level so the table is built once instead of on every call.
_ANSI_QWERTY_NEIGHBORS = {
    "q": "12wa",
    "w": "q23esa",
    "e": "34rdsw",
    "r": "e45tfd",
    "t": "56ygfr",
    "y": "t67uhg",
    "u": "y78ijh",
    "i": "u89okj",
    "o": "i90plk",
    "p": "o0-[;l",
    "a": "qwsz",
    "s": "awedxz",
    "d": "serfcx",
    "f": "drtgvc",
    "g": "ftyhbv",
    "h": "gyujnb",
    "j": "huikmn",
    "k": "jiol,m",
    "l": "kop;.,",
    "z": "asx",
    "x": "sdcz",
    "c": "dfvx",
    "v": "cfgb",
    "b": "vghn",
    "n": "bhjm",
    "m": "njk,",
    "1": "2q`",
    "2": "13wq",
    "3": "24ew",
    "4": "35re",
    "5": "46tr",
    "6": "57yt",
    "7": "68uy",
    "8": "79iu",
    "9": "80oi",
    "0": "9-po",
    "-": "0=[p",
    "=": "-][",
    "[": "-=]';p",
    "]": "[=\\'",
    ";": "lp['/.",
    "'": ";[]/",
    ",": "mkl.",
    ".": ",l;/",
    "/": ".;'",
    "\\": "]",
}


def butterfinger(input_text: str, error_period: int = 10, layout: str = "ansi-qwerty") -> str:
    """Inserts realistic typos into a string.

    Butterfinger imitates a typist who misses the correct key. For a given keyboard-layout and key, Butterfinger maps
    all keys that physically border the given key on the given layout. It assumes that all bordering keys are equally
    likely to be hit by the typist.

    Words are assumed to be separated by single spaces. Out of all non-empty words, `max(n_words // error_period, 1)`
    distinct words are drawn at random, and in each of them one randomly chosen character is replaced by a random
    neighbor. The case of the replaced character is preserved. Characters that have no entry in the neighbor table
    (e.g. non-ASCII letters) are left unchanged, so a drawn word is not guaranteed to actually change.

    Args:
        input_text: the string to be corrupted
        error_period: specifies how frequent butterfinger corruptions are; roughly one typo per `error_period` words.
        layout: the keyboard layout to be used for the corruption. Currently, only "ansi-qwerty" is supported.

    Returns:
        the corrupted string. Input that contains no non-empty word (empty or spaces only) is returned unchanged.

    Raises:
        ValueError: if `layout` is not supported or `error_period` is smaller than 1.
    """
    if layout == "ansi-qwerty":
        neighbors = _ANSI_QWERTY_NEIGHBORS
    else:
        message = f"Unsupported keyboard layout {layout}."
        raise ValueError(message)

    if error_period < 1:
        message = "error_period smaller than 1 is invalid, as multiple errors per word are not supported."
        raise ValueError(message)

    splits = input_text.split(" ")

    # draw only from splits that have a content
    valid_positions = [i for i, w in enumerate(splits) if w]
    if not valid_positions:
        # Nothing to corrupt. Without this guard random.sample would be asked for one element
        # of an empty population and raise a ValueError.
        return input_text

    n_draws = max(len(valid_positions) // error_period, 1)

    for p in random.sample(valid_positions, n_draws):
        word = splits[p]  # select the to-be-corrupted word
        char_position = random.randrange(len(word))
        char = word[char_position]

        # Characters without known neighbors fall back to themselves, i.e. stay unchanged.
        lowered = char.lower()
        new_char = random.choice(neighbors.get(lowered, [lowered]))
        if char.isupper():
            new_char = new_char.upper()

        splits[p] = word[:char_position] + new_char + word[char_position + 1 :]

    return " ".join(splits)
11 changes: 7 additions & 4 deletions error_generation/error_type/mojibake.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ def _check_type(table: pd.DataFrame, column: int | str) -> None:
msg = f"Column {column} does not contain values of the string dtype. Cannot insert Mojibake."
raise TypeError(msg)

@staticmethod
def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
def _apply(self: Mojibake, table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
# Top 10 most used encodings on the internet
# https://w3techs.com/technologies/overview/character_encoding
top10 = {"utf_8", "iso-8859-1", "windows-1252", "windows-1251", "shift_jis", "euc_jp", "gb2312", "euc_kr", "windows-1250", "iso-8859-2"}
Expand All @@ -44,8 +43,12 @@ def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) ->
}

series = get_column(table, column).copy()
encoding_sender = random.choice(list(top10))
encoding_receiver = random.choice(list(encodings[encoding_sender]))
encoding_sender = self.config.encoding_sender
encoding_receiver = self.config.encoding_receiver

if encoding_sender is None or encoding_receiver is None:
encoding_sender = random.choice(list(top10))
encoding_receiver = random.choice(list(encodings[encoding_sender]))

series_mask = get_column(error_mask, column)
series.loc[series_mask] = (
Expand Down
Loading

0 comments on commit aabd16b

Please sign in to comment.