diff --git a/error_generation/api/low_level.py b/error_generation/api/low_level.py
index c9bb34b..f0902db 100644
--- a/error_generation/api/low_level.py
+++ b/error_generation/api/low_level.py
@@ -2,31 +2,28 @@
from typing import TYPE_CHECKING
-from error_generation.utils import ErrorConfig, set_column
+from error_generation.utils import set_column
if TYPE_CHECKING:
import pandas as pd
+ from error_generation.error_mechanism import ErrorMechanism
+ from error_generation.error_type import ErrorType
-def create_errors(
- table: pd.DataFrame,
- column: str | int,
- error_config: ErrorConfig | dict,
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+
+def create_errors(table: pd.DataFrame, column: str | int, error_mechanism: ErrorMechanism, error_type: ErrorType) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Creates errors in a given column of a pandas DataFrame.
Args:
table: The pandas DataFrame to create errors in.
column: The column to create errors in.
- error_config: The error configuration to use.
+ error_mechanism: The mechanism, controls the error distribution.
+ error_type: The type of the error that will be distributed.
Returns:
A tuple of the original DataFrame and the error mask.
"""
- if isinstance(error_config, dict):
- error_config = ErrorConfig(**error_config)
-
- error_mask = error_config.mechanism.sample(table, error_config.error_rate, error_config.condition_to_column, seed=None)
- series = error_config.error_type.apply(table, error_mask, column)
+ error_mask = error_mechanism.sample(table, seed=None)
+ series = error_type.apply(table, error_mask, column)
set_column(table, column, series)
return table, error_mask
diff --git a/error_generation/error_mechanism/_base.py b/error_generation/error_mechanism/_base.py
index ab7d861..b65292f 100644
--- a/error_generation/error_mechanism/_base.py
+++ b/error_generation/error_mechanism/_base.py
@@ -5,29 +5,32 @@
import pandas as pd
+from error_generation.utils import ErrorMechanismConfig
+
if TYPE_CHECKING:
from pandas._typing import Dtype
-class NotInstantiableError(Exception):
- def __init__(self: NotInstantiableError) -> None:
- super().__init__("This class is not meant to be instantiated.")
-
-
class ErrorMechanism(ABC):
- def __init__(self: ErrorMechanism) -> None:
- raise NotInstantiableError
+    def __init__(self: ErrorMechanism, config: ErrorMechanismConfig | dict) -> None:
+        """Store `config` in `self.config`; a dict is first expanded into an ErrorMechanismConfig."""
+        if isinstance(config, dict):
+            # expand the mapping into the dataclass' keyword arguments
+            config = ErrorMechanismConfig(**config)
+        elif not isinstance(config, ErrorMechanismConfig):
+            # None gets its own message, every other unsupported type the generic one
+            msg = "'config' need to be ErrorMechanismConfig or dict." if config is None else "Invalid config type."
+            raise TypeError(msg)
+
+        self.config = config
- @classmethod
def sample(
- cls: type[ErrorMechanism],
+ self: ErrorMechanism,
data: pd.DataFrame,
- error_rate: float,
- condition_to_column: Dtype | None = None,
seed: int | None = None,
) -> pd.DataFrame:
error_rate_msg = "'error_rate' need to be float: 0 <= error_rate <= 1."
- if error_rate < 0 or error_rate > 1:
+ if self.config.error_rate < 0 or self.config.error_rate > 1:
raise ValueError(error_rate_msg)
if not (isinstance(seed, int) or seed is None):
@@ -42,11 +45,11 @@ def sample(
raise ValueError(data_msg)
# At least two columns are necessary if we condition to another
- if condition_to_column is not None and len(data.columns) < 2: # noqa: PLR2004
+ if self.config.condition_to_column is not None and len(data.columns) < 2: # noqa: PLR2004
msg = "'data' need at least 2 columns if 'condition_to_column' is given."
raise ValueError(msg)
- return cls._sample(data=data, error_rate=error_rate, condition_to_column=condition_to_column, seed=seed)
+ return self._sample(data=data, error_rate=self.config.error_rate, condition_to_column=self.config.condition_to_column, seed=seed)
@staticmethod
@abstractmethod
diff --git a/error_generation/error_type/__init__.py b/error_generation/error_type/__init__.py
index da75825..16b9ced 100644
--- a/error_generation/error_type/__init__.py
+++ b/error_generation/error_type/__init__.py
@@ -1,2 +1,3 @@
from ._base import ErrorType
+from .butterfinger import Butterfinger
from .mojibake import Mojibake
diff --git a/error_generation/error_type/_base.py b/error_generation/error_type/_base.py
index 72cc72c..c1b1f2c 100644
--- a/error_generation/error_type/_base.py
+++ b/error_generation/error_type/_base.py
@@ -3,20 +3,24 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
+from error_generation.utils import ErrorTypeConfig
+
if TYPE_CHECKING:
import pandas as pd
-class NotInstantiableError(Exception):
- def __init__(self: NotInstantiableError) -> None:
- super().__init__("This class is not meant to be instantiated.")
-
-
class ErrorType(ABC):
- def __init__(self: ErrorType) -> None:
- raise NotInstantiableError
+    def __init__(self: ErrorType, config: ErrorTypeConfig | dict | None = None) -> None:
+        """Set `self.config` from a ready config, a dict of its fields, or the defaults when `config` is None."""
+        if config is None:
+            config = {}  # no arguments -> every field keeps its default
+        if isinstance(config, dict):
+            config = ErrorTypeConfig(**config)
+        elif not isinstance(config, ErrorTypeConfig):
+            msg = "config must be of type ErrorTypeConfig or dict"
+            raise TypeError(msg)
+        self.config = config
- @classmethod
# TODO (seja): def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, preserve_dtypes: bool = True)
# -> tuple[pd.DataFrame, pd.DataFrame]:
# 1. prüft parameters, sodass table.shape == error_mask.shape
@@ -24,9 +28,9 @@ def __init__(self: ErrorType) -> None:
# 3. ruft '_get_valid_columns' auf um mögliche Spalten zu bekommen
# 4. ruft '_apply' mit 'table[valid_columns]' auf um geänderte 'table' zu bekommen
# 5. gibt gänderte 'table' und maske zurück, die anzeigt welche Zellen verändert wurden
- def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
- cls._check_type(table, column)
- return cls._apply(table, error_mask, column)
+ def apply(self: ErrorType, table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
+ self._check_type(table, column)  # subclass hook; e.g. Butterfinger raises TypeError for non-string columns
+ return self._apply(table, error_mask, column)  # subclass hook that produces the corrupted column
@staticmethod
@abstractmethod
@@ -35,10 +39,9 @@ def apply(cls: type[ErrorType], table: pd.DataFrame, error_mask: pd.DataFrame, c
def _check_type(table: pd.DataFrame, column: str | int) -> None:
pass
- @staticmethod
@abstractmethod
# TODO (seja): def _apply(table: pd.DataFrame, error_mask: pd.DataFrame) -> pd.DataFrame:
# erwartet, dass 'table' ausschließlich valide columns hat. Wendet fehler stumpf auf alle Zellen an, wenn 'error_mask' True ist
# Gibt geänderte 'table' zurück.
- def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
+ def _apply(self: ErrorType, table: pd.DataFrame, error_mask: pd.DataFrame, column: str | int) -> pd.Series:
pass
diff --git a/error_generation/error_type/butterfinger.py b/error_generation/error_type/butterfinger.py
new file mode 100644
index 0000000..6ca27f9
--- /dev/null
+++ b/error_generation/error_type/butterfinger.py
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+import random
+from typing import TYPE_CHECKING
+
+from pandas.api.types import is_string_dtype
+
+from error_generation.error_type import ErrorType
+from error_generation.utils import get_column
+
+if TYPE_CHECKING:
+ import pandas as pd
+
+
+class Butterfinger(ErrorType):
+    """Error type that puts keyboard typos into the string cells of a column.
+
+    A typo replaces one character by a key that physically borders it on the configured keyboard layout, as if the
+    typist had missed the correct key; all bordering keys are treated as equally likely.
+
+    Words are the pieces of a cell between space characters. The config value `error_period` sets how often a word is
+    corrupted (10 by default, i.e. about every 10th word); at least one word per affected cell is drawn.
+    """
+
+    @staticmethod
+    def _check_type(table: pd.DataFrame, column: int | str) -> None:
+        """Raise a TypeError when `column` of `table` does not have a string dtype."""
+        if is_string_dtype(get_column(table, column)):
+            return
+
+        msg = f"Column {column} does not contain values of the string dtype. Cannot apply Butterfingers."
+        raise TypeError(msg)
+
+    def _apply(self: Butterfinger, table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
+        """Return a copy of `column` in which every masked cell went through `butterfinger`.
+
+        table: the pandas dataframe to-be-corrupted
+        error_mask: mask that marks the error positions; only its `column` is used
+        column: column into which errors shall be inserted
+
+        `error_period` and `keyboard_layout` are taken from `self.config`.
+        """
+        corrupted = get_column(table, column).copy()
+        affected = get_column(error_mask, column)
+        config = self.config
+
+        # assign back through the same mask so untouched cells keep their original values
+        typos = corrupted.loc[affected].apply(lambda text: butterfinger(text, config.error_period, config.keyboard_layout))
+        corrupted.loc[affected] = typos
+        return corrupted
+
+
+def butterfinger(input_text: str, error_period: int = 10, layout: str = "ansi-qwerty") -> str:
+    """Inserts realistic typos into a string.
+
+    Butterfinger imitates a typist who misses the correct key. For a given keyboard-layout and key, Butterfinger maps
+    all keys that physically border the given key on the given layout. It assumes that all bordering keys are equally
+    likely to be hit by the typist.
+
+    Words are the non-empty pieces between single space characters. One word out of every `error_period` words
+    (at least one) is picked and one of its characters is replaced. Text without any word is returned unchanged, and
+    a picked character that is not on the layout (e.g. an umlaut) is not replaced by a neighboring key.
+
+    Args:
+        input_text: the string to be corrupted
+        error_period: one word per `error_period` words is corrupted; must be at least 1.
+        layout: the keyboard layout to be used for the corruption. Currently, only "ansi-qwerty" is supported.
+
+    Returns:
+        the corrupted string
+
+    Raises:
+        ValueError: if `layout` is unsupported or `error_period` is smaller than 1.
+    """
+    if layout == "ansi-qwerty":
+        neighbors = {
+            "q": "12wa",
+            "w": "q23esa",
+            "e": "34rdsw",
+            "r": "e45tfd",
+            "t": "56ygfr",
+            "y": "t67uhg",
+            "u": "y78ijh",
+            "i": "u89okj",
+            "o": "i90plk",
+            "p": "o0-[;l",
+            "a": "qwsz",
+            "s": "awedxz",
+            "d": "serfcx",
+            "f": "drtgvc",
+            "g": "ftyhbv",
+            "h": "gyujnb",
+            "j": "huikmn",
+            "k": "jiol,m",
+            "l": "kop;.,",
+            "z": "asx",
+            "x": "sdcz",
+            "c": "dfvx",
+            "v": "cfgb",
+            "b": "vghn",
+            "n": "bhjm",
+            "m": "njk,",
+            "1": "2q`",
+            "2": "13wq",
+            "3": "24ew",
+            "4": "35re",
+            "5": "46tr",
+            "6": "57yt",
+            "7": "68uy",
+            "8": "79iu",
+            "9": "80oi",
+            "0": "9-po",
+            "-": "0=[p",
+            "=": "-][",
+            "[": "-=]';p",
+            "]": "[=\\'",
+            ";": "lp['/.",
+            "'": ";[]/",
+            ",": "mkl.",
+            ".": ",l;/",
+            "/": ".;'",
+            "\\": "]",
+        }
+    else:
+        message = f"Unsupported keyboard layout {layout}."
+        raise ValueError(message)
+
+    if error_period < 1:
+        message = "error_period smaller than 1 is invalid, as multiple errors per word are not supported."
+        raise ValueError(message)
+
+    splits = input_text.split(" ")
+    # draw only from splits that have a content
+    valid_positions = [i for i, w in enumerate(splits) if w]
+    if not valid_positions:  # empty or space-only text: random.sample would fail on an empty population
+        return input_text
+
+    n_draws = max(len(valid_positions) // error_period, 1)
+    for p in random.sample(valid_positions, n_draws):
+        word = splits[p]  # select the to-be-corrupted word
+        char_position = random.randrange(len(word))
+        char = word[char_position]
+        new_char = random.choice(neighbors.get(char.lower(), [char.lower()]))
+        if char.isupper():
+            new_char = new_char.upper()
+        splits[p] = word[:char_position] + new_char + word[char_position + 1 :]
+
+    return " ".join(splits)
diff --git a/error_generation/error_type/mojibake.py b/error_generation/error_type/mojibake.py
index d0e8c7d..92bf799 100644
--- a/error_generation/error_type/mojibake.py
+++ b/error_generation/error_type/mojibake.py
@@ -23,8 +23,7 @@ def _check_type(table: pd.DataFrame, column: int | str) -> None:
msg = f"Column {column} does not contain values of the string dtype. Cannot insert Mojibake."
raise TypeError(msg)
- @staticmethod
- def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
+ def _apply(self: Mojibake, table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
# Top 10 most used encodings on the internet
# https://w3techs.com/technologies/overview/character_encoding
top10 = {"utf_8", "iso-8859-1", "windows-1252", "windows-1251", "shift_jis", "euc_jp", "gb2312", "euc_kr", "windows-1250", "iso-8859-2"}
@@ -44,8 +43,12 @@ def _apply(table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) ->
}
series = get_column(table, column).copy()
- encoding_sender = random.choice(list(top10))
- encoding_receiver = random.choice(list(encodings[encoding_sender]))
+ encoding_sender = self.config.encoding_sender
+ encoding_receiver = self.config.encoding_receiver
+
+ if encoding_sender is None or encoding_receiver is None:
+ encoding_sender = random.choice(list(top10))
+ encoding_receiver = random.choice(list(encodings[encoding_sender]))
series_mask = get_column(error_mask, column)
series.loc[series_mask] = (
diff --git a/error_generation/samples.ipynb b/error_generation/samples.ipynb
deleted file mode 100644
index c48266b..0000000
--- a/error_generation/samples.ipynb
+++ /dev/null
@@ -1,155 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "a06b4cd8-6bd2-4321-a4ac-2beea45d67db",
- "metadata": {},
- "outputs": [],
- "source": [
- "%load_ext autoreload\n",
- "%autoreload 2"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "74855e3b-472f-4d3f-834b-adb36adc0419",
- "metadata": {},
- "source": [
- "# Use `low_level` API to create `Mojibake` in one column"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 282,
- "id": "3b62f455-52b2-474b-9ba0-ba4d24e56ae1",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "\n",
- "from error_generation.api.low_level import create_errors\n",
- "from error_generation.error_mechanism import ECAR\n",
- "from error_generation.error_type import Mojibake"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 283,
- "id": "92c3a871-3078-4552-b9e6-d583e36e2ec2",
- "metadata": {},
- "outputs": [],
- "source": [
- "config = {\"error_rate\": 1.0, \"mechanism\": ECAR, \"error_type\": Mojibake}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 286,
- "id": "3e705b44-36c4-459b-ba78-55ab83fbbb68",
- "metadata": {},
- "outputs": [],
- "source": [
- "df_mojibake = pd.DataFrame({\"a\": [0, 1, 2], \"b\": [\"Ente\", \"Haus\", \"Grünfelder Straße 17, 13357 Öppeln\"]})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 288,
- "id": "11c576a2-1cd1-4fad-829e-bc036230051c",
- "metadata": {},
- "outputs": [],
- "source": [
- "df_corrupted, error_mask = create_errors(df_mojibake, \"b\", config)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 290,
- "id": "288759e4-f634-49d6-a285-deb0e0abf999",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " a | \n",
- " b | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " Ente | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " Haus | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2 | \n",
- " Gr㉨nfelder Strae 17, 13357 ppeln | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " a b\n",
- "0 0 Ente\n",
- "1 1 Haus\n",
- "2 2 Gr㉨nfelder Strae 17, 13357 ppeln"
- ]
- },
- "execution_count": 290,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_corrupted"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/error_generation/utils/__init__.py b/error_generation/utils/__init__.py
index 9f247a7..199317d 100644
--- a/error_generation/utils/__init__.py
+++ b/error_generation/utils/__init__.py
@@ -1 +1 @@
-from .utils import ErrorConfig, get_column, set_column
+from .utils import ErrorMechanismConfig, ErrorTypeConfig, get_column, set_column
diff --git a/error_generation/utils/utils.py b/error_generation/utils/utils.py
index 624b616..f0f2d75 100644
--- a/error_generation/utils/utils.py
+++ b/error_generation/utils/utils.py
@@ -6,27 +6,42 @@
if TYPE_CHECKING:
import pandas as pd
- from error_generation.error_mechanism import ErrorMechanism
- from error_generation.error_type import ErrorType
-
@dataclass
-class ErrorConfig:
- """Parameters that describe the error and its distribution.
+class ErrorMechanismConfig:
+ """Parameters that describe the error distribution, which we call the mechanism.
Args:
error_rate: The rate at which the error occurs.
- mechanism: The mechanism that generates the error.
error_type: The type of error that is generated.
condition_to_column: The column that determines whether the error is generated.
"""
error_rate: float
- mechanism: ErrorMechanism
- error_type: ErrorType
condition_to_column: int | str | None = None
+@dataclass
+class ErrorTypeConfig:
+ """Parameters that describe the error type.
+
+ Arguments that are specific to the error type. Most error types do not share the same arguments, which
+ is why this dataclass has many attributes and every one of them has a default value.
+
+ Args:
+ encoding_sender: When creating Mojibake, used to encode strings to bytes.
+ encoding_receiver: When creating Mojibake, used to decode bytes back to strings. If either encoding is None, Mojibake draws both at random.
+ keyboard_layout: When using Butterfinger, the keyboard layout used by the typist.
+ error_period: When using Butterfinger, one word per `error_period` words gets a typo.
+ """
+
+ encoding_sender: str | None = None
+ encoding_receiver: str | None = None
+
+ keyboard_layout: str = "ansi-qwerty"
+ error_period: int = 10
+
+
def get_column(table: pd.DataFrame, column: int | str) -> pd.Series:
"""Selects a column from a dataframe and returns it as a series."""
if isinstance(column, int):
diff --git a/pyproject.toml b/pyproject.toml
index 1e65ff4..30c2f19 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ license = "Apache License 2.0"
readme = "README.md"
[tool.poetry.dependencies]
-python = "^3.10,<3.12"
+python = "^3.9,<3.12"
pandas = "^2.2.1"
[tool.poetry.group.dev.dependencies]
diff --git a/samples.ipynb b/samples.ipynb
new file mode 100644
index 0000000..a1c405f
--- /dev/null
+++ b/samples.ipynb
@@ -0,0 +1,278 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "a06b4cd8-6bd2-4321-a4ac-2beea45d67db",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "74855e3b-472f-4d3f-834b-adb36adc0419",
+ "metadata": {},
+ "source": [
+ "# Use `low_level` API to create `Mojibake` in one column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "3b62f455-52b2-474b-9ba0-ba4d24e56ae1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "from error_generation.api.low_level import create_errors\n",
+ "from error_generation.error_mechanism import ECAR\n",
+ "from error_generation.error_type import Butterfinger, Mojibake"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f09057a2-ae58-4012-bd4a-82b678aabfa0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ecar = ECAR({\"error_rate\": 1.0})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "92c3a871-3078-4552-b9e6-d583e36e2ec2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mojibake = Mojibake()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "3e705b44-36c4-459b-ba78-55ab83fbbb68",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_mojibake = pd.DataFrame({\"a\": [0, 1, 2], \"b\": [\"Ente\", \"Haus\", \"Grünfelder Straße 17, 13357 Öppeln\"]})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "11c576a2-1cd1-4fad-829e-bc036230051c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_corrupted, error_mask = create_errors(df_mojibake, \"b\", ecar, mojibake)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "288759e4-f634-49d6-a285-deb0e0abf999",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Ente | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Haus | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " Grnfelder Strae 17, 13357 ppeln | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b\n",
+ "0 0 Ente\n",
+ "1 1 Haus\n",
+ "2 2 Grnfelder Strae 17, 13357 ppeln"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_corrupted"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2aee8cbe-ea80-4e32-ac5d-d002d1198c10",
+ "metadata": {},
+ "source": [
+ "# Butterfinger"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "8f5fa21b-0af8-43ae-9ac4-daa7c09c592a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "butterfinger = Butterfinger()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "1b5a2be4-671e-465b-bad2-469f9108bea6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_butterfinger = pd.DataFrame({\"a\": [0, 1, 2], \"b\": [\"Entspannung\", \"Genugtuung\", \"Ausgeglichenheit\"]})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "438a011a-6ff7-4428-91e9-df3cf5790323",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_corrupted, error_mask = create_errors(df_butterfinger, \"b\", ecar, butterfinger)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "5fb3f20d-8e30-47fa-aa8d-aeb541264b81",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Ents;annung | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Gebugtuung | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " Ausgegl8chenheit | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b\n",
+ "0 0 Ents;annung\n",
+ "1 1 Gebugtuung\n",
+ "2 2 Ausgegl8chenheit"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_corrupted"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "82cfc1e2-0aba-4a20-8e58-cb51b015ba23",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}