Add a Permutation Error Type (#5)

The Permutation error type shuffles the parts of strings that are delimited by a separator. It supports arbitrary separators and two modes of shuffling: 1) *fixed* mode applies the same permutation pattern to every affected cell in a column. 2) *random* mode shuffles each cell in a column independently.
calgo-lab · Jun 25, 2024 · 26434bb · 26434bb
1 parent 2262187
commit 26434bb
Show file tree

Hide file tree

Showing 4 changed files with 209 additions and 5 deletions.
diff --git a/error_generation/error_type/__init__.py b/error_generation/error_type/__init__.py
@@ -3,4 +3,5 @@
 from .mislabel import Mislabel
 from .missing import MissingValue
 from .mojibake import Mojibake
+from .permutate import Permutate
 from .wrong_unit import WrongUnit
diff --git a/error_generation/error_type/permutate.py b/error_generation/error_type/permutate.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+
+import random
+from typing import TYPE_CHECKING, Callable
+
+from pandas.api.types import is_string_dtype
+
+from error_generation.error_type import ErrorType
+from error_generation.utils import get_column
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+
+def fixed_shuffle_pattern(format_len: int, permutation_separator: str) -> Callable[[str], str]:
+    """Return a function that shuffles separated strings following one fixed pattern.
+
+    The pattern is drawn once, when this function is called, so every string passed
+    to the returned function is rearranged in exactly the same way.
+
+    Args:
+        format_len: Number of separators in each value; values consist of ``format_len + 1`` parts.
+        permutation_separator: Separator the values are split on and re-joined with.
+
+    Returns:
+        A function that maps a string to the same string with its parts rearranged.
+        It raises ``ValueError`` for a string that does not have ``format_len + 1`` parts.
+
+    Raises:
+        ValueError: If ``format_len`` is smaller than 1, because a single part cannot be rearranged.
+    """
+    if format_len < 1:
+        # With only one part the identity is the only permutation; the loop below would never end.
+        msg = f"Cannot build a permutation pattern for {format_len + 1} part(s): at least one separator is required."
+        raise ValueError(msg)
+
+    initial_pattern = list(range(format_len + 1))  # list that indicates the positions of each value
+    new_pattern = initial_pattern
+
+    # Re-draw until the pattern actually moves at least one part.
+    while new_pattern == initial_pattern:
+        new_pattern = random.sample(initial_pattern, len(initial_pattern))
+
+    def shuffle_pattern(old_string: str) -> str:
+        old_list = old_string.split(permutation_separator)
+        if len(old_list) != len(new_pattern):
+            # A mismatch would otherwise raise a bare IndexError or silently drop parts.
+            msg = f'Cannot permutate "{old_string}": expected {len(new_pattern)} parts separated by '
+            msg += f'"{permutation_separator}", but found {len(old_list)}.'
+            raise ValueError(msg)
+
+        new = [""] * len(old_list)
+        # The part at position `source` moves to position `target`.
+        for source, target in enumerate(new_pattern):
+            new[target] = old_list[source]
+        return permutation_separator.join(new)
+
+    return shuffle_pattern
+
+
+class Permutate(ErrorType):
+    """Permutates the separator-delimited parts of the string values in a column.
+
+    Two settings are read from ``self.config``: ``permutation_separator`` is the string
+    the values are split on, and ``permutation_pattern`` selects the mode. ``"fixed"``
+    rearranges every affected cell following one shared pattern, ``"random"`` shuffles
+    each affected cell independently.
+    """
+
+    @staticmethod
+    def _check_type(table: pd.DataFrame, column: int | str) -> None:
+        """Raise ``TypeError`` unless ``column`` of ``table`` has a string dtype."""
+        series = get_column(table, column)
+
+        if not is_string_dtype(series):
+            msg = f"Column {column} does not contain values of the string dtype. Cannot Permutate values."
+            raise TypeError(msg)
+
+    @staticmethod
+    def _shuffle_randomly(old_string: str, separator: str) -> str:
+        """Return ``old_string`` with its ``separator``-delimited parts in a different random order."""
+        old_list = old_string.split(separator)
+        if len(set(old_list)) < 2:  # noqa: PLR2004
+            # Every ordering of identical parts yields the same string; retrying would never terminate.
+            return old_string
+
+        new = old_list
+        while new == old_list:
+            new = random.sample(old_list, len(old_list))
+        return separator.join(new)
+
+    def _apply(self: Permutate, table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
+        """Return a copy of ``column`` in which the cells selected by ``error_mask`` are permutated.
+
+        Args:
+            table: Table that contains the column to corrupt.
+            error_mask: Table whose ``column`` selects the cells to permutate
+                (used as an indexer for ``.loc``; presumably boolean - confirm against the caller).
+            column: Name or position of the column, resolved through ``get_column``.
+
+        Returns:
+            The permutated copy of the column. Missing values are left untouched.
+
+        Raises:
+            ValueError: If ``permutation_pattern`` is neither ``"fixed"`` nor ``"random"``, if a
+                non-missing value does not contain the separator, or if the ``"fixed"`` pattern is
+                requested for values with differing numbers of separators.
+        """
+        series = get_column(table, column).copy()
+        series_mask = get_column(error_mask, column)
+        separator = self.config.permutation_separator
+        pattern = self.config.permutation_pattern
+
+        if pattern not in ("fixed", "random"):
+            # Previously an unknown mode only surfaced as an UnboundLocalError further down.
+            msg = f'Unknown permutation_pattern "{pattern}". Choose either "fixed" or "random".'
+            raise ValueError(msg)
+
+        separator_counts = []
+        for value in series.dropna():
+            count = value.count(separator)
+            if count == 0:
+                # Report the offending value itself; a position in the NaN-free list is not an index label.
+                msg = f'Cannot permutate values, because column {column} contains value "{value}" that is not separated by the separator '
+                msg += f'"{separator}". To use another separator, define it in the ErrorTypeConfig.'
+                raise ValueError(msg)
+            separator_counts.append(count)
+
+        if not separator_counts:
+            # Empty column or only missing values: there is nothing to permutate.
+            return series
+
+        if pattern == "fixed":
+            if len(set(separator_counts)) > 1:
+                msg = f"Column {column} cannot be permutated using a fixed permutation_pattern: A fixed permutation_pattern requires all values "
+                msg += "to be formatted in the same way."
+                raise ValueError(msg)
+            shuffle_pattern = fixed_shuffle_pattern(separator_counts[0], separator)
+        else:
+
+            def shuffle_pattern(old_string: str) -> str:
+                return self._shuffle_randomly(old_string, separator)
+
+        def permutate_cell(value: object) -> object:
+            # Missing values (NaN / pd.NA) are not strings and are passed through unchanged.
+            return shuffle_pattern(value) if isinstance(value, str) else value
+
+        series.loc[series_mask] = series.loc[series_mask].apply(permutate_cell)
+        return series
diff --git a/error_generation/utils/utils.py b/error_generation/utils/utils.py
@@ -23,6 +23,8 @@ class ErrorTypeConfig:
         mislabel_weighing: Weight of the distribution that mislables are drawn from. Either "uniform", "frequency" or "custom".
         mistype_dtype: Pandas dtype of the column that is incorrectly types.
         wrong_unit_scaling: Function that scales a value from one unit to another.
+        permutation_separator: A Char that separates structured text, e.g. ' ' in an address or '-' in a date.
+        permutation_pattern: Permutations either all follow the same pattern (fixed) or not (random).
     """
 
     encoding_sender: str | None = None
@@ -40,6 +42,9 @@ class ErrorTypeConfig:
 
     wrong_unit_scaling: Callable | None = None
 
+    permutation_separator: str = " "
+    permutation_pattern: str = "random"
+
 
 def get_column(table: pd.DataFrame, column: int | str) -> pd.Series:
     """Selects a column from a dataframe and returns it as a series."""

diff --git a/samples.ipynb b/samples.ipynb
@@ -2,10 +2,19 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 45,
    "id": "a06b4cd8-6bd2-4321-a4ac-2beea45d67db",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
    "source": [
     "%load_ext autoreload\n",
     "%autoreload 2"
@@ -21,7 +30,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 46,
    "id": "3b62f455-52b2-474b-9ba0-ba4d24e56ae1",
    "metadata": {},
    "outputs": [],
@@ -30,19 +39,136 @@
     "\n",
     "from error_generation.api.low_level import create_errors\n",
     "from error_generation.error_mechanism import ECAR\n",
-    "from error_generation.error_type import Butterfinger, Mislabel, MissingValue, Mojibake, WrongUnit"
+    "from error_generation.error_type import Butterfinger, Mislabel, MissingValue, Mojibake, Permutate, WrongUnit"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 47,
    "id": "f09057a2-ae58-4012-bd4a-82b678aabfa0",
    "metadata": {},
    "outputs": [],
    "source": [
     "ecar = ECAR(error_rate=1.0)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "0ff517d2-761d-435c-ac21-e2d6cf3bc411",
+   "metadata": {},
+   "source": [
+    "## Permutation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "dff2611e-b16b-4104-ba1d-4696a18c8330",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = {\"A\": [\"apple\", \"banana\", \"cherry\", \"pineapple\"], \"B\": [\"red apple\", \"yellow banana\", \"dark cherry\", \"blue pineapple\"], \"C\": [10, 20, 30, 40]}\n",
+    "df_permutate = pd.DataFrame(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "16529889-cc14-48ae-914a-53a99b1ed676",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "permutate = Permutate({\"permutation_separator\": \" \", \"permutation_pattern\": \"fixed\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "9b399f77-83da-470f-910b-5b161566577a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_corrupted, error_mask = create_errors(df_permutate, \"B\", ecar, permutate)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "1221cb45-2f54-4167-8ebb-26b4f6723555",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>A</th>\n",
+       "      <th>B</th>\n",
+       "      <th>C</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>apple</td>\n",
+       "      <td>apple red</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>banana</td>\n",
+       "      <td>banana yellow</td>\n",
+       "      <td>20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>cherry</td>\n",
+       "      <td>cherry dark</td>\n",
+       "      <td>30</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>pineapple</td>\n",
+       "      <td>pineapple blue</td>\n",
+       "      <td>40</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           A               B   C\n",
+       "0      apple       apple red  10\n",
+       "1     banana   banana yellow  20\n",
+       "2     cherry     cherry dark  30\n",
+       "3  pineapple  pineapple blue  40"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_corrupted"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "bdd6cfc7-2c00-4956-b10f-cdd154f00955",