Skip to content

Commit

Permalink
Add a Permutation Error Type (#5)
Browse files Browse the repository at this point in the history
The Permutation error type shuffles the parts of strings that are delimited
by a separator. It supports arbitrary separators and two modes of shuffling:

1) *fixed* mode shuffles every cell in a column following the same
   reordering pattern.
2) *random* mode shuffles each cell in a column independently.
  • Loading branch information
philipp-jung authored Jun 25, 2024
1 parent 2262187 commit 26434bb
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 5 deletions.
1 change: 1 addition & 0 deletions error_generation/error_type/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
from .mislabel import Mislabel
from .missing import MissingValue
from .mojibake import Mojibake
from .permutate import Permutate
from .wrong_unit import WrongUnit
72 changes: 72 additions & 0 deletions error_generation/error_type/permutate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from __future__ import annotations

import random
from typing import TYPE_CHECKING, Callable

from pandas.api.types import is_string_dtype

from error_generation.error_type import ErrorType
from error_generation.utils import get_column

if TYPE_CHECKING:
import pandas as pd


def fixed_shuffle_pattern(format_len: int, permutation_separator: str) -> Callable[[str], str]:
    """Return a function that reorders separated strings following one fixed pattern.

    A random, non-identity permutation of ``format_len + 1`` positions is drawn once.
    The returned function applies that same permutation to every string it receives.

    Args:
        format_len: Number of separators per value, i.e. every value has ``format_len + 1`` parts.
        permutation_separator: Separator the values are split on and re-joined with.

    Returns:
        A function mapping a separated string to its permuted counterpart.

    Raises:
        ValueError: If ``format_len`` is smaller than 1. The returned function raises
            ValueError when a value does not consist of exactly ``format_len + 1`` parts.
    """
    if format_len < 1:
        # Fewer than two parts only have the identity permutation, so the
        # rejection loop below would never terminate.
        msg = f"Cannot create a shuffle pattern for {format_len + 1} part(s): at least two parts are required."
        raise ValueError(msg)

    initial_pattern = list(range(format_len + 1))  # list that indicates the positions of each value
    new_pattern = initial_pattern

    # Re-draw until the permutation actually moves at least one part.
    while new_pattern == initial_pattern:
        new_pattern = random.sample(initial_pattern, len(initial_pattern))

    def shuffle_pattern(old_string: str) -> str:
        """Apply the fixed permutation to the parts of ``old_string``."""
        old_list = old_string.split(permutation_separator)
        if len(old_list) != len(new_pattern):
            # A mismatch would otherwise drop parts silently or fail with a bare IndexError.
            msg = f'Value "{old_string}" has {len(old_list)} part(s), but the fixed pattern expects {len(new_pattern)}.'
            raise ValueError(msg)

        new = [""] * len(old_list)
        # The part at position i moves to position new_pattern[i].
        for i, n in enumerate(new_pattern):
            new[n] = old_list[i]
        return permutation_separator.join(new)

    return shuffle_pattern


class Permutate(ErrorType):
    """Permutates the values in a column.

    Every value is split on ``config.permutation_separator`` and its parts are
    reordered. With ``config.permutation_pattern == "fixed"`` all values are
    reordered following the same pattern; with ``"random"`` each value is
    shuffled independently.
    """

    @staticmethod
    def _check_type(table: pd.DataFrame, column: int | str) -> None:
        """Raise a TypeError if ``column`` of ``table`` is not of a string dtype."""
        series = get_column(table, column)

        if not is_string_dtype(series):
            msg = f"Column {column} does not contain values of the string dtype. Cannot Permutate values."
            raise TypeError(msg)

    def _apply(self: Permutate, table: pd.DataFrame, error_mask: pd.DataFrame, column: int | str) -> pd.Series:
        """Return a copy of ``column`` in which the cells selected by ``error_mask`` are permuted.

        Args:
            table: Table that contains the column to corrupt.
            error_mask: Table whose ``column`` is used as a boolean selector for the cells to permute.
            column: Column to apply the permutation to.

        Returns:
            A copy of the column with permuted values. Missing values are left untouched.

        Raises:
            ValueError: If ``permutation_pattern`` is neither "fixed" nor "random", if a
                non-missing value does not contain the separator, or if the "fixed" pattern
                is requested while the values contain differing numbers of separators.
        """
        series = get_column(table, column).copy()
        series_mask = get_column(error_mask, column)
        separator = self.config.permutation_separator
        pattern = self.config.permutation_pattern

        if pattern not in ("fixed", "random"):
            # Without this guard, `shuffle_pattern` would be unbound further down.
            msg = f'Unknown permutation_pattern "{pattern}". Supported patterns are "fixed" and "random".'
            raise ValueError(msg)

        separator_counts = []
        for value in series.dropna():
            count = value.count(separator)
            if count == 0:
                # Report the offending value itself; positions in the NaN-free
                # series do not correspond to labels of the original series.
                msg = f'Cannot permutate values, because column {column} contains value "{value}" that is not separated by the separator '
                msg += f'"{separator}". To use another separator, define it in the ErrorTypeConfig.'
                raise ValueError(msg)
            separator_counts.append(count)

        if not separator_counts:
            # Empty column or only missing values: there is nothing to permute.
            return series

        if pattern == "fixed":
            if len(set(separator_counts)) > 1:
                msg = f"Column {column} cannot be permutated using a fixed permutation_pattern: A fixed permutation_pattern requires all values "
                msg += "to be formatted in the same way."
                raise ValueError(msg)
            shuffle_pattern = fixed_shuffle_pattern(separator_counts[0], separator)
        else:

            def shuffle_pattern(old_string: str) -> str:
                """Randomly reorder the parts of ``old_string`` so that the order changes."""
                old_list = old_string.split(separator)
                if len(set(old_list)) < 2:
                    # All parts are identical, so every ordering equals the original;
                    # the rejection loop below would never terminate.
                    return old_string
                new = old_list
                while new == old_list:
                    new = random.sample(old_list, len(old_list))
                return separator.join(new)

        def permute_cell(value: object) -> object:
            """Permute string cells and pass missing values through unchanged."""
            return shuffle_pattern(value) if isinstance(value, str) else value

        series.loc[series_mask] = series.loc[series_mask].apply(permute_cell)
        return series
5 changes: 5 additions & 0 deletions error_generation/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class ErrorTypeConfig:
mislabel_weighing: Weight of the distribution that mislabels are drawn from. Either "uniform", "frequency" or "custom".
mistype_dtype: Pandas dtype of the column that is incorrectly typed.
wrong_unit_scaling: Function that scales a value from one unit to another.
permutation_separator: A string that separates structured text, e.g. ' ' in an address or '-' in a date.
permutation_pattern: Either "fixed" (all values are permuted following the same pattern) or "random" (each value is permuted independently).
"""

encoding_sender: str | None = None
Expand All @@ -40,6 +42,9 @@ class ErrorTypeConfig:

wrong_unit_scaling: Callable | None = None

permutation_separator: str = " "
permutation_pattern: str = "random"


def get_column(table: pd.DataFrame, column: int | str) -> pd.Series:
"""Selects a column from a dataframe and returns it as a series."""
Expand Down
136 changes: 131 additions & 5 deletions samples.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,19 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 45,
"id": "a06b4cd8-6bd2-4321-a4ac-2beea45d67db",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
Expand All @@ -21,7 +30,7 @@
},
{
"cell_type": "code",
"execution_count": 123,
"execution_count": 46,
"id": "3b62f455-52b2-474b-9ba0-ba4d24e56ae1",
"metadata": {},
"outputs": [],
Expand All @@ -30,19 +39,136 @@
"\n",
"from error_generation.api.low_level import create_errors\n",
"from error_generation.error_mechanism import ECAR\n",
"from error_generation.error_type import Butterfinger, Mislabel, MissingValue, Mojibake, WrongUnit"
"from error_generation.error_type import Butterfinger, Mislabel, MissingValue, Mojibake, Permutate, WrongUnit"
]
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 47,
"id": "f09057a2-ae58-4012-bd4a-82b678aabfa0",
"metadata": {},
"outputs": [],
"source": [
"ecar = ECAR(error_rate=1.0)"
]
},
{
"cell_type": "markdown",
"id": "0ff517d2-761d-435c-ac21-e2d6cf3bc411",
"metadata": {},
"source": [
"## Permutation"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "dff2611e-b16b-4104-ba1d-4696a18c8330",
"metadata": {},
"outputs": [],
"source": [
"data = {\"A\": [\"apple\", \"banana\", \"cherry\", \"pineapple\"], \"B\": [\"red apple\", \"yellow banana\", \"dark cherry\", \"blue pineapple\"], \"C\": [10, 20, 30, 40]}\n",
"df_permutate = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "16529889-cc14-48ae-914a-53a99b1ed676",
"metadata": {},
"outputs": [],
"source": [
"permutate = Permutate({\"permutation_separator\": \" \", \"permutation_pattern\": \"fixed\"})"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "9b399f77-83da-470f-910b-5b161566577a",
"metadata": {},
"outputs": [],
"source": [
"df_corrupted, error_mask = create_errors(df_permutate, \"B\", ecar, permutate)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "1221cb45-2f54-4167-8ebb-26b4f6723555",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>apple</td>\n",
" <td>apple red</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>banana</td>\n",
" <td>banana yellow</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>cherry</td>\n",
" <td>cherry dark</td>\n",
" <td>30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>pineapple</td>\n",
" <td>pineapple blue</td>\n",
" <td>40</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C\n",
"0 apple apple red 10\n",
"1 banana banana yellow 20\n",
"2 cherry cherry dark 30\n",
"3 pineapple pineapple blue 40"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_corrupted"
]
},
{
"cell_type": "markdown",
"id": "bdd6cfc7-2c00-4956-b10f-cdd154f00955",
Expand Down

0 comments on commit 26434bb

Please sign in to comment.