From 32250d18190f6234fbbeda59f78cfdc2a0e3fe03 Mon Sep 17 00:00:00 2001 From: Philipp Jung Date: Fri, 5 Jul 2024 11:17:39 +0200 Subject: [PATCH] Make EAR and ENAR handle non-empty error_masks When implementing the mid-level API, I forgot to add capabilities to ENAR and EAR to correctly resolve non-empty error_masks. I changed this now: EAR and ENAR will only draw new errors for cells that are not already error positions. There is error handling in case the user requests more errors than there are error-free cells available. --- error_generation/api/mid_level.py | 2 +- error_generation/error_mechanism/_ear.py | 14 +- error_generation/error_mechanism/_ecar.py | 2 +- error_generation/error_mechanism/_enar.py | 23 ++- error_generation/utils/utils.py | 2 +- samples.ipynb | 194 +++++++++++----------- 6 files changed, 131 insertions(+), 106 deletions(-) diff --git a/error_generation/api/mid_level.py b/error_generation/api/mid_level.py index 55708b9..6b796bd 100644 --- a/error_generation/api/mid_level.py +++ b/error_generation/api/mid_level.py @@ -27,6 +27,6 @@ def create_errors(table: pd.DataFrame, config: MidLevelConfig) -> tuple[pd.DataF old_error_mask = error_mask.copy() error_mask = error_mechanism.sample(table, column, error_rate, error_mask) - series = error_type.apply(table, old_error_mask != error_mask, column) + series = error_type.apply(table_dirty, old_error_mask != error_mask, column) set_column(table_dirty, column, series) return table_dirty, error_mask diff --git a/error_generation/error_mechanism/_ear.py b/error_generation/error_mechanism/_ear.py index ceb7d3f..285f192 100644 --- a/error_generation/error_mechanism/_ear.py +++ b/error_generation/error_mechanism/_ear.py @@ -35,10 +35,20 @@ def _sample(self: EAR, data: pd.DataFrame, column: str | int, error_rate: float, se_mask = get_column(error_mask, column) n_errors = int(se_data.size * error_rate) - upper_bound = len(se_data) - n_errors + se_mask_error_free = se_mask[~se_mask] + data_column_error_free = 
data.loc[se_mask_error_free.index, :] + + if len(se_mask_error_free) < n_errors: + msg = f"The error rate of {error_rate} requires {n_errors} error-free cells. " + msg += f"However, only {len(se_mask_error_free)} error-free cells are available." + raise ValueError(msg) + + # we offset the upper bound of the lower_error_index by a) the existing number of errors in the column, and b) the number of errors to be generated. + upper_bound = len(se_data) - sum(se_mask) - n_errors lower_error_index = np.random.default_rng(self.seed).integers(0, upper_bound) if upper_bound > 0 else 0 error_index_range = range(lower_error_index, lower_error_index + n_errors) + selected_rows = data_column_error_free.sort_values(by=condition_to_column).iloc[error_index_range, :] - se_mask.loc[data.sort_values(by=condition_to_column).index[error_index_range]] = True + se_mask.loc[selected_rows.index] = True return error_mask diff --git a/error_generation/error_mechanism/_ecar.py b/error_generation/error_mechanism/_ecar.py index 768ed4a..a90b155 100644 --- a/error_generation/error_mechanism/_ecar.py +++ b/error_generation/error_mechanism/_ecar.py @@ -26,7 +26,7 @@ def _sample(self: ECAR, data: pd.DataFrame, column: str | int, error_rate: float n_errors = int(se_mask.size * error_rate) if len(se_mask_error_free) < n_errors: - msg = f"The error rate of {error_rate} requires {len(se_mask_error_free)} error-free cells. " + msg = f"The error rate of {error_rate} requires {n_errors} error-free cells. " msg += f"However, only {len(se_mask_error_free)} error-free cells are available." 
raise ValueError(msg) diff --git a/error_generation/error_mechanism/_enar.py b/error_generation/error_mechanism/_enar.py index 17cf931..1d90641 100644 --- a/error_generation/error_mechanism/_enar.py +++ b/error_generation/error_mechanism/_enar.py @@ -21,17 +21,24 @@ def _sample(self: ENAR, data: pd.DataFrame, column: str | int, error_rate: float if self.condition_to_column is not None: warnings.warn("'condition_to_column' is set but will be ignored by ENAR.", stacklevel=1) - # distribute errors equally over all columns - n_errors = int(se_data.size * error_rate) + n_errors = int(len(se_data) * error_rate) - if n_errors < len(se_data): # noqa: SIM108 - lower_error_index = np.random.default_rng(seed=self.seed).integers(0, len(se_data) - n_errors) - else: # all cells are errors + # if mid-level or high-level API call ENAR, the error_mask already contains errors. Below we make sure that we only sample rows that do not + # already contain errors. + se_data_error_free = se_data[~se_mask] + + if len(se_data_error_free) < n_errors: + msg = f"The error rate of {error_rate} requires {n_errors} error-free cells. " + msg += f"However, only {len(se_data_error_free)} error-free cells are available." 
+ raise ValueError(msg) + + if len(se_data_error_free) != n_errors: # noqa: SIM108 + lower_error_index = np.random.default_rng(seed=self.seed).integers(0, len(se_data_error_free) - n_errors) + else: lower_error_index = 0 error_index_range = range(lower_error_index, lower_error_index + n_errors) + selected_rows = se_data_error_free.sort_values().iloc[error_index_range] - se_mask.loc[se_data.sort_values().index[error_index_range]] = True - - # TODO(PJ): Remember to run if isinstance(seed, int): seed += 1 in mid-level API + se_mask.loc[selected_rows.index] = True return error_mask diff --git a/error_generation/utils/utils.py b/error_generation/utils/utils.py index 8fa7e1b..3a3e4e0 100644 --- a/error_generation/utils/utils.py +++ b/error_generation/utils/utils.py @@ -63,7 +63,7 @@ class ErrorTypeConfig: keyboard_layout: str = "ansi-qwerty" error_period: int = 10 - na_value = None + na_value: str | None = None mislabel_weighing: str = "uniform" mislabel_weights: dict[Any, float] | None = None diff --git a/samples.ipynb b/samples.ipynb index 762201f..0be6a13 100644 --- a/samples.ipynb +++ b/samples.ipynb @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 3, "id": "e4cea319-2cce-4639-8b1e-aa9a8f8d5fdd", "metadata": {}, "outputs": [], @@ -127,15 +127,15 @@ " \n", " \n", " 0\n", - " Aservice-01-02-2024\n", + " Aservice-2024-02-01\n", " \n", " \n", " 1\n", - " Aservice-02-02-2024\n", + " Aservice-2024-02-02\n", " \n", " \n", " 2\n", - " Aservice-03-02-2024\n", + " Aservice-2024-02-03\n", " \n", " \n", " 3\n", @@ -147,15 +147,15 @@ " \n", " \n", " 5\n", - " Bservice-2024-02-03\n", + " Bservice-03-02-2024\n", " \n", " \n", " 6\n", - " Cservice-2024-02-01\n", + " Cservice-01-02-2024\n", " \n", " \n", " 7\n", - " Cservice-2024-02-02\n", + " Cservice-02-02-2024\n", " \n", " \n", " 8\n", @@ -167,14 +167,14 @@ ], "text/plain": [ " service\n", - "0 Aservice-01-02-2024\n", - "1 Aservice-02-02-2024\n", - "2 Aservice-03-02-2024\n", + "0 
Aservice-2024-02-01\n", + "1 Aservice-2024-02-02\n", + "2 Aservice-2024-02-03\n", "3 Bservice-2024-02-01\n", "4 Bservice-2024-02-02\n", - "5 Bservice-2024-02-03\n", - "6 Cservice-2024-02-01\n", - "7 Cservice-2024-02-02\n", + "5 Bservice-03-02-2024\n", + "6 Cservice-01-02-2024\n", + "7 Cservice-02-02-2024\n", "8 Cservice-2024-02-03" ] }, @@ -198,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 5, "id": "0ef70d60-cf62-4b66-856d-43db6f4a9378", "metadata": {}, "outputs": [ @@ -225,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 6, "id": "bd456d16-a8cb-4a3d-b496-35c755fd25ac", "metadata": {}, "outputs": [ @@ -258,17 +258,17 @@ " \n", " 0\n", " Alice\n", - " To Kill q Mockingbird\n", + " To Kil; a Mockingbird\n", " \n", " \n", " 1\n", " Alice\n", - " 1983\n", + " 1i84\n", " \n", " \n", " 2\n", " Alice\n", - " Pride ans Prejudice\n", + " Pride wnd Prejudice\n", " \n", " \n", " 3\n", @@ -291,15 +291,15 @@ ], "text/plain": [ " typist book_title\n", - "0 Alice To Kill q Mockingbird\n", - "1 Alice 1983\n", - "2 Alice Pride ans Prejudice\n", + "0 Alice To Kil; a Mockingbird\n", + "1 Alice 1i84\n", + "2 Alice Pride wnd Prejudice\n", "3 Bob The Great Gatsby\n", "4 Bob Moby-Dick\n", "5 Bob The Catcher in the Rye" ] }, - "execution_count": 28, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -319,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 7, "id": "cb40305c-daaa-42fc-bf90-64d4f6d7861d", "metadata": {}, "outputs": [], @@ -337,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 8, "id": "41464b03-b38c-4607-92cd-59165d01965d", "metadata": {}, "outputs": [ @@ -370,7 +370,7 @@ " \n", " 0\n", " Alice\n", - " ¿Cómo estás?\n", + " ¿Cómo estás?\n", " \n", " \n", " 1\n", @@ -380,12 +380,12 @@ " \n", " 2\n", " Bob\n", - " 今日はどうですか\n", + " 今日はどうですか\n", " \n", " \n", " 3\n", " Bob\n", - " Ça va bien, merci.\n", + " Ça va 
bien, merci.\n", " \n", " \n", " 4\n", @@ -395,7 +395,7 @@ " \n", " 5\n", " David\n", - " Ich hätte Hunger.\n", + " Ich hätte Hunger.\n", " \n", " \n", "\n", @@ -403,15 +403,15 @@ ], "text/plain": [ " user content\n", - "0 Alice ¿Cómo estás?\n", + "0 Alice ¿Cómo estás?\n", "1 Alice Привет, как дела?\n", - "2 Bob 今日はどうですか\n", - "3 Bob Ça va bien, merci.\n", + "2 Bob 今日はどうですか\n", + "3 Bob Ça va bien, merci.\n", "4 Clara ¡Nos vemos mañana!\n", - "5 David Ich hätte Hunger." + "5 David Ich hätte Hunger." ] }, - "execution_count": 30, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -441,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "id": "98633b5e-957c-4f2b-813f-208fbed6d855", "metadata": {}, "outputs": [], @@ -454,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 10, "id": "a7d421e5-1103-4cde-849b-adb689043081", "metadata": {}, "outputs": [ @@ -510,7 +510,7 @@ "2 3.0 blau" ] }, - "execution_count": 32, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -529,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 11, "id": "dff2611e-b16b-4104-ba1d-4696a18c8330", "metadata": {}, "outputs": [], @@ -542,7 +542,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 12, "id": "1221cb45-2f54-4167-8ebb-26b4f6723555", "metadata": {}, "outputs": [ @@ -609,7 +609,7 @@ "3 pineapple pineapple blue 40" ] }, - "execution_count": 34, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -628,7 +628,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 13, "id": "92c3a871-3078-4552-b9e6-d583e36e2ec2", "metadata": {}, "outputs": [], @@ -640,7 +640,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 14, "id": "288759e4-f634-49d6-a285-deb0e0abf999", "metadata": {}, "outputs": [ @@ -696,7 +696,7 @@ "2 2 Grnfelder Strae 17, 13357 ppeln" ] }, - 
"execution_count": 36, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -715,7 +715,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 15, "id": "8f5fa21b-0af8-43ae-9ac4-daa7c09c592a", "metadata": {}, "outputs": [], @@ -727,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 16, "id": "5fb3f20d-8e30-47fa-aa8d-aeb541264b81", "metadata": {}, "outputs": [ @@ -760,17 +760,17 @@ " \n", " 0\n", " 0\n", - " Entspannujg\n", + " Wntspannung\n", " \n", " \n", " 1\n", " 1\n", - " Genigtuung\n", + " Genugtuumg\n", " \n", " \n", " 2\n", " 2\n", - " Ausgeglichenbeit\n", + " Ausgeglichemheit\n", " \n", " \n", "\n", @@ -778,12 +778,12 @@ ], "text/plain": [ " a b\n", - "0 0 Entspannujg\n", - "1 1 Genigtuung\n", - "2 2 Ausgeglichenbeit" + "0 0 Wntspannung\n", + "1 1 Genugtuumg\n", + "2 2 Ausgeglichemheit" ] }, - "execution_count": 38, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -802,7 +802,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 17, "id": "53d9ebc6-7e12-4736-9734-babf114fa479", "metadata": {}, "outputs": [], @@ -814,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 18, "id": "656082b2-8cac-495a-aa59-eb662888cfc5", "metadata": {}, "outputs": [ @@ -870,7 +870,7 @@ "2 2 0.06" ] }, - "execution_count": 40, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -889,7 +889,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 19, "id": "6332c825-5cf3-421c-bb2e-f03cb55c34e6", "metadata": {}, "outputs": [], @@ -902,7 +902,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 20, "id": "8f064b77-b988-4c60-8363-42f93c90439c", "metadata": {}, "outputs": [ @@ -958,7 +958,7 @@ "2 3 gelb" ] }, - "execution_count": 42, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -977,7 +977,7 @@ }, { "cell_type": "code", - 
"execution_count": 43, + "execution_count": 21, "id": "5a0b6f34-6d5d-4e3a-9cd3-295fe2b0f1bd", "metadata": {}, "outputs": [], @@ -989,7 +989,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 22, "id": "73e37424-6433-42ce-b85e-58c5510c48c9", "metadata": {}, "outputs": [ @@ -1045,7 +1045,7 @@ "2 3 None" ] }, - "execution_count": 44, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1064,7 +1064,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 23, "id": "4d2eae7e-8ad5-4f0f-bd15-00f7001da275", "metadata": {}, "outputs": [], @@ -1076,7 +1076,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 24, "id": "e8e3ab8e-19f0-4b1e-9e58-b56771d783d2", "metadata": {}, "outputs": [ @@ -1132,7 +1132,7 @@ "2 3 11/10 6 p.m." ] }, - "execution_count": 46, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1151,7 +1151,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 25, "id": "3045944f-f9f1-46e9-8bf4-3d9d56677425", "metadata": {}, "outputs": [], @@ -1163,7 +1163,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 26, "id": "48d3ea05-9670-4b11-856e-7d5678d6db45", "metadata": {}, "outputs": [ @@ -1219,7 +1219,7 @@ "2 3 6 p.m." 
] }, - "execution_count": 49, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1238,7 +1238,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 27, "id": "3b9a33cd-a512-44aa-939b-a51a86f6193d", "metadata": {}, "outputs": [], @@ -1252,7 +1252,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 28, "id": "1a29bbeb-6395-422d-8622-7acbd5cd3c2a", "metadata": {}, "outputs": [ @@ -1319,7 +1319,7 @@ "3 pineapple blue pineapple 40.00001" ] }, - "execution_count": 35, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1361,7 +1361,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 29, "id": "ff1393bf-2ac6-41bc-a341-a384b74aad5a", "metadata": {}, "outputs": [], @@ -1385,7 +1385,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 30, "id": "63f1e8e0-68ae-4151-8c2b-2c7e69bee3ab", "metadata": {}, "outputs": [ @@ -1417,27 +1417,27 @@ " \n", " \n", " 0\n", - " Alice\n", + " None\n", " To Kill a Mockingbird\n", " \n", " \n", " 1\n", - " Alice\n", - " 2984\n", + " None\n", + " 1o84\n", " \n", " \n", " 2\n", " None\n", - " Pride anr Prejudice\n", + " Prkde and Prejudice\n", " \n", " \n", " 3\n", - " None\n", + " Bob\n", " The Great Gatwby\n", " \n", " \n", " 4\n", - " None\n", + " Bob\n", " Moby-Dick\n", " \n", " \n", @@ -1451,15 +1451,15 @@ ], "text/plain": [ " typist book_title\n", - "0 Alice To Kill a Mockingbird\n", - "1 Alice 2984\n", - "2 None Pride anr Prejudice\n", - "3 None The Great Gatwby\n", - "4 None Moby-Dick\n", + "0 None To Kill a Mockingbird\n", + "1 None 1o84\n", + "2 None Prkde and Prejudice\n", + "3 Bob The Great Gatwby\n", + "4 Bob Moby-Dick\n", "5 Bob The Catcher in the Rye" ] }, - "execution_count": 46, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1470,7 +1470,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 31, "id": 
"b7553e47-dac9-4a01-aa27-f64f3d5464a2", "metadata": {}, "outputs": [ @@ -1502,12 +1502,12 @@ " \n", " \n", " 0\n", - " False\n", + " True\n", " False\n", " \n", " \n", " 1\n", - " False\n", + " True\n", " True\n", " \n", " \n", @@ -1517,12 +1517,12 @@ " \n", " \n", " 3\n", - " True\n", + " False\n", " True\n", " \n", " \n", " 4\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1536,15 +1536,15 @@ ], "text/plain": [ " typist book_title\n", - "0 False False\n", - "1 False True\n", + "0 True False\n", + "1 True True\n", "2 True True\n", - "3 True True\n", - "4 True False\n", + "3 False True\n", + "4 False False\n", "5 False False" ] }, - "execution_count": 47, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1552,6 +1552,14 @@ "source": [ "error_mask" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b11375e-e892-48dd-baab-da6581cd002e", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {