From d0b3071fe882de1a99c36a33ca560b30041c9bf9 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Wed, 10 Aug 2022 19:51:30 -0700 Subject: [PATCH 1/8] Select intervals using starts instead of means (fixes the bug) --- rdt/transformers/categorical.py | 13 ++++++------- rdt/transformers/utils.py | 2 +- tests/integration/transformers/test_categorical.py | 3 ++- tests/unit/transformers/test_categorical.py | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 6d5f99939..2e78a2a8c 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -43,7 +43,6 @@ class FrequencyEncoder(BaseTransformer): starts = None means = None dtype = None - _get_category_from_index = None def __setstate__(self, state): """Replace any ``null`` key by the actual ``np.nan`` instance.""" @@ -195,15 +194,15 @@ def _transform(self, data): def _reverse_transform_by_matrix(self, data): """Reverse transform the data with matrix operations.""" num_rows = len(data) - num_categories = len(self.means) + num_categories = len(self.starts) data = np.broadcast_to(data, (num_categories, num_rows)).T - means = np.broadcast_to(self.means, (num_rows, num_categories)) - diffs = np.abs(data - means) - indexes = np.argmin(diffs, axis=1) + starts = np.broadcast_to(self.starts.index, (num_rows, num_categories)) + diffs = (data >= starts)[:, ::-1] + indexes = num_categories - np.argmax(diffs, axis=1) - 1 - self._get_category_from_index = list(self.means.index).__getitem__ - return pd.Series(indexes).apply(self._get_category_from_index).astype(self.dtype) + get_category_from_index = list(self.starts['category']).__getitem__ + return pd.Series(indexes).apply(get_category_from_index).astype(self.dtype) def _reverse_transform_by_category(self, data): """Reverse transform the data by iterating over all the categories.""" diff --git a/rdt/transformers/utils.py b/rdt/transformers/utils.py index 1f689f4c5..3c546536f 100644 --- a/rdt/transformers/utils.py +++ b/rdt/transformers/utils.py @@ -1,10 +1,10 @@ """Tools to generate strings from regular expressions.""" import re -import sre_parse import string import numpy as np +import sre_parse def _literal(character, max_repeat): diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py index dcf32755a..10ee16966 100644 --- a/tests/integration/transformers/test_categorical.py +++ b/tests/integration/transformers/test_categorical.py @@ -196,7 +196,8 @@ def test_frequency_encoder_mixed(): # run transformer.fit(data, column) - reverse = transformer.reverse_transform(transformer.transform(data)) + transformed = transformer.transform(data) + reverse = transformer.reverse_transform(transformed) # assert pd.testing.assert_frame_equal(data, reverse) diff --git a/tests/unit/transformers/test_categorical.py b/tests/unit/transformers/test_categorical.py index 774e9473d..d39147835 100644 --- a/tests/unit/transformers/test_categorical.py +++ b/tests/unit/transformers/test_categorical.py @@ -589,13 +589,13 @@ def test__reverse_transform_by_matrix_called(self, psutil_mock): @patch('psutil.virtual_memory') def test__reverse_transform_by_matrix(self, psutil_mock): - """Test the _reverse_transform_by_matrix method with numerical data + """Test the _reverse_transform_by_matrix method with numerical data. Expect that the transformed data is correctly reverse transformed. Setup: The categorical transformer is instantiated with 4 categories and means. Also patch - the `psutil.virtual_memory` function to return a large enough `available_memory`. + the ``psutil.virtual_memory`` function to return a large enough ``available_memory``. Input: - transformed data with 4 rows Ouptut: @@ -606,7 +606,7 @@ def test__reverse_transform_by_matrix(self, psutil_mock): transformed = pd.Series([0.875, 0.625, 0.375, 0.125]) transformer = FrequencyEncoder() - transformer.means = pd.Series([0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1]) + transformer.starts = pd.DataFrame({'category': [4, 3, 2, 1]}, index=[0., 0.25, 0.5, 0.75]) transformer.dtype = data.dtype virtual_memory = Mock() From e28ae55b0ef76282382d72a0053a58695a2d8645 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Thu, 11 Aug 2022 09:01:02 -0700 Subject: [PATCH 2/8] Fix forward transform noise --- rdt/transformers/categorical.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 2e78a2a8c..2545c1d51 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -127,13 +127,24 @@ def _fit(self, data): self.dtype = data.dtype self.intervals, self.means, self.starts = self._get_intervals(data) + def _clip_noised_transform(self, result, start, end): + """Clip transformed values. + + Used to ensure the noise added to transformed values doesn't make it + go out of the bounds of a given category. + + The upper bound must be slightly lower than ``end`` + so it doesn't get treated as the next category. + """ + return result.clip(start, end - 1e-9) + def _transform_by_category(self, data): """Transform the data by iterating over the different categories.""" result = np.empty(shape=(len(data), ), dtype=float) # loop over categories for category, values in self.intervals.items(): - mean, std = values[2:] + start, end, mean, std = values if category is np.nan: mask = data.isna() else: @@ -141,6 +152,7 @@ def _transform_by_category(self, data): if self.add_noise: result[mask] = norm.rvs(mean, std, size=mask.sum()) + result[mask] = self._clip_noised_transform(result[mask], start, end) else: result[mask] = mean @@ -151,10 +163,11 @@ def _get_value(self, category): if pd.isna(category): category = np.nan - mean, std = self.intervals[category][2:] + start, end, mean, std = self.intervals[category] if self.add_noise: - return norm.rvs(mean, std) + result = norm.rvs(mean, std) + return self._clip_noised_transform(result, start, end) return mean From 7f6c48d39a3ca9a4e041c3137cf3bada6f1c2e4b Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Thu, 11 Aug 2022 14:52:40 -0700 Subject: [PATCH 3/8] Add integration test + fix bug --- rdt/transformers/categorical.py | 2 +- .../transformers/test_categorical.py | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 2545c1d51..12b2a3d0d 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -136,7 +136,7 @@ def _clip_noised_transform(self, result, start, end): The upper bound must be slightly lower than ``end`` so it doesn't get treated as the next category. """ - return result.clip(start, end - 1e-9) + return np.clip(result, start, end - 1e-9) def _transform_by_category(self, data): """Transform the data by iterating over the different categories.""" diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py index 10ee16966..9650e8213 100644 --- a/tests/integration/transformers/test_categorical.py +++ b/tests/integration/transformers/test_categorical.py @@ -265,6 +265,30 @@ def test_frequency_encoder_mixed_more_rows(psutil_mock): pd.testing.assert_frame_equal(transform_data, reverse) +def test_frequency_encoder_noise(): + """Test the FrequencyEncoder with ``add_noise``. + + Ensure that the FrequencyEncoder can fit, transform, and reverse + transform when ``add_noise = True``. + + Input: + - Many rows of int data + Output: + - The reverse transformed data + """ + # setup + data = pd.DataFrame(np.random.choice(a=range(100), size=10000), columns=['column_name']) + column = 'column_name' + transformer = FrequencyEncoder(add_noise=True) + + # run + transformer.fit(data, column) + reverse = transformer.reverse_transform(transformer.transform(data)) + + # assert + pd.testing.assert_frame_equal(data, reverse) + + def test_one_hot_numerical_nans(): """Ensure OneHotEncoder works on numerical + nan only columns.""" From de3cf5045dd2844f083241d8cbd81261ec0a0f79 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Thu, 11 Aug 2022 14:55:37 -0700 Subject: [PATCH 4/8] Fix lint --- rdt/transformers/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdt/transformers/utils.py b/rdt/transformers/utils.py index 3c546536f..1f689f4c5 100644 --- a/rdt/transformers/utils.py +++ b/rdt/transformers/utils.py @@ -1,10 +1,10 @@ """Tools to generate strings from regular expressions.""" import re +import sre_parse import string import numpy as np -import sre_parse def _literal(character, max_repeat): From f59fc427fe7940f4affc8cb455d01d1f1fc80d8a Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Thu, 11 Aug 2022 15:10:02 -0700 Subject: [PATCH 5/8] Remove integration test change --- tests/integration/transformers/test_categorical.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py index 9650e8213..c6d9b03bc 100644 --- a/tests/integration/transformers/test_categorical.py +++ b/tests/integration/transformers/test_categorical.py @@ -196,8 +196,7 @@ def test_frequency_encoder_mixed(): # run transformer.fit(data, column) - transformed = transformer.transform(data) - reverse = transformer.reverse_transform(transformed) + reverse = transformer.reverse_transform(transformer.transform(data)) # assert pd.testing.assert_frame_equal(data, reverse) From e709269796f505dc78c842879f7ff6909fe072ed Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Thu, 11 Aug 2022 18:36:01 -0700 Subject: [PATCH 6/8] Make it always deterministic --- rdt/transformers/categorical.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 12b2a3d0d..2c7957cef 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -66,15 +66,6 @@ def is_transform_deterministic(self): """ return not self.add_noise - def is_composition_identity(self): - """Return whether composition of transform and reverse transform produces the input data. - - Returns: - bool: - Whether or not transforming and then reverse transforming returns the input data. - """ - return self.COMPOSITION_IS_IDENTITY and not self.add_noise - @staticmethod def _get_intervals(data): """Compute intervals for each categorical value. From 6ca09a61aa2e741648b692b25d840d5410a5454b Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Thu, 11 Aug 2022 18:42:11 -0700 Subject: [PATCH 7/8] Remove unnecessary test --- tests/unit/transformers/test_categorical.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/tests/unit/transformers/test_categorical.py b/tests/unit/transformers/test_categorical.py index d39147835..b490222cc 100644 --- a/tests/unit/transformers/test_categorical.py +++ b/tests/unit/transformers/test_categorical.py @@ -66,27 +66,6 @@ def test_is_transform_deterministic(self): # Assert assert output is False - def test_is_composition_identity(self): - """Test the ``is_composition_identity`` method. - - Since ``COMPOSITION_IS_IDENTITY`` is True, just validates that the method - returns the opposite boolean value of the ``add_noise`` parameter. - - Setup: - - initialize a ``FrequencyEncoder`` with ``add_noise = True``. - - Output: - - the boolean value which is the opposite of ``add_noise``. - """ - # Setup - transformer = FrequencyEncoder(add_noise=True) - - # Run - output = transformer.is_composition_identity() - - # Assert - assert output is False - def test__get_intervals(self): """Test the ``_get_intervals`` method. From f9c021a3e4764eaf117ce79b8e856d890e739f32 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Fri, 12 Aug 2022 16:28:30 -0700 Subject: [PATCH 8/8] Address feedback --- rdt/transformers/categorical.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 2c7957cef..fac258028 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -202,11 +202,11 @@ def _reverse_transform_by_matrix(self, data): data = np.broadcast_to(data, (num_categories, num_rows)).T starts = np.broadcast_to(self.starts.index, (num_rows, num_categories)) - diffs = (data >= starts)[:, ::-1] - indexes = num_categories - np.argmax(diffs, axis=1) - 1 + is_data_greater_than_starts = (data >= starts)[:, ::-1] + interval_indexes = num_categories - np.argmax(is_data_greater_than_starts, axis=1) - 1 get_category_from_index = list(self.starts['category']).__getitem__ - return pd.Series(indexes).apply(get_category_from_index).astype(self.dtype) + return pd.Series(interval_indexes).apply(get_category_from_index).astype(self.dtype) def _reverse_transform_by_category(self, data): """Reverse transform the data by iterating over all the categories."""