Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure reversibility FrequencyEncoder #534

Merged
merged 8 commits into from
Aug 13, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 22 additions & 19 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ class FrequencyEncoder(BaseTransformer):
starts = None
means = None
dtype = None
_get_category_from_index = None

def __setstate__(self, state):
"""Replace any ``null`` key by the actual ``np.nan`` instance."""
Expand All @@ -67,15 +66,6 @@ def is_transform_deterministic(self):
"""
return not self.add_noise

def is_composition_identity(self):
    """Tell whether transform followed by reverse transform gives back the input.

    Returns:
        bool:
            ``COMPOSITION_IS_IDENTITY`` itself when it is falsy, otherwise
            the negation of ``add_noise``.
    """
    identity_flag = self.COMPOSITION_IS_IDENTITY
    if not identity_flag:
        # Mirrors ``and`` short-circuiting: the falsy operand is returned as is.
        return identity_flag

    return not self.add_noise

@staticmethod
def _get_intervals(data):
"""Compute intervals for each categorical value.
Expand Down Expand Up @@ -128,20 +118,32 @@ def _fit(self, data):
self.dtype = data.dtype
self.intervals, self.means, self.starts = self._get_intervals(data)

def _clip_noised_transform(self, result, start, end):
"""Clip transformed values.

Used to ensure the noise added to transformed values doesn't make it
go out of the bounds of a given category.

The upper bound must be slightly lower than ``end``
so it doesn't get treated as the next category.
"""
return np.clip(result, start, end - 1e-9)

def _transform_by_category(self, data):
"""Transform the data by iterating over the different categories."""
result = np.empty(shape=(len(data), ), dtype=float)

# loop over categories
for category, values in self.intervals.items():
mean, std = values[2:]
start, end, mean, std = values
if category is np.nan:
mask = data.isna()
else:
mask = (data.to_numpy() == category)

if self.add_noise:
result[mask] = norm.rvs(mean, std, size=mask.sum())
result[mask] = self._clip_noised_transform(result[mask], start, end)
else:
result[mask] = mean

Expand All @@ -152,10 +154,11 @@ def _get_value(self, category):
if pd.isna(category):
category = np.nan

mean, std = self.intervals[category][2:]
start, end, mean, std = self.intervals[category]

if self.add_noise:
return norm.rvs(mean, std)
result = norm.rvs(mean, std)
return self._clip_noised_transform(result, start, end)

return mean

Expand Down Expand Up @@ -195,15 +198,15 @@ def _transform(self, data):
def _reverse_transform_by_matrix(self, data):
"""Reverse transform the data with matrix operations."""
num_rows = len(data)
num_categories = len(self.means)
num_categories = len(self.starts)

data = np.broadcast_to(data, (num_categories, num_rows)).T
means = np.broadcast_to(self.means, (num_rows, num_categories))
diffs = np.abs(data - means)
indexes = np.argmin(diffs, axis=1)
starts = np.broadcast_to(self.starts.index, (num_rows, num_categories))
diffs = (data >= starts)[:, ::-1]
indexes = num_categories - np.argmax(diffs, axis=1) - 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What exactly are these two lines doing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I printed each line for an example dataset to clarify things:

Original Data
0    0.875
1    0.625
2    0.375
3    0.125
dtype: float64
Broadcasted Data
[[0.875 0.875 0.875 0.875]
 [0.625 0.625 0.625 0.625]
 [0.375 0.375 0.375 0.375]
 [0.125 0.125 0.125 0.125]]
Starts
[[0.   0.25 0.5  0.75]
 [0.   0.25 0.5  0.75]
 [0.   0.25 0.5  0.75]
 [0.   0.25 0.5  0.75]]
Diffs
[[ True  True  True  True]
 [False  True  True  True]
 [False False  True  True]
 [False False False  True]]
Indexes
[3 2 1 0]

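Basically, each row of Diffs (with its columns reversed) is True wherever the value is greater than or equal to an interval start, so the first True in a row marks the interval the value falls in (the largest value of starts that our data is greater than or equal to). Indexes finds this first True value and converts its reversed position back into the index of the category it corresponds to.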

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cool, can we rename diffs to interval_starts_data_is_greater_than or something like that? and indexes to interval_indexes?


self._get_category_from_index = list(self.means.index).__getitem__
return pd.Series(indexes).apply(self._get_category_from_index).astype(self.dtype)
get_category_from_index = list(self.starts['category']).__getitem__
return pd.Series(indexes).apply(get_category_from_index).astype(self.dtype)

def _reverse_transform_by_category(self, data):
"""Reverse transform the data by iterating over all the categories."""
Expand Down
24 changes: 24 additions & 0 deletions tests/integration/transformers/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,30 @@ def test_frequency_encoder_mixed_more_rows(psutil_mock):
pd.testing.assert_frame_equal(transform_data, reverse)


def test_frequency_encoder_noise():
    """Round-trip integer data through a noisy ``FrequencyEncoder``.

    Fits the encoder with ``add_noise=True`` and checks that reverse
    transforming the transformed data gives back the original frame.

    Input:
        - 10000 rows of ints drawn from ``range(100)``
    Output:
        - The reverse transformed data, equal to the input
    """
    # setup
    column = 'column_name'
    values = np.random.choice(a=range(100), size=10000)
    data = pd.DataFrame(values, columns=['column_name'])
    encoder = FrequencyEncoder(add_noise=True)

    # run
    encoder.fit(data, column)
    transformed = encoder.transform(data)
    recovered = encoder.reverse_transform(transformed)

    # assert
    pd.testing.assert_frame_equal(data, recovered)


def test_one_hot_numerical_nans():
"""Ensure OneHotEncoder works on numerical + nan only columns."""

Expand Down
27 changes: 3 additions & 24 deletions tests/unit/transformers/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,27 +66,6 @@ def test_is_transform_deterministic(self):
# Assert
assert output is False

def test_is_composition_identity(self):
    """Check ``is_composition_identity`` on a noisy encoder.

    Setup:
        - a ``FrequencyEncoder`` created with ``add_noise = True``.

    Output:
        - ``False``, the opposite boolean value of ``add_noise``.
    """
    # Setup
    encoder = FrequencyEncoder(add_noise=True)

    # Run
    is_identity = encoder.is_composition_identity()

    # Assert
    assert is_identity is False

def test__get_intervals(self):
"""Test the ``_get_intervals`` method.

Expand Down Expand Up @@ -589,13 +568,13 @@ def test__reverse_transform_by_matrix_called(self, psutil_mock):

@patch('psutil.virtual_memory')
def test__reverse_transform_by_matrix(self, psutil_mock):
"""Test the _reverse_transform_by_matrix method with numerical data
"""Test the _reverse_transform_by_matrix method with numerical data.

Expect that the transformed data is correctly reverse transformed.

Setup:
The categorical transformer is instantiated with 4 categories and means. Also patch
the `psutil.virtual_memory` function to return a large enough `available_memory`.
the ``psutil.virtual_memory`` function to return a large enough ``available_memory``.
Input:
- transformed data with 4 rows
Ouptut:
Expand All @@ -606,7 +585,7 @@ def test__reverse_transform_by_matrix(self, psutil_mock):
transformed = pd.Series([0.875, 0.625, 0.375, 0.125])

transformer = FrequencyEncoder()
transformer.means = pd.Series([0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1])
transformer.starts = pd.DataFrame({'category': [4, 3, 2, 1]}, index=[0., 0.25, 0.5, 0.75])
transformer.dtype = data.dtype

virtual_memory = Mock()
Expand Down