Skip to content

Commit

Permalink
add integration_test + _get_output_to_property
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo committed Sep 7, 2023
1 parent 43b2fa0 commit be30d90
Show file tree
Hide file tree
Showing 3 changed files with 245 additions and 4 deletions.
29 changes: 27 additions & 2 deletions rdt/transformers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ def _fit(self, columns_data):
raise NotImplementedError()

def _set_seed(self, data):
hash_value = self.get_input_column()
hash_value = self.columns[0]
for value in data.head(5):
hash_value += str(value)

Expand Down Expand Up @@ -508,6 +508,14 @@ def get_input_columns(self):
"""
return self.columns

def _get_output_to_property(self, property_):
    """Map each output column to the value of one of its properties.

    Output columns whose entry in ``self.output_properties`` does not define
    ``property_`` are left out of the result instead of raising a ``KeyError``.

    Args:
        property_ (str):
            Name of the property to look up, e.g. ``'sdtype'``.

    Returns:
        dict:
            Dictionary mapping each output column name to the value stored under
            ``property_`` for that column.
    """
    return {
        output_column: properties[property_]
        for output_column, properties in self.output_properties.items()
        if property_ in properties
    }

def _fit(self, columns_data, columns_to_sdtypes):
"""Fit the transformer to the data.
Expand All @@ -519,6 +527,7 @@ def _fit(self, columns_data, columns_to_sdtypes):
"""
raise NotImplementedError()

@random_state
def fit(self, data, columns_to_sdtypes):
"""Fit the transformer to a ``column`` of the ``data``.
Expand All @@ -528,9 +537,25 @@ def fit(self, data, columns_to_sdtypes):
columns_to_sdtypes (dict):
Dictionary mapping column names to their sdtypes.
"""
column_names = list(columns_to_sdtypes.keys())
column_names = tuple(columns_to_sdtypes.keys())
self._store_columns(column_names, data)
self._set_seed(data)
columns_data = self._get_columns_data(data, self.columns)
self._fit(columns_data, columns_to_sdtypes)
self._build_output_columns(data)

def fit_transform(self, data, columns_to_sdtypes):
    """Fit the transformer to the given columns of ``data`` and then transform ``data``.

    This calls ``fit`` followed by ``transform``.

    Args:
        data (pandas.DataFrame):
            The entire table.
        columns_to_sdtypes (dict):
            Dictionary mapping column names to their sdtypes.

    Returns:
        pandas.DataFrame:
            The entire table, containing the transformed data.
    """
    self.fit(data, columns_to_sdtypes)
    transformed = self.transform(data)

    return transformed
171 changes: 170 additions & 1 deletion tests/integration/transformers/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
import pandas as pd

from rdt.transformers.base import BaseTransformer
from rdt.transformers.base import BaseMultiColumnTransformer, BaseTransformer


def test_dummy_transformer_series_output():
Expand Down Expand Up @@ -129,3 +129,172 @@ def _reverse_transform(self, data):
})
pd.testing.assert_frame_equal(expected_transform, transformed)
pd.testing.assert_frame_equal(reverse, data)


def test_multi_column_transformer_same_number_of_columns_input_output():
    """Test a multi-column transformer that has the same number of input and output columns."""
    # Setup
    class AdditionTransformer(BaseMultiColumnTransformer):
        """Transformer that replaces 3 columns with their row-wise cumulative sum."""

        def _fit(self, columns_data, columns_to_sdtypes):
            # One numerical output column per input column, under the same name.
            properties = {}
            for name in self.columns:
                properties[name] = {'sdtype': 'numerical'}

            self.output_properties = properties
            # Remembered so that ``_reverse_transform`` can cast back to these dtypes.
            self.dtypes = columns_data.dtypes

        def _transform(self, data):
            running_total = data.cumsum(axis=1)
            return running_total

        def _reverse_transform(self, data):
            # Differencing undoes the cumulative sum, except for the first column, which
            # becomes NaN and is therefore copied over from ``data`` unchanged.
            restored = data.diff(axis=1)
            restored.iloc[:, 0] = data.iloc[:, 0]
            restored = restored.astype(self.dtypes)

            return restored

    data_test = pd.DataFrame({
        'col_1': [1, 2, 3],
        'col_2': [10, 20, 30],
        'col_3': [100, 200, 300]
    })
    column_to_sdtype = dict.fromkeys(['col_1', 'col_2', 'col_3'], 'numerical')
    transformer = AdditionTransformer()

    # Run
    transformed = transformer.fit_transform(data_test, column_to_sdtype)
    reverse = transformer.reverse_transform(transformed)

    # Assert
    expected_transform = pd.DataFrame({
        'col_1': [1, 2, 3],
        'col_2': [11, 22, 33],
        'col_3': [111, 222, 333]
    })
    pd.testing.assert_frame_equal(expected_transform, transformed)
    pd.testing.assert_frame_equal(reverse, data_test)


def test_multi_column_transformer_less_output_than_input_columns():
    """Test a multi-column transformer when the output has fewer columns than the input."""
    # Setup
    class ConcatenateTransformer(BaseMultiColumnTransformer):
        """This transformer takes 4 columns and concatenates them into 2 columns.

        The first two columns are concatenated together, and so are the last two.
        """

        def _fit(self, columns_data, columns_to_sdtypes):
            self.name_1 = self.columns[0] + '#' + self.columns[1]
            self.name_2 = self.columns[2] + '#' + self.columns[3]
            self.output_properties = {
                self.name_1: {'sdtype': 'categorical'},
                self.name_2: {'sdtype': 'categorical'}
            }
            # Remembered so that ``_reverse_transform`` can cast back to these dtypes.
            self.dtypes = columns_data.dtypes

        def _transform(self, data):
            # Build a new frame instead of adding columns to ``data``, so that the frame
            # received as argument is never modified.
            return pd.DataFrame({
                self.name_1: data.iloc[:, 0] + '#' + data.iloc[:, 1],
                self.name_2: data.iloc[:, 2] + '#' + data.iloc[:, 3]
            })

        def _reverse_transform(self, data):
            result = data.copy()
            column_names = list(data.columns)

            # Each output column name is '<input column>#<input column>', so splitting the
            # name gives back the input column names and splitting the values their data.
            col1, col2 = column_names[0].split('#')
            result[[col1, col2]] = result[column_names[0]].str.split('#', expand=True)

            col3, col4 = column_names[1].split('#')
            result[[col3, col4]] = result[column_names[1]].str.split('#', expand=True)

            return result.astype(self.dtypes).drop(columns=column_names)

    data_test = pd.DataFrame({
        'col_1': ['A', 'B', 'C'],
        'col_2': ['D', 'E', 'F'],
        'col_3': ['G', 'H', 'I'],
        'col_4': ['J', 'K', 'L']
    })

    column_to_sdtype = {
        'col_1': 'categorical',
        'col_2': 'categorical',
        'col_3': 'categorical',
        'col_4': 'categorical'
    }
    transformer = ConcatenateTransformer()

    # Run
    transformer.fit(data_test, column_to_sdtype)
    transformed = transformer.transform(data_test)
    reverse = transformer.reverse_transform(transformed)

    # Assert
    expected_transform = pd.DataFrame({
        'col_1#col_2': ['A#D', 'B#E', 'C#F'],
        'col_3#col_4': ['G#J', 'H#K', 'I#L']
    })
    pd.testing.assert_frame_equal(expected_transform, transformed)
    pd.testing.assert_frame_equal(reverse, data_test)


def test_multi_column_transformer_more_output_than_input_columns():
    """Test a multi-column transformer when the output has more columns than the input."""
    # Setup
    class ExpandTransformer(BaseMultiColumnTransformer):
        """This transformer takes 2 columns and expands them into 4 columns.

        Each input column is split into a column holding the first character of its values
        and a column holding the second character.
        """

        def _fit(self, columns_data, columns_to_sdtypes):
            name_1 = self.columns[0] + '.first_part'
            name_2 = self.columns[0] + '.second_part'
            name_3 = self.columns[1] + '.first_part'
            name_4 = self.columns[1] + '.second_part'
            self.output_properties = {
                name_1: {'sdtype': 'categorical'},
                name_2: {'sdtype': 'categorical'},
                name_3: {'sdtype': 'categorical'},
                name_4: {'sdtype': 'categorical'}
            }
            self.names = [name_1, name_2, name_3, name_4]
            # Remembered so that ``_reverse_transform`` can cast back to these dtypes.
            self.dtypes = columns_data.dtypes

        def _transform(self, data):
            # Build a new frame instead of adding columns to ``data``, so that the frame
            # received as argument is never modified.
            return pd.DataFrame({
                self.names[0]: data[self.columns[0]].str[0],
                self.names[1]: data[self.columns[0]].str[1],
                self.names[2]: data[self.columns[1]].str[0],
                self.names[3]: data[self.columns[1]].str[1]
            })

        def _reverse_transform(self, data):
            result = data.copy()
            # Joining the two parts of each input column gives back its original values.
            result[self.columns[0]] = result[self.names[0]] + result[self.names[1]]
            result[self.columns[1]] = result[self.names[2]] + result[self.names[3]]

            return result.astype(self.dtypes).drop(columns=self.names)

    data_test = pd.DataFrame({
        'col_1': ['AB', 'CD', 'EF'],
        'col_2': ['GH', 'IJ', 'KL'],
    })

    column_to_sdtype = {
        'col_1': 'categorical',
        'col_2': 'categorical',
    }
    transformer = ExpandTransformer()

    # Run
    transformer.fit(data_test, column_to_sdtype)
    transformed = transformer.transform(data_test)
    reverse = transformer.reverse_transform(transformed)

    # Assert
    expected_transform = pd.DataFrame({
        'col_1.first_part': ['A', 'C', 'E'],
        'col_1.second_part': ['B', 'D', 'F'],
        'col_2.first_part': ['G', 'I', 'K'],
        'col_2.second_part': ['H', 'J', 'L']
    })
    pd.testing.assert_frame_equal(expected_transform, transformed)
    pd.testing.assert_frame_equal(reverse, data_test)
49 changes: 48 additions & 1 deletion tests/unit/transformers/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,6 +1303,27 @@ def test_get_input_columns(self):
# Assert
assert output == ['a', 'b', 'c']

def test__get_output_to_property(self):
    """Test that ``_get_output_to_property`` maps each output column to its property."""
    # Setup
    sdtype_per_column = [
        ('col_1', 'numerical'),
        ('col_2', 'categorical'),
        ('col_3', 'boolean'),
    ]
    transformer = BaseMultiColumnTransformer()
    transformer.output_properties = {
        column: {'sdtype': sdtype} for column, sdtype in sdtype_per_column
    }

    # Run
    output = transformer._get_output_to_property('sdtype')

    # Assert
    assert output == {
        'col_1': 'numerical',
        'col_2': 'categorical',
        'col_3': 'boolean',
    }

def test__fit(self):
"""Test the ``_fit`` method.
Expand Down Expand Up @@ -1341,9 +1362,35 @@ def test_fit(self):

# Assert
transformer._store_columns.assert_called_once_with(
list(columns_to_sdtypes.keys()), data
tuple(columns_to_sdtypes.keys()), data
)
transformer._set_seed.assert_called_once_with(data)
transformer._get_columns_data.assert_called_once_with(data, ['a', 'b'])
transformer._fit.assert_called_once_with(data_transformer, columns_to_sdtypes)
transformer._build_output_columns.assert_called_once_with(data)

def test_fit_transform(self):
    """Test the ``fit_transform`` method.

    ``fit`` and ``transform`` are mocked. Check that each of them is called once with the
    expected arguments and that the output of ``transform`` is what gets returned.
    """
    # Setup
    transformer = BaseMultiColumnTransformer()
    columns_to_sdtypes = {
        'a': 'numerical',
        'b': 'categorical',
        'c': 'boolean'
    }
    data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': ['a', 'b', 'c'],
    })
    transformer.columns = ['a', 'b']
    mock_fit = Mock()
    mock_transform = Mock(return_value=data)
    transformer.fit = mock_fit
    transformer.transform = mock_transform

    # Run
    output = transformer.fit_transform(data, columns_to_sdtypes)

    # Assert
    mock_fit.assert_called_once_with(data, columns_to_sdtypes)
    mock_transform.assert_called_once_with(data)
    # Without this check, dropping the result of ``transform`` would go unnoticed.
    pd.testing.assert_frame_equal(output, data)

0 comments on commit be30d90

Please sign in to comment.