Skip to content

Commit

Permalink
make release-tag: Merge branch 'main' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
amontanez24 committed Sep 5, 2024
2 parents 2a5d750 + e6e1cfd commit 867ae4b
Show file tree
Hide file tree
Showing 9 changed files with 158 additions and 9 deletions.
6 changes: 3 additions & 3 deletions latest_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Faker==26.3.0
copulas==0.11.0
numpy==2.0.1
Faker==28.1.0
copulas==0.11.1
numpy==2.0.2
pandas==2.2.2
scikit-learn==1.5.1
scipy==1.13.1
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ collect_ignore = ['pyproject.toml']
exclude_lines = ['NotImplementedError()']

[tool.bumpversion]
current_version = "1.12.3"
current_version = "1.12.4.dev1"
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
serialize = [
'{major}.{minor}.{patch}.{release}{candidate}',
Expand Down
2 changes: 1 addition & 1 deletion rdt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

__author__ = 'DataCebo, Inc.'
__email__ = 'info@sdv.dev'
__version__ = '1.12.3'
__version__ = '1.12.4.dev1'


import sys
Expand Down
17 changes: 16 additions & 1 deletion rdt/hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -871,8 +871,23 @@ def create_anonymized_columns(self, num_rows, column_names):
'list of valid column names.'
)

columns_to_generate = set()
for column in column_names:
if column not in self._multi_column_fields:
columns_to_generate.add(column)
continue

multi_columns = self._multi_column_fields[column]
if any(col not in column_names for col in multi_columns):
raise InvalidConfigError(
f"Column '{column}' is part of a multi-column field. You must include all "
'columns inside the multi-column field to generate the anonymized columns.'
)

columns_to_generate.add(multi_columns)

transformers = []
for column_name in column_names:
for column_name in sorted(columns_to_generate):
transformer = self.field_transformers.get(column_name)
if not transformer.is_generator():
raise TransformerProcessingError(
Expand Down
3 changes: 2 additions & 1 deletion rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,9 @@ def _set_fitted_parameters(
self._min_value = min(min_max_values)
self._max_value = max(min_max_values)

if rounding_digits:
if rounding_digits is not None:
self._rounding_digits = rounding_digits
self.learn_rounding_scheme = True

if self.null_transformer.models_missing_values():
self.output_columns.append(column_name + '.is_null')
Expand Down
4 changes: 2 additions & 2 deletions static_code_analysis.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Run started:2024-07-09 19:56:52.363070
Run started:2024-08-14 20:11:54.714181

Test results:
No issues identified.

Code scanned:
Total lines of code: 5539
Total lines of code: 5530
Total lines skipped (#nosec): 0
Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0

Expand Down
20 changes: 20 additions & 0 deletions tests/integration/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,26 @@ def test__support__nullable_numerical_pandas_dtypes(self):
reverse_transformed[column].round(expected_rounding_digits[column]),
)

def test__set_fitted_parameter_rounding_to_integer(self):
    """Test ``_set_fitted_parameters`` when ``rounding_digits`` is ``0``.

    The formatter is configured through ``_set_fitted_parameters`` only (no ``fit``
    call), and ``reverse_transform`` is then expected to return the input rounded to
    zero decimal places.
    """
    # Setup
    random_values = 100 * np.random.random(10)
    data = pd.DataFrame({'col 1': random_values})
    formatter = FloatFormatter()

    # Run
    fitted_parameters = {
        'column_name': 'col 1',
        'null_transformer': NullTransformer(),
        'rounding_digits': 0,
        'dtype': 'float',
    }
    formatter._set_fitted_parameters(**fitted_parameters)
    result = formatter.reverse_transform(data)

    # Assert
    # The expected frame is built after the run, exactly as the comparison requires:
    # the output must equal the input rounded to whole numbers.
    expected = data.round(0)
    pd.testing.assert_frame_equal(result, expected)


class TestGaussianNormalizer:
def test_stats(self):
Expand Down
112 changes: 112 additions & 0 deletions tests/unit/test_hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1478,6 +1478,7 @@ def test_create_anonymized_columns(self):
instance._modified_config = False
instance._subset.return_value = False
instance.random_state = {}
instance._multi_column_fields = {}

random_element = AnonymizedFaker(
function_name='random_element', function_kwargs={'elements': ['a']}
Expand Down Expand Up @@ -1622,6 +1623,7 @@ def test_create_anonymized_columns_invalid_transformers(self):
instance._fitted = True
instance._modified_config = False
instance._subset.return_value = False
instance._multi_column_fields = {}

instance.field_transformers = {
'datetime': FloatFormatter(),
Expand All @@ -1641,6 +1643,116 @@ def test_create_anonymized_columns_invalid_transformers(self):
column_names=['datetime', 'random_element'],
)

def test_create_anonymized_columns_multi_column_transformer(self):
    """Test ``create_anonymized_columns`` with a multi-column generator transformer.

    Both columns of the multi-column field are requested, so the generator assigned
    to the ``('col1', 'col2')`` field is expected to produce every requested column.
    """

    class GeneratorTransformer(BaseMultiColumnTransformer):
        # Minimal generator stub: its reverse transform overwrites every fitted
        # column with ``0..num_rows-1`` so the output is easy to predict.
        IS_GENERATOR = True

        def __init__(self):
            super().__init__()
            self.output_properties = {}

        def _fit(self, data):
            self.columns = list(data.columns)

        def _transform(self, data):
            return pd.DataFrame()

        def _get_prefix(self):
            return None

        def _reverse_transform(self, data):
            row_count = data.shape[0]
            for name in self.columns:
                data[name] = np.arange(row_count)

            return data

    # Setup
    field = ('col1', 'col2')
    ht = HyperTransformer()
    # Every member column maps back to the full multi-column field it belongs to.
    ht._multi_column_fields = {name: field for name in field}
    ht.field_transformers = {field: GeneratorTransformer()}
    ht.field_sdtypes = {name: 'numerical' for name in field}
    training_data = pd.DataFrame({'col1': [1, 2, 3], 'col2': [1, 2, 3]})
    ht.fit(training_data)

    # Run
    output = ht.create_anonymized_columns(num_rows=5, column_names=['col1', 'col2'])

    # Assert
    expected_output = pd.DataFrame({name: [0, 1, 2, 3, 4] for name in field})
    pd.testing.assert_frame_equal(output, expected_output, check_dtype=False)

def test_create_anonymized_columns_multi_column_transformer_error(self):
    """Test the errors ``create_anonymized_columns`` raises for multi-column fields.

    Two failure modes are checked:
        - Requesting only some of the columns that belong to a multi-column field
          raises an ``InvalidConfigError``.
        - Requesting all columns of a multi-column field whose transformer is not a
          generator raises a ``TransformerProcessingError``.
    """

    class MultiColumnTransformer(BaseMultiColumnTransformer):
        # Deliberately flagged as a non-generator to trigger the second error.
        IS_GENERATOR = False

        def __init__(self):
            super().__init__()
            self.output_properties = {}

        def _fit(self, data):
            self.columns = list(data.columns)

        def _transform(self, data):
            return pd.DataFrame()

        def _get_prefix(self):
            return None

    # Setup
    field = ('col1', 'col2')
    ht = HyperTransformer()
    # Every member column maps back to the full multi-column field it belongs to.
    ht._multi_column_fields = {name: field for name in field}
    ht.field_transformers = {field: MultiColumnTransformer()}
    ht.field_sdtypes = {name: 'numerical' for name in field}
    training_data = pd.DataFrame({'col1': [1, 2, 3], 'col2': [1, 2, 3]})
    ht.fit(training_data)

    # Run and Assert
    # Case 1: only 'col1' is requested although the field also contains 'col2'.
    incomplete_field_msg = re.escape(
        "Column 'col1' is part of a multi-column field. You must include all "
        'columns inside the multi-column field to generate the anonymized columns.'
    )
    with pytest.raises(InvalidConfigError, match=incomplete_field_msg):
        ht.create_anonymized_columns(num_rows=5, column_names=['col1'])

    # Case 2: the full field is requested, but its transformer is not a generator.
    not_generator_msg = re.escape(
        "Column '('col1', 'col2')' cannot be anonymized. All columns must be assigned to "
        "'AnonymizedFaker', 'RegexGenerator' or other ``generator``. Use "
        "'get_config()' to see the current transformer assignments."
    )
    with pytest.raises(TransformerProcessingError, match=not_generator_msg):
        ht.create_anonymized_columns(num_rows=5, column_names=['col1', 'col2'])

def test_reverse_transform(self):
"""Test the ``reverse_transform`` method.
Expand Down
1 change: 1 addition & 0 deletions tests/unit/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,7 @@ def test__set_fitted_parameters(self):
assert transformer._max_value == 100.0
assert transformer._rounding_digits == rounding_digits
assert transformer._dtype == dtype
assert transformer.learn_rounding_scheme is True

def test__set_fitted_parameters_from_column(self):
"""Test ``_set_fitted_parameters`` sets the required parameters for transformer."""
Expand Down

0 comments on commit 867ae4b

Please sign in to comment.