From a9b344f92d8138ec8e92cf5631c3b789b9417716 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Wed, 11 Sep 2024 14:43:52 +0200 Subject: [PATCH] Align ID to Text --- rdt/performance/datasets/__init__.py | 6 +- rdt/performance/datasets/{text.py => id.py} | 6 +- rdt/transformers/__init__.py | 9 +- rdt/transformers/id.py | 253 ++++++++ rdt/transformers/text.py | 253 +------- tests/integration/test_transformers.py | 8 +- .../transformers/pii/test_anonymizer.py | 2 +- .../transformers/{test_text.py => test_id.py} | 4 +- tests/unit/transformers/test___init__.py | 2 +- tests/unit/transformers/test_id.py | 591 +++++++++++++++++ tests/unit/transformers/test_text.py | 595 +----------------- 11 files changed, 882 insertions(+), 847 deletions(-) rename rdt/performance/datasets/{text.py => id.py} (92%) create mode 100644 rdt/transformers/id.py rename tests/integration/transformers/{test_text.py => test_id.py} (99%) create mode 100644 tests/unit/transformers/test_id.py diff --git a/rdt/performance/datasets/__init__.py b/rdt/performance/datasets/__init__.py index eaae213d..50af9fcb 100644 --- a/rdt/performance/datasets/__init__.py +++ b/rdt/performance/datasets/__init__.py @@ -6,19 +6,21 @@ boolean, categorical, datetime, + id, numerical, pii, - text, ) +from rdt.performance.datasets import id as text from rdt.performance.datasets.base import BaseDatasetGenerator __all__ = [ 'boolean', 'categorical', 'datetime', + 'id', 'numerical', - 'text', 'pii', + 'text', 'BaseDatasetGenerator', ] diff --git a/rdt/performance/datasets/text.py b/rdt/performance/datasets/id.py similarity index 92% rename from rdt/performance/datasets/text.py rename to rdt/performance/datasets/id.py index 5473d982..05b3e4a4 100644 --- a/rdt/performance/datasets/text.py +++ b/rdt/performance/datasets/id.py @@ -1,4 +1,4 @@ -"""Dataset Generators for Text transformers.""" +"""Dataset Generators for ID transformers.""" from abc import ABC @@ -9,9 +9,9 @@ class RegexGeneratorGenerator(BaseDatasetGenerator, ABC): - """Base class for generators that generate PII data.""" + """Base class for generators that generate ID data.""" - SDTYPE = 'text' + SDTYPE = 'id' class RandomStringGenerator(RegexGeneratorGenerator): diff --git a/rdt/transformers/__init__.py b/rdt/transformers/__init__.py index 4f8f79b6..0c190fa3 100644 --- a/rdt/transformers/__init__.py +++ b/rdt/transformers/__init__.py @@ -21,6 +21,7 @@ OptimizedTimestampEncoder, UnixTimestampEncoder, ) +from rdt.transformers.id import IDGenerator, RegexGenerator from rdt.transformers.null import NullTransformer from rdt.transformers.numerical import ( ClusterBasedNormalizer, @@ -31,7 +32,6 @@ AnonymizedFaker, PseudoAnonymizedFaker, ) -from rdt.transformers.text import IDGenerator, RegexGenerator __all__ = [ 'BaseTransformer', @@ -90,12 +90,13 @@ def get_transformer_name(transformer): DEFAULT_TRANSFORMERS = { - 'numerical': FloatFormatter(), - 'categorical': UniformEncoder(), 'boolean': UniformEncoder(), + 'categorical': UniformEncoder(), 'datetime': UnixTimestampEncoder(), - 'text': RegexGenerator(), + 'id': RegexGenerator(), + 'numerical': FloatFormatter(), 'pii': AnonymizedFaker(), + 'text': RegexGenerator(), } diff --git a/rdt/transformers/id.py b/rdt/transformers/id.py new file mode 100644 index 00000000..e1eaedc2 --- /dev/null +++ b/rdt/transformers/id.py @@ -0,0 +1,253 @@ +"""Transformers for ID data.""" + +import logging +import warnings + +import numpy as np +import pandas as pd + +from rdt.transformers.base import BaseTransformer +from rdt.transformers.utils import strings_from_regex + +LOGGER = logging.getLogger(__name__) + + +class IDGenerator(BaseTransformer): + """Generate an ID column. + + This transformer generates an ID column based on a given prefix, starting value and suffix. + + Args: + prefix (str): + Prefix of the generated IDs column. + Defaults to ``None``. + starting_value (int): + Starting value of the generated IDs column. + Defaults to ``0``. + suffix (str): + Suffix of the generated IDs column. + Defaults to ``None``. + """ + + IS_GENERATOR = True + INPUT_SDTYPE = 'id' + SUPPORTED_SDTYPES = ['id', 'text'] + + def __init__(self, prefix=None, starting_value=0, suffix=None): + super().__init__() + self.prefix = prefix + self.starting_value = starting_value + self.suffix = suffix + self._counter = 0 + self.output_properties = {None: {'next_transformer': None}} + + def reset_randomization(self): + """Reset the sampling _counter.""" + self._counter = 0 + + def _fit(self, data): + pass + + def _transform(self, _data): + """Drop the input column by returning ``None``.""" + return None + + def _reverse_transform(self, data): + """Generate new id column. + + Args: + data (pd.Series or numpy.ndarray): + Data to transform. + + Returns: + pd.Series + """ + start = self.starting_value + self._counter + prefix_str = self.prefix if self.prefix is not None else '' + suffix_str = self.suffix if self.suffix is not None else '' + + values = [f'{prefix_str}{start + idx}{suffix_str}' for idx in range(len(data))] + self._counter += len(data) + + return pd.Series(values) + + +class RegexGenerator(BaseTransformer): + """RegexGenerator transformer. + + This transformer will drop a column and regenerate it with the previously specified + ``regex`` format. + + Args: + regex (str): + String representing the regex function. + enforce_uniqueness (bool): + Whether or not to ensure that the new generated data is all unique. If it isn't + possible to create the requested number of rows, then an error will be raised. + Defaults to ``False``. + generation_order (str): + String defining how to generate the output. If set to ``alphanumeric``, it will + generate the output in alphanumeric order (ie. 'aaa', 'aab' or '1', '2'...). If + set to ``scrambled``, the the output will be scrambled in order. Defaults to + ``alphanumeric``. + """ + + IS_GENERATOR = True + INPUT_SDTYPE = 'id' + SUPPORTED_SDTYPES = ['id', 'text'] + + def __getstate__(self): + """Remove the generator when pickling.""" + state = self.__dict__.copy() + state.pop('generator') + return state + + def __setstate__(self, state): + """Set the generator when pickling.""" + generator_size = state.get('generator_size') + generated = state.get('generated') + generator, size = strings_from_regex(state.get('regex_format')) + if generator_size is None: + state['generator_size'] = size + if generated is None: + state['generated'] = 0 + + if generated: + for _ in range(generated): + next(generator) + + state['generator'] = generator + self.__dict__ = state + + def __init__( + self, + regex_format='[A-Za-z]{5}', + enforce_uniqueness=False, + generation_order='alphanumeric', + ): + super().__init__() + self.output_properties = {None: {'next_transformer': None}} + self.enforce_uniqueness = enforce_uniqueness + self.regex_format = regex_format + self.data_length = None + self.generator = None + self.generator_size = None + self.generated = None + if generation_order not in ['alphanumeric', 'scrambled']: + raise ValueError("generation_order must be one of 'alphanumeric' or 'scrambled'.") + + self.generation_order = generation_order + + def reset_randomization(self): + """Create a new generator and reset the generated values counter.""" + super().reset_randomization() + self.generator, self.generator_size = strings_from_regex(self.regex_format) + self.generated = 0 + + def _fit(self, data): + """Fit the transformer to the data. + + Args: + data (pandas.Series): + Data to fit to. + """ + self.reset_randomization() + self.data_length = len(data) + + def _transform(self, _data): + """Drop the input column by returning ``None``.""" + return None + + def _warn_not_enough_unique_values(self, sample_size): + """Warn the user that the regex cannot generate enough unique values. + + Args: + sample_size (int): + Number of samples to be generated. + """ + warned = False + if sample_size > self.generator_size: + if self.enforce_uniqueness: + warnings.warn( + f"The regex for '{self.get_input_column()}' can only generate " + f'{self.generator_size} unique values. Additional values may not exactly ' + 'follow the provided regex.' + ) + warned = True + else: + LOGGER.info( + "The data has %s rows but the regex for '%s' can only create %s unique values." + " Some values in '%s' may be repeated.", + sample_size, + self.get_input_column(), + self.generator_size, + self.get_input_column(), + ) + + remaining = self.generator_size - self.generated + if sample_size > remaining and self.enforce_uniqueness and not warned: + warnings.warn( + f'The regex generator is not able to generate {sample_size} new unique ' + f'values (only {max(remaining, 0)} unique values left).' + ) + + def _reverse_transform(self, data): + """Generate new data using the provided ``regex_format``. + + Args: + data (pd.Series or numpy.ndarray): + Data to transform. + + Returns: + pandas.Series + """ + if data is not None and len(data): + sample_size = len(data) + else: + sample_size = self.data_length + + self._warn_not_enough_unique_values(sample_size) + + remaining = self.generator_size - self.generated + if sample_size > remaining: + self.reset_randomization() + remaining = self.generator_size + + generated_values = [] + while len(generated_values) < sample_size: + try: + generated_values.append(next(self.generator)) + self.generated += 1 + except (RuntimeError, StopIteration): + # Can't generate more rows without collision so breaking out of loop + break + + reverse_transformed = generated_values[:] + + if len(reverse_transformed) < sample_size: + if self.enforce_uniqueness: + try: + remaining_samples = sample_size - len(reverse_transformed) + start = int(generated_values[-1]) + 1 + reverse_transformed.extend([ + str(i) for i in range(start, start + remaining_samples) + ]) + + except ValueError: + counter = 0 + while len(reverse_transformed) < sample_size: + remaining_samples = sample_size - len(reverse_transformed) + reverse_transformed.extend([ + f'{i}({counter})' for i in generated_values[:remaining_samples] + ]) + counter += 1 + + else: + while len(reverse_transformed) < sample_size: + remaining_samples = sample_size - len(reverse_transformed) + reverse_transformed.extend(generated_values[:remaining_samples]) + + if getattr(self, 'generation_order', 'alphanumeric') == 'scrambled': + np.random.shuffle(reverse_transformed) + + return np.array(reverse_transformed, dtype=object) diff --git a/rdt/transformers/text.py b/rdt/transformers/text.py index 2d02650d..2de3d0eb 100644 --- a/rdt/transformers/text.py +++ b/rdt/transformers/text.py @@ -1,251 +1,12 @@ """Transformers for text data.""" -import logging import warnings -import numpy as np -import pandas as pd +from rdt.transformers.id import IDGenerator, RegexGenerator # noqa: F401 -from rdt.transformers.base import BaseTransformer -from rdt.transformers.utils import strings_from_regex - -LOGGER = logging.getLogger(__name__) - - -class IDGenerator(BaseTransformer): - """Generate an ID column. - - This transformer generates an ID column based on a given prefix, starting value and suffix. - - Args: - prefix (str): - Prefix of the generated IDs column. - Defaults to ``None``. - starting_value (int): - Starting value of the generated IDs column. - Defaults to ``0``. - suffix (str): - Suffix of the generated IDs column. - Defaults to ``None``. - """ - - IS_GENERATOR = True - INPUT_SDTYPE = 'text' - - def __init__(self, prefix=None, starting_value=0, suffix=None): - super().__init__() - self.prefix = prefix - self.starting_value = starting_value - self.suffix = suffix - self._counter = 0 - self.output_properties = {None: {'next_transformer': None}} - - def reset_randomization(self): - """Reset the sampling _counter.""" - self._counter = 0 - - def _fit(self, data): - pass - - def _transform(self, _data): - """Drop the input column by returning ``None``.""" - return None - - def _reverse_transform(self, data): - """Generate new id column. - - Args: - data (pd.Series or numpy.ndarray): - Data to transform. - - Returns: - pd.Series - """ - start = self.starting_value + self._counter - prefix_str = self.prefix if self.prefix is not None else '' - suffix_str = self.suffix if self.suffix is not None else '' - - values = [f'{prefix_str}{start + idx}{suffix_str}' for idx in range(len(data))] - self._counter += len(data) - - return pd.Series(values) - - -class RegexGenerator(BaseTransformer): - """RegexGenerator transformer. - - This transformer will drop a column and regenerate it with the previously specified - ``regex`` format. - - Args: - regex (str): - String representing the regex function. - enforce_uniqueness (bool): - Whether or not to ensure that the new generated data is all unique. If it isn't - possible to create the requested number of rows, then an error will be raised. - Defaults to ``False``. - generation_order (str): - String defining how to generate the output. If set to ``alphanumeric``, it will - generate the output in alphanumeric order (ie. 'aaa', 'aab' or '1', '2'...). If - set to ``scrambled``, the the output will be scrambled in order. Defaults to - ``alphanumeric``. - """ - - IS_GENERATOR = True - INPUT_SDTYPE = 'text' - - def __getstate__(self): - """Remove the generator when pickling.""" - state = self.__dict__.copy() - state.pop('generator') - return state - - def __setstate__(self, state): - """Set the generator when pickling.""" - generator_size = state.get('generator_size') - generated = state.get('generated') - generator, size = strings_from_regex(state.get('regex_format')) - if generator_size is None: - state['generator_size'] = size - if generated is None: - state['generated'] = 0 - - if generated: - for _ in range(generated): - next(generator) - - state['generator'] = generator - self.__dict__ = state - - def __init__( - self, - regex_format='[A-Za-z]{5}', - enforce_uniqueness=False, - generation_order='alphanumeric', - ): - super().__init__() - self.output_properties = {None: {'next_transformer': None}} - self.enforce_uniqueness = enforce_uniqueness - self.regex_format = regex_format - self.data_length = None - self.generator = None - self.generator_size = None - self.generated = None - if generation_order not in ['alphanumeric', 'scrambled']: - raise ValueError("generation_order must be one of 'alphanumeric' or 'scrambled'.") - - self.generation_order = generation_order - - def reset_randomization(self): - """Create a new generator and reset the generated values counter.""" - super().reset_randomization() - self.generator, self.generator_size = strings_from_regex(self.regex_format) - self.generated = 0 - - def _fit(self, data): - """Fit the transformer to the data. - - Args: - data (pandas.Series): - Data to fit to. - """ - self.reset_randomization() - self.data_length = len(data) - - def _transform(self, _data): - """Drop the input column by returning ``None``.""" - return None - - def _warn_not_enough_unique_values(self, sample_size): - """Warn the user that the regex cannot generate enough unique values. - - Args: - sample_size (int): - Number of samples to be generated. - """ - warned = False - if sample_size > self.generator_size: - if self.enforce_uniqueness: - warnings.warn( - f"The regex for '{self.get_input_column()}' can only generate " - f'{self.generator_size} unique values. Additional values may not exactly ' - 'follow the provided regex.' - ) - warned = True - else: - LOGGER.info( - "The data has %s rows but the regex for '%s' can only create %s unique values." - " Some values in '%s' may be repeated.", - sample_size, - self.get_input_column(), - self.generator_size, - self.get_input_column(), - ) - - remaining = self.generator_size - self.generated - if sample_size > remaining and self.enforce_uniqueness and not warned: - warnings.warn( - f'The regex generator is not able to generate {sample_size} new unique ' - f'values (only {max(remaining, 0)} unique values left).' - ) - - def _reverse_transform(self, data): - """Generate new data using the provided ``regex_format``. - - Args: - data (pd.Series or numpy.ndarray): - Data to transform. - - Returns: - pandas.Series - """ - if data is not None and len(data): - sample_size = len(data) - else: - sample_size = self.data_length - - self._warn_not_enough_unique_values(sample_size) - - remaining = self.generator_size - self.generated - if sample_size > remaining: - self.reset_randomization() - remaining = self.generator_size - - generated_values = [] - while len(generated_values) < sample_size: - try: - generated_values.append(next(self.generator)) - self.generated += 1 - except (RuntimeError, StopIteration): - # Can't generate more rows without collision so breaking out of loop - break - - reverse_transformed = generated_values[:] - - if len(reverse_transformed) < sample_size: - if self.enforce_uniqueness: - try: - remaining_samples = sample_size - len(reverse_transformed) - start = int(generated_values[-1]) + 1 - reverse_transformed.extend([ - str(i) for i in range(start, start + remaining_samples) - ]) - - except ValueError: - counter = 0 - while len(reverse_transformed) < sample_size: - remaining_samples = sample_size - len(reverse_transformed) - reverse_transformed.extend([ - f'{i}({counter})' for i in generated_values[:remaining_samples] - ]) - counter += 1 - - else: - while len(reverse_transformed) < sample_size: - remaining_samples = sample_size - len(reverse_transformed) - reverse_transformed.extend(generated_values[:remaining_samples]) - - if getattr(self, 'generation_order', 'alphanumeric') == 'scrambled': - np.random.shuffle(reverse_transformed) - - return np.array(reverse_transformed, dtype=object) +warnings.warn( + "Importing 'IDGenerator' or 'RegexGenerator' for ID columns from 'rdt.transformers.text' " + "is deprecated. Please use 'rdt.transformers.id' instead.", + DeprecationWarning, + stacklevel=2, +) diff --git a/tests/integration/test_transformers.py b/tests/integration/test_transformers.py index aeefe293..ad8b399e 100644 --- a/tests/integration/test_transformers.py +++ b/tests/integration/test_transformers.py @@ -34,7 +34,7 @@ 'integer': ['i'], 'float': ['f', 'i'], 'pii': ['O', 'i', 'f'], - 'text': ['O', 'i', 'f'], + 'id': ['O', 'i', 'f'], } @@ -71,7 +71,11 @@ def _is_valid_transformer(transformer_name): def _get_all_transformers(): """Get all transformers to be tested.""" all_transformers = BaseTransformer.get_subclasses() - return [t for t in all_transformers if _is_valid_transformer(t.__name__)] + return [ + transformer + for transformer in all_transformers + if _is_valid_transformer(transformer.__name__) + ] def _build_generator_map(): diff --git a/tests/integration/transformers/pii/test_anonymizer.py b/tests/integration/transformers/pii/test_anonymizer.py index 1d5a1d5f..d99dc90e 100644 --- a/tests/integration/transformers/pii/test_anonymizer.py +++ b/tests/integration/transformers/pii/test_anonymizer.py @@ -47,7 +47,7 @@ def test_get_supported_sdtypes(self): supported_sdtypes = AnonymizedFaker.get_supported_sdtypes() # Assert - assert sorted(supported_sdtypes) == sorted(['pii', 'text']) + assert sorted(supported_sdtypes) == sorted(['pii', 'text', 'id']) def test_custom_provider(self): """End to end test with a custom provider and function for the ``AnonymizedFaker``.""" diff --git a/tests/integration/transformers/test_text.py b/tests/integration/transformers/test_id.py similarity index 99% rename from tests/integration/transformers/test_text.py rename to tests/integration/transformers/test_id.py index 02085cc9..96f7b004 100644 --- a/tests/integration/transformers/test_text.py +++ b/tests/integration/transformers/test_id.py @@ -4,7 +4,7 @@ import pandas as pd from rdt import HyperTransformer, get_demo -from rdt.transformers.text import IDGenerator, RegexGenerator +from rdt.transformers.id import IDGenerator, RegexGenerator class TestIDGenerator: @@ -348,7 +348,7 @@ def test_end_to_end_scrambled(self): customers['id'] = ['id_a', 'id_b', 'id_c', 'id_d', 'id_e'] ht = HyperTransformer() ht.detect_initial_config(customers) - ht.update_sdtypes({'id': 'text'}) + ht.update_sdtypes({'id': 'id'}) ht.update_transformers({ 'id': RegexGenerator(regex_format='id_[a-z]', generation_order='scrambled') }) diff --git a/tests/unit/transformers/test___init__.py b/tests/unit/transformers/test___init__.py index e6542704..69c7d813 100644 --- a/tests/unit/transformers/test___init__.py +++ b/tests/unit/transformers/test___init__.py @@ -98,7 +98,7 @@ def test_get_default_transformers(): 'categorical': UniformEncoder, 'boolean': UniformEncoder, 'datetime': UnixTimestampEncoder, - 'text': RegexGenerator, + 'id': RegexGenerator, 'pii': AnonymizedFaker, } diff --git a/tests/unit/transformers/test_id.py b/tests/unit/transformers/test_id.py new file mode 100644 index 00000000..b3f8d093 --- /dev/null +++ b/tests/unit/transformers/test_id.py @@ -0,0 +1,591 @@ +"""Test for ID transformers.""" + +from string import ascii_uppercase +from unittest.mock import Mock, patch + +import numpy as np +import pandas as pd +import pytest + +from rdt.transformers.id import IDGenerator, RegexGenerator + + +class AsciiGenerator: + """Ascii Upercase Generator.""" + + def __init__(self, max_size=26): + self.pos = 0 + self.max_size = max_size + + def __iter__(self): + return self + + def __next__(self): + if self.pos >= self.max_size: + raise StopIteration + + char = ascii_uppercase[self.pos] + self.pos += 1 + + return char + + +class TestIDGenerator: + def test___init__default(self): + """Test the ``__init__`` method.""" + # Run + transformer = IDGenerator() + + # Assert + assert transformer.prefix is None + assert transformer.starting_value == 0 + assert transformer.suffix is None + assert transformer._counter == 0 + assert transformer.output_properties == {None: {'next_transformer': None}} + + def test___init__with_parameters(self): + """Test the ``__init__`` method with paremeters.""" + # Run + transformer_prefix = IDGenerator(prefix='prefix_') + transformer_suffix = IDGenerator(suffix='_suffix') + transformer_starting_value = IDGenerator(starting_value=10) + transformer_all = IDGenerator(prefix='prefix_', starting_value=10, suffix='_suffix') + + # Assert + assert transformer_prefix.prefix == 'prefix_' + assert transformer_prefix.starting_value == 0 + assert transformer_prefix.suffix is None + assert transformer_prefix._counter == 0 + assert transformer_prefix.output_properties == {None: {'next_transformer': None}} + + assert transformer_suffix.prefix is None + assert transformer_suffix.starting_value == 0 + assert transformer_suffix.suffix == '_suffix' + assert transformer_suffix._counter == 0 + assert transformer_suffix.output_properties == {None: {'next_transformer': None}} + + assert transformer_starting_value.prefix is None + assert transformer_starting_value.starting_value == 10 + assert transformer_starting_value.suffix is None + assert transformer_starting_value._counter == 0 + assert transformer_starting_value.output_properties == {None: {'next_transformer': None}} + + assert transformer_all.prefix == 'prefix_' + assert transformer_all.starting_value == 10 + assert transformer_all.suffix == '_suffix' + assert transformer_all._counter == 0 + assert transformer_all.output_properties == {None: {'next_transformer': None}} + + def test_reset_randomization(self): + """Test the ``reset_randomization`` method.""" + # Setup + transformer = IDGenerator() + transformer._counter = 10 + + # Run + transformer.reset_randomization() + + # Assert + assert transformer._counter == 0 + + def test__fit(self): + """Test the ``_fit`` method.""" + # Setup + transformer = IDGenerator() + + # Run + transformer._fit(None) + + # Assert + assert True + + def test__transform(self): + """Test the ``_transform`` method.""" + # Setup + transformer = IDGenerator() + + # Run + result = transformer._transform(None) + + # Assert + assert result is None + + def test__reverse_transform(self): + """Test the ``_reverse_transform`` method.""" + # Setup + transformer = IDGenerator() + transformer._counter = 10 + + # Run + result = transformer._reverse_transform(np.array([1, 2, 3])) + + # Assert + assert isinstance(result, pd.Series) + assert result.tolist() == ['10', '11', '12'] + assert transformer._counter == 13 + + def test__reverse_transform_with_everything(self): + """Test the ``_reverse_transform`` method with all parameters.""" + # Setup + transformer = IDGenerator(prefix='prefix_', starting_value=100, suffix='_suffix') + + # Run + result = transformer._reverse_transform(np.array([1, 2, 3])) + + # Assert + assert isinstance(result, pd.Series) + assert result.tolist() == [ + 'prefix_100_suffix', + 'prefix_101_suffix', + 'prefix_102_suffix', + ] + assert transformer._counter == 3 + + +class TestRegexGenerator: + """Test class for ``RegexGenerator``.""" + + def test___getstate__(self): + """Test that ``__getstate__`` returns a dictionary without the generator.""" + # Setup + instance = RegexGenerator() + instance.reset_randomization() + mock_random_sates = Mock() + instance.random_states = mock_random_sates + + # Run + state = instance.__getstate__() + + # Assert + assert state == { + 'data_length': None, + 'enforce_uniqueness': False, + 'generated': 0, + 'generator_size': 380204032, + 'output_properties': {None: {'next_transformer': None}}, + 'regex_format': '[A-Za-z]{5}', + 'random_states': mock_random_sates, + 'generation_order': 'alphanumeric', + } + + @patch('rdt.transformers.id.strings_from_regex') + def test___setstate__generated_and_generator_size(self, mock_strings_from_regex): + """Test that ``__setstate__`` will initialize a generator and wind it forward.""" + # Setup + state = { + 'data_length': None, + 'enforce_uniqueness': False, + 'generated': 10, + 'generator_size': 380204032, + 'output_properties': {None: {'next_transformer': None}}, + 'regex_format': '[A-Za-z]{5}', + } + generator = AsciiGenerator() + mock_strings_from_regex.return_value = (generator, 26) + instance = RegexGenerator() + + # Run + instance.__setstate__(state) + + # Assert + assert next(generator) == 'K' + assert instance.generated == 10 + assert instance.generator_size == 380204032 + mock_strings_from_regex.assert_called_once_with('[A-Za-z]{5}') + + @patch('rdt.transformers.id.strings_from_regex') + def test___setstate__(self, mock_strings_from_regex): + """Test that ``__setstate__`` will initialize a generator but not forward it. + + When ``generated`` is ``None`` and ``generator_size`` is ``None`` this will be assigned + the ``0`` and the ``generator_size`` respectively. + """ + # Setup + state = { + 'data_length': None, + 'enforce_uniqueness': False, + 'generated': None, + 'generator_size': None, + 'output_properties': {None: {'next_transformer': None}}, + 'regex_format': '[A-Za-z]{5}', + } + generator = AsciiGenerator() + mock_strings_from_regex.return_value = (generator, 26) + instance = RegexGenerator() + + # Run + instance.__setstate__(state) + + # Assert + assert next(generator) == 'A' + assert instance.generated == 0 + assert instance.generator_size == 26 + mock_strings_from_regex.assert_called_once_with('[A-Za-z]{5}') + + def test___init__default(self): + """Test the default instantiation of the transformer. + + Test that ``RegexGenerator`` defaults to ``regex_format='[A-Za-z]{5}'`` + + Side effects: + - the ``instance.regex_format`` is ``'[A-Za-z]{5}'``'. + """ + # Run + instance = RegexGenerator() + + # Assert + assert instance.data_length is None + assert instance.regex_format == '[A-Za-z]{5}' + assert instance.enforce_uniqueness is False + assert instance.generation_order == 'alphanumeric' + + def test___init__custom(self): + """Test the default instantiation of the transformer. + + Test that when creating an instance of ``RegexGenerator`` and passing a + ``regex_format`` this is being stored. + + Side effects: + - the ``instance.regex_format`` is ``'[A-Za-z]{5}'``'. + - ``instance.enforce_uniqueness`` is ``True``. + """ + # Run + instance = RegexGenerator( + regex_format='[0-9]', + enforce_uniqueness=True, + generation_order='scrambled', + ) + + # Assert + assert instance.data_length is None + assert instance.regex_format == '[0-9]' + assert instance.enforce_uniqueness + assert instance.generation_order == 'scrambled' + + def test___init__bad_value_generation_order(self): + """Test that an error is raised if a bad value is given for `generation_order`.""" + # Run and Assert + error_message = "generation_order must be one of 'alphanumeric' or 'scrambled'." + with pytest.raises(ValueError, match=error_message): + RegexGenerator(generation_order='afdsfd') + + @patch('rdt.transformers.id.BaseTransformer.reset_randomization') + @patch('rdt.transformers.id.strings_from_regex') + def test_reset_randomization(self, mock_strings_from_regex, mock_base_reset): + """Test that this method creates a new generator. + + This method should create a new ``instance.generator``, ``instance.generator_size`` and + restart the ``instance.generated`` values to 0. + """ + # Setup + generator = AsciiGenerator(5) + mock_strings_from_regex.return_value = (generator, 2) + instance = RegexGenerator() + + # Run + instance.reset_randomization() + + # Assert + assert instance.generator == generator + assert instance.generator_size == 2 + assert instance.generated == 0 + mock_strings_from_regex.assert_called_once_with('[A-Za-z]{5}') + mock_base_reset.assert_called_once() + + def test__fit(self): + """Test the ``_fit`` method. + + Validate that the ``_fit`` method learns the original data length. + + Setup: + - Initialize a ``RegexGenerator`` transformer. + + Input: + - ``pd.Series`` containing 3 strings. + + Side Effects: + - ``instance.data_length`` equals to the length of the input data. + """ + # Setup + instance = RegexGenerator() + columns_data = pd.Series(['1', '2', '3']) + + # Run + instance._fit(columns_data) + + # Assert + assert instance.data_length == 3 + assert instance.output_properties == {None: {'next_transformer': None}} + + def test__transform(self): + """Test the ``_transform`` method. + + Validate that the ``_transform`` method returns ``None``. + + Setup: + - Initialize a ``RegexGenerator`` transformer. + + Input: + - ``pd.Series`` with three values. + + Output: + - ``None``. + """ + # Setup + columns_data = pd.Series([1, 2, 3]) + instance = RegexGenerator() + + # Run + result = instance._transform(columns_data) + + # Assert + assert result is None + + @patch('rdt.transformers.id.np.random.shuffle') + def test__reverse_transform_generation_order_scrambled(self, shuffle_mock): + """Test the ``_reverse_transform`` method with ``generation_order`` set to scrambled. + + Validate that when ``generation_order`` is ``'scrambled'``, the data is not in order. + """ + # Setup + instance = RegexGenerator('[A-Z]') + columns_data = pd.Series() + instance.data_length = 3 + generator = AsciiGenerator(max_size=5) + instance.generator = generator + instance.generator_size = 5 + instance.generated = 0 + instance.generation_order = 'scrambled' + + # Run + result = instance._reverse_transform(columns_data) + + # Assert + np.testing.assert_array_equal(result, np.array(['A', 'B', 'C'])) + shuffle_mock.assert_called_once_with(['A', 'B', 'C']) + + def test__reverse_transform_generator_size_bigger_than_data_length(self): + """Test the ``_reverse_transform`` method. + + Validate that the ``_reverse_transform`` method uses the ``instance.generator`` + to generate the ``instance.data_length`` number of data. + + Setup: + - Initialize a ``RegexGenerator`` instance. + - Set ``data_length`` to 3. + - Initialize a generator. + - Set a generator, generator size and generated values. + + Output: + - A ``numpy.array`` with the first three letters from the generator. + """ + # Setup + instance = RegexGenerator('[A-Z]') + columns_data = pd.Series() + instance.data_length = 3 + generator = AsciiGenerator(max_size=5) + instance.generator = generator + instance.generator_size = 5 + instance.generated = 0 + + # Run + result = instance._reverse_transform(columns_data) + + # Assert + np.testing.assert_array_equal(result, np.array(['A', 'B', 'C'])) + + def test__reverse_transform_generator_size_smaller_than_data_length(self): + """Test the ``_reverse_transform`` method. + + Validate that the ``_reverse_transform`` method uses the ``instance.generator`` + to generate the ``instance.data_length`` number of data when ``enforce_uniqueness`` is + ``False`` but the data to be created is bigger. + + Setup: + - Initialize a ``RegexGenerator`` instance. + - Set ``data_length`` to 11. + - Initialize a generator. + + Output: + - A ``numpy.array`` with the first five letters from the generator repeated. + """ + # Setup + instance = RegexGenerator('[A-Z]', enforce_uniqueness=False) + columns_data = pd.Series() + instance.reset_randomization = Mock() + instance.data_length = 11 + generator = AsciiGenerator(5) + instance.columns = ['a'] + instance.generator = generator + instance.generator_size = 5 + instance.generated = 0 + + # Run + result = instance._reverse_transform(columns_data) + + # Assert + expected_result = np.array([ + 'A', + 'B', + 'C', + 'D', + 'E', + 'A', + 'B', + 'C', + 'D', + 'E', + 'A', + ]) + np.testing.assert_array_equal(result, expected_result) + + def test__reverse_transform_generator_size_of_input_data(self): + """Test the ``_reverse_transform`` method. + + Validate that the ``_reverse_transform`` method uses the ``instance.generator`` + to generate the ``instance.data_length`` number of data when ``enforce_uniqueness`` is + ``False`` but the data to be created is bigger. + + Setup: + - Initialize a ``RegexGenerator`` instance. + - Set ``data_length`` to 2. + - Initialize a generator. + + Input: + - ``pandas.Series`` with a length of ``4``. + + Output: + - A ``numpy.array`` with the first five letters from the generator repeated. + """ + # Setup + instance = RegexGenerator('[A-Z]') + columns_data = pd.Series([1, 2, 3, 4]) + instance.data_length = 2 + generator = AsciiGenerator(5) + instance.generator = generator + instance.generator_size = 5 + instance.generated = 0 + instance.columns = ['a'] + + # Run + result = instance._reverse_transform(columns_data) + + # Assert + expected_result = np.array(['A', 'B', 'C', 'D']) + np.testing.assert_array_equal(result, expected_result) + assert instance.generated == 4 + + @patch('rdt.transformers.id.warnings') + def test__reverse_transform_not_enough_unique_values_enforce_uniqueness(self, mock_warnings): + """Test it when there are not enough unique values to generate.""" + # Setup + instance = RegexGenerator('[A-E]', enforce_uniqueness=True) + instance.data_length = 6 + generator = AsciiGenerator(5) + instance.generator = generator + instance.generator_size = 5 + instance.generated = 0 + instance.columns = ['a'] + columns_data = pd.Series() + + # Run + out = instance._reverse_transform(columns_data) + + # Assert + mock_warnings.warn.assert_called_once_with( + "The regex for 'a' can only generate 5 " + 'unique values. Additional values may not exactly follow the provided regex.' + ) + np.testing.assert_array_equal(out, np.array(['A', 'B', 'C', 'D', 'E', 'A(0)'])) + + def test__reverse_transform_not_enough_unique_values(self): + """Test it when there are not enough unique values to generate.""" + # Setup + instance = RegexGenerator('[A-E]', enforce_uniqueness=False) + instance.data_length = 6 + generator = AsciiGenerator(5) + instance.generator = generator + instance.generator_size = 5 + instance.generated = 0 + instance.columns = ['a'] + columns_data = pd.Series() + + # Run + out = instance._reverse_transform(columns_data) + + # Assert + np.testing.assert_array_equal(out, np.array(['A', 'B', 'C', 'D', 'E', 'A'])) + + @patch('rdt.transformers.id.warnings') + def test__reverse_transform_not_enough_unique_values_numerical(self, mock_warnings): + """Test it when there are not enough unique values to generate.""" + # Setup + instance = RegexGenerator('[1-3]', enforce_uniqueness=True) + instance.data_length = 6 + generator = AsciiGenerator(5) + instance.generator = generator + instance.generator_size = 3 + instance.generated = 0 + instance.columns = ['a'] + columns_data = pd.Series() + + # Run + out = instance._reverse_transform(columns_data) + + # Assert + mock_warnings.warn.assert_called_once_with( + "The regex for 'a' can only generate 3 " + 'unique values. Additional values may not exactly follow the provided regex.' + ) + np.testing.assert_array_equal(out, np.array(['1', '2', '3', '4', '5', '6'])) + + @patch('rdt.transformers.id.warnings') + def test__reverse_transform_enforce_uniqueness_not_enough_remaining(self, mock_warnings): + """Test the case when there are not enough unique values remaining.""" + # Setup + instance = RegexGenerator('[A-Z]', enforce_uniqueness=True) + instance.data_length = 6 + generator = AsciiGenerator(10) + instance.generator = generator + instance.generator_size = 10 + instance.generated = 9 + instance.columns = ['a'] + columns_data = pd.Series() + + # Run + out = instance._reverse_transform(columns_data) + + # Assert + mock_warnings.warn.assert_called_once_with( + 'The regex generator is not able to generate 6 new unique ' + 'values (only 1 unique values left).' + ) + np.testing.assert_array_equal(out, np.array(['A', 'B', 'C', 'D', 'E', 'F'])) + + @patch('rdt.transformers.id.LOGGER') + def test__reverse_transform_info_message(self, mock_logger): + """Test the ``_reverse_transform`` method. + + Validate that the ``_reverse_transform`` method logs an info message when + ``enforce_uniqueness`` is ``False`` and the ``instance.data_length`` is bigger than + ``instance.generator_size``. + """ + # Setup + instance = RegexGenerator('[A-Z]', enforce_uniqueness=False) + instance.data_length = 6 + instance.generator_size = 5 + instance.generated = 0 + instance.columns = ['a'] + columns_data = pd.Series() + + # Run + instance._reverse_transform(columns_data) + + # Assert + expected_format = ( + "The data has %s rows but the regex for '%s' can only create %s unique values. Some " + "values in '%s' may be repeated." + ) + expected_args = (6, 'a', 5, 'a') + + mock_logger.info.assert_called_once_with(expected_format, *expected_args) diff --git a/tests/unit/transformers/test_text.py b/tests/unit/transformers/test_text.py index 466cad7d..56cd528f 100644 --- a/tests/unit/transformers/test_text.py +++ b/tests/unit/transformers/test_text.py @@ -1,591 +1,14 @@ """Test Text Transformers.""" -from string import ascii_uppercase -from unittest.mock import Mock, patch - -import numpy as np -import pandas as pd import pytest -from rdt.transformers.text import IDGenerator, RegexGenerator - - -class AsciiGenerator: - """Ascii Upercase Generator.""" - - def __init__(self, max_size=26): - self.pos = 0 - self.max_size = max_size - - def __iter__(self): - return self - - def __next__(self): - if self.pos >= self.max_size: - raise StopIteration - - char = ascii_uppercase[self.pos] - self.pos += 1 - - return char - - -class TestIDGenerator: - def test___init__default(self): - """Test the ``__init__`` method.""" - # Run - transformer = IDGenerator() - - # Assert - assert transformer.prefix is None - assert transformer.starting_value == 0 - assert transformer.suffix is None - assert transformer._counter == 0 - assert transformer.output_properties == {None: {'next_transformer': None}} - - def test___init__with_parameters(self): - """Test the ``__init__`` method with paremeters.""" - # Run - transformer_prefix = IDGenerator(prefix='prefix_') - transformer_suffix = IDGenerator(suffix='_suffix') - transformer_starting_value = IDGenerator(starting_value=10) - transformer_all = IDGenerator(prefix='prefix_', starting_value=10, suffix='_suffix') - - # Assert - assert transformer_prefix.prefix == 'prefix_' - assert transformer_prefix.starting_value == 0 - assert transformer_prefix.suffix is None - assert transformer_prefix._counter == 0 - assert transformer_prefix.output_properties == {None: {'next_transformer': None}} - - assert transformer_suffix.prefix is None - assert transformer_suffix.starting_value == 0 - assert transformer_suffix.suffix == '_suffix' - assert transformer_suffix._counter == 0 - assert transformer_suffix.output_properties == {None: {'next_transformer': None}} - - assert transformer_starting_value.prefix is None - assert transformer_starting_value.starting_value == 10 - assert transformer_starting_value.suffix is None - assert transformer_starting_value._counter == 0 - assert transformer_starting_value.output_properties == {None: {'next_transformer': None}} - - assert transformer_all.prefix == 'prefix_' - assert transformer_all.starting_value == 10 - assert transformer_all.suffix == '_suffix' - assert transformer_all._counter == 0 - assert transformer_all.output_properties == {None: {'next_transformer': None}} - - def test_reset_randomization(self): - """Test the ``reset_randomization`` method.""" - # Setup - transformer = IDGenerator() - transformer._counter = 10 - - # Run - transformer.reset_randomization() - - # Assert - assert transformer._counter == 0 - - def test__fit(self): - """Test the ``_fit`` method.""" - # Setup - transformer = IDGenerator() - - # Run - transformer._fit(None) - - # Assert - assert True - - def test__transform(self): - """Test the ``_transform`` method.""" - # Setup - transformer = IDGenerator() - - # Run - result = transformer._transform(None) - - # Assert - assert result is None - - def test__reverse_transform(self): - """Test the ``_reverse_transform`` method.""" - # Setup - transformer = IDGenerator() - transformer._counter = 10 - - # Run - result = transformer._reverse_transform(np.array([1, 2, 3])) - - # Assert - assert isinstance(result, pd.Series) - assert result.tolist() == ['10', '11', '12'] - assert transformer._counter == 13 - - def test__reverse_transform_with_everything(self): - """Test the ``_reverse_transform`` method with all parameters.""" - # Setup - transformer = IDGenerator(prefix='prefix_', starting_value=100, suffix='_suffix') - - # Run - result = transformer._reverse_transform(np.array([1, 2, 3])) - - # Assert - assert isinstance(result, pd.Series) - assert result.tolist() == [ - 'prefix_100_suffix', - 'prefix_101_suffix', - 'prefix_102_suffix', - ] - assert transformer._counter == 3 - - -class TestRegexGenerator: - """Test class for ``RegexGenerator``.""" - - def test___getstate__(self): - """Test that ``__getstate__`` returns a dictionary without the generator.""" - # Setup - instance = RegexGenerator() - instance.reset_randomization() - mock_random_sates = Mock() - instance.random_states = mock_random_sates - - # Run - state = instance.__getstate__() - - # Assert - assert state == { - 'data_length': None, - 'enforce_uniqueness': False, - 'generated': 0, - 'generator_size': 380204032, - 'output_properties': {None: {'next_transformer': None}}, - 'regex_format': '[A-Za-z]{5}', - 'random_states': mock_random_sates, - 'generation_order': 'alphanumeric', - } - - @patch('rdt.transformers.text.strings_from_regex') - def test___setstate__generated_and_generator_size(self, mock_strings_from_regex): - """Test that ``__setstate__`` will initialize a generator and wind it forward.""" - # Setup - state = { - 'data_length': None, - 'enforce_uniqueness': False, - 'generated': 10, - 'generator_size': 380204032, - 'output_properties': {None: {'next_transformer': None}}, - 'regex_format': '[A-Za-z]{5}', - } - generator = AsciiGenerator() - mock_strings_from_regex.return_value = (generator, 26) - instance = RegexGenerator() - - # Run - instance.__setstate__(state) - - # Assert - assert next(generator) == 'K' - assert instance.generated == 10 - assert instance.generator_size == 380204032 - mock_strings_from_regex.assert_called_once_with('[A-Za-z]{5}') - - @patch('rdt.transformers.text.strings_from_regex') - def test___setstate__(self, mock_strings_from_regex): - """Test that ``__setstate__`` will initialize a generator but not forward it. - - When ``generated`` is ``None`` and ``generator_size`` is ``None`` this will be assigned - the ``0`` and the ``generator_size`` respectively. - """ - # Setup - state = { - 'data_length': None, - 'enforce_uniqueness': False, - 'generated': None, - 'generator_size': None, - 'output_properties': {None: {'next_transformer': None}}, - 'regex_format': '[A-Za-z]{5}', - } - generator = AsciiGenerator() - mock_strings_from_regex.return_value = (generator, 26) - instance = RegexGenerator() - - # Run - instance.__setstate__(state) - - # Assert - assert next(generator) == 'A' - assert instance.generated == 0 - assert instance.generator_size == 26 - mock_strings_from_regex.assert_called_once_with('[A-Za-z]{5}') - - def test___init__default(self): - """Test the default instantiation of the transformer. - - Test that ``RegexGenerator`` defaults to ``regex_format='[A-Za-z]{5}'`` - - Side effects: - - the ``instance.regex_format`` is ``'[A-Za-z]{5}'``'. - """ - # Run - instance = RegexGenerator() - - # Assert - assert instance.data_length is None - assert instance.regex_format == '[A-Za-z]{5}' - assert instance.enforce_uniqueness is False - assert instance.generation_order == 'alphanumeric' - - def test___init__custom(self): - """Test the default instantiation of the transformer. - - Test that when creating an instance of ``RegexGenerator`` and passing a - ``regex_format`` this is being stored. - - Side effects: - - the ``instance.regex_format`` is ``'[A-Za-z]{5}'``'. - - ``instance.enforce_uniqueness`` is ``True``. - """ - # Run - instance = RegexGenerator( - regex_format='[0-9]', - enforce_uniqueness=True, - generation_order='scrambled', - ) - - # Assert - assert instance.data_length is None - assert instance.regex_format == '[0-9]' - assert instance.enforce_uniqueness - assert instance.generation_order == 'scrambled' - - def test___init__bad_value_generation_order(self): - """Test that an error is raised if a bad value is given for `generation_order`.""" - # Run and Assert - error_message = "generation_order must be one of 'alphanumeric' or 'scrambled'." - with pytest.raises(ValueError, match=error_message): - RegexGenerator(generation_order='afdsfd') - - @patch('rdt.transformers.text.BaseTransformer.reset_randomization') - @patch('rdt.transformers.text.strings_from_regex') - def test_reset_randomization(self, mock_strings_from_regex, mock_base_reset): - """Test that this method creates a new generator. - - This method should create a new ``instance.generator``, ``instance.generator_size`` and - restart the ``instance.generated`` values to 0. - """ - # Setup - generator = AsciiGenerator(5) - mock_strings_from_regex.return_value = (generator, 2) - instance = RegexGenerator() - - # Run - instance.reset_randomization() - - # Assert - assert instance.generator == generator - assert instance.generator_size == 2 - assert instance.generated == 0 - mock_strings_from_regex.assert_called_once_with('[A-Za-z]{5}') - mock_base_reset.assert_called_once() - - def test__fit(self): - """Test the ``_fit`` method. - - Validate that the ``_fit`` method learns the original data length. - - Setup: - - Initialize a ``RegexGenerator`` transformer. - - Input: - - ``pd.Series`` containing 3 strings. - - Side Effects: - - ``instance.data_length`` equals to the length of the input data. - """ - # Setup - instance = RegexGenerator() - columns_data = pd.Series(['1', '2', '3']) - - # Run - instance._fit(columns_data) - - # Assert - assert instance.data_length == 3 - assert instance.output_properties == {None: {'next_transformer': None}} - - def test__transform(self): - """Test the ``_transform`` method. - - Validate that the ``_transform`` method returns ``None``. - - Setup: - - Initialize a ``RegexGenerator`` transformer. - - Input: - - ``pd.Series`` with three values. - - Output: - - ``None``. - """ - # Setup - columns_data = pd.Series([1, 2, 3]) - instance = RegexGenerator() - - # Run - result = instance._transform(columns_data) - - # Assert - assert result is None - - @patch('rdt.transformers.text.np.random.shuffle') - def test__reverse_transform_generation_order_scrambled(self, shuffle_mock): - """Test the ``_reverse_transform`` method with ``generation_order`` set to scrambled. - - Validate that when ``generation_order`` is ``'scrambled'``, the data is not in order. - """ - # Setup - instance = RegexGenerator('[A-Z]') - columns_data = pd.Series() - instance.data_length = 3 - generator = AsciiGenerator(max_size=5) - instance.generator = generator - instance.generator_size = 5 - instance.generated = 0 - instance.generation_order = 'scrambled' - - # Run - result = instance._reverse_transform(columns_data) - - # Assert - np.testing.assert_array_equal(result, np.array(['A', 'B', 'C'])) - shuffle_mock.assert_called_once_with(['A', 'B', 'C']) - - def test__reverse_transform_generator_size_bigger_than_data_length(self): - """Test the ``_reverse_transform`` method. - - Validate that the ``_reverse_transform`` method uses the ``instance.generator`` - to generate the ``instance.data_length`` number of data. - - Setup: - - Initialize a ``RegexGenerator`` instance. - - Set ``data_length`` to 3. - - Initialize a generator. - - Set a generator, generator size and generated values. - - Output: - - A ``numpy.array`` with the first three letters from the generator. - """ - # Setup - instance = RegexGenerator('[A-Z]') - columns_data = pd.Series() - instance.data_length = 3 - generator = AsciiGenerator(max_size=5) - instance.generator = generator - instance.generator_size = 5 - instance.generated = 0 - - # Run - result = instance._reverse_transform(columns_data) - - # Assert - np.testing.assert_array_equal(result, np.array(['A', 'B', 'C'])) - - def test__reverse_transform_generator_size_smaller_than_data_length(self): - """Test the ``_reverse_transform`` method. - - Validate that the ``_reverse_transform`` method uses the ``instance.generator`` - to generate the ``instance.data_length`` number of data when ``enforce_uniqueness`` is - ``False`` but the data to be created is bigger. - - Setup: - - Initialize a ``RegexGenerator`` instance. - - Set ``data_length`` to 11. - - Initialize a generator. - - Output: - - A ``numpy.array`` with the first five letters from the generator repeated. - """ - # Setup - instance = RegexGenerator('[A-Z]', enforce_uniqueness=False) - columns_data = pd.Series() - instance.reset_randomization = Mock() - instance.data_length = 11 - generator = AsciiGenerator(5) - instance.columns = ['a'] - instance.generator = generator - instance.generator_size = 5 - instance.generated = 0 - - # Run - result = instance._reverse_transform(columns_data) - - # Assert - expected_result = np.array([ - 'A', - 'B', - 'C', - 'D', - 'E', - 'A', - 'B', - 'C', - 'D', - 'E', - 'A', - ]) - np.testing.assert_array_equal(result, expected_result) - - def test__reverse_transform_generator_size_of_input_data(self): - """Test the ``_reverse_transform`` method. - - Validate that the ``_reverse_transform`` method uses the ``instance.generator`` - to generate the ``instance.data_length`` number of data when ``enforce_uniqueness`` is - ``False`` but the data to be created is bigger. - - Setup: - - Initialize a ``RegexGenerator`` instance. - - Set ``data_length`` to 2. - - Initialize a generator. - - Input: - - ``pandas.Series`` with a length of ``4``. - - Output: - - A ``numpy.array`` with the first five letters from the generator repeated. - """ - # Setup - instance = RegexGenerator('[A-Z]') - columns_data = pd.Series([1, 2, 3, 4]) - instance.data_length = 2 - generator = AsciiGenerator(5) - instance.generator = generator - instance.generator_size = 5 - instance.generated = 0 - instance.columns = ['a'] - - # Run - result = instance._reverse_transform(columns_data) - - # Assert - expected_result = np.array(['A', 'B', 'C', 'D']) - np.testing.assert_array_equal(result, expected_result) - assert instance.generated == 4 - - @patch('rdt.transformers.text.warnings') - def test__reverse_transform_not_enough_unique_values_enforce_uniqueness(self, mock_warnings): - """Test it when there are not enough unique values to generate.""" - # Setup - instance = RegexGenerator('[A-E]', enforce_uniqueness=True) - instance.data_length = 6 - generator = AsciiGenerator(5) - instance.generator = generator - instance.generator_size = 5 - instance.generated = 0 - instance.columns = ['a'] - columns_data = pd.Series() - - # Run - out = instance._reverse_transform(columns_data) - - # Assert - mock_warnings.warn.assert_called_once_with( - "The regex for 'a' can only generate 5 " - 'unique values. Additional values may not exactly follow the provided regex.' - ) - np.testing.assert_array_equal(out, np.array(['A', 'B', 'C', 'D', 'E', 'A(0)'])) - - def test__reverse_transform_not_enough_unique_values(self): - """Test it when there are not enough unique values to generate.""" - # Setup - instance = RegexGenerator('[A-E]', enforce_uniqueness=False) - instance.data_length = 6 - generator = AsciiGenerator(5) - instance.generator = generator - instance.generator_size = 5 - instance.generated = 0 - instance.columns = ['a'] - columns_data = pd.Series() - - # Run - out = instance._reverse_transform(columns_data) - - # Assert - np.testing.assert_array_equal(out, np.array(['A', 'B', 'C', 'D', 'E', 'A'])) - - @patch('rdt.transformers.text.warnings') - def test__reverse_transform_not_enough_unique_values_numerical(self, mock_warnings): - """Test it when there are not enough unique values to generate.""" - # Setup - instance = RegexGenerator('[1-3]', enforce_uniqueness=True) - instance.data_length = 6 - generator = AsciiGenerator(5) - instance.generator = generator - instance.generator_size = 3 - instance.generated = 0 - instance.columns = ['a'] - columns_data = pd.Series() - - # Run - out = instance._reverse_transform(columns_data) - - # Assert - mock_warnings.warn.assert_called_once_with( - "The regex for 'a' can only generate 3 " - 'unique values. Additional values may not exactly follow the provided regex.' - ) - np.testing.assert_array_equal(out, np.array(['1', '2', '3', '4', '5', '6'])) - - @patch('rdt.transformers.text.warnings') - def test__reverse_transform_enforce_uniqueness_not_enough_remaining(self, mock_warnings): - """Test the case when there are not enough unique values remaining.""" - # Setup - instance = RegexGenerator('[A-Z]', enforce_uniqueness=True) - instance.data_length = 6 - generator = AsciiGenerator(10) - instance.generator = generator - instance.generator_size = 10 - instance.generated = 9 - instance.columns = ['a'] - columns_data = pd.Series() - - # Run - out = instance._reverse_transform(columns_data) - - # Assert - mock_warnings.warn.assert_called_once_with( - 'The regex generator is not able to generate 6 new unique ' - 'values (only 1 unique values left).' - ) - np.testing.assert_array_equal(out, np.array(['A', 'B', 'C', 'D', 'E', 'F'])) - - @patch('rdt.transformers.text.LOGGER') - def test__reverse_transform_info_message(self, mock_logger): - """Test the ``_reverse_transform`` method. - - Validate that the ``_reverse_transform`` method logs an info message when - ``enforce_uniqueness`` is ``False`` and the ``instance.data_length`` is bigger than - ``instance.generator_size``. - """ - # Setup - instance = RegexGenerator('[A-Z]', enforce_uniqueness=False) - instance.data_length = 6 - instance.generator_size = 5 - instance.generated = 0 - instance.columns = ['a'] - columns_data = pd.Series() - - # Run - instance._reverse_transform(columns_data) - - # Assert - expected_format = ( - "The data has %s rows but the regex for '%s' can only create %s unique values. Some " - "values in '%s' may be repeated." - ) - expected_args = (6, 'a', 5, 'a') - mock_logger.info.assert_called_once_with(expected_format, *expected_args) +def test_deprecation_warning_is_raised(): + """Test that a deprecation warning is raised when importing from this module.""" + # Run and Assert + expected_message = ( + "Importing 'IDGenerator' or 'RegexGenerator' for ID columns from 'rdt.transformers.text' " + "is deprecated. Please use 'rdt.transformers.id' instead." + ) + with pytest.warns(DeprecationWarning, match=expected_message): + from rdt.transformers.text import IDGenerator, RegexGenerator # noqa: F401