From 97fed3f591564be75fec2e43c6a9bd9b3c1718b5 Mon Sep 17 00:00:00 2001 From: Felipe Date: Tue, 7 Feb 2023 08:37:51 -0800 Subject: [PATCH 1/7] Add code --- sdv/single_table/copulagan.py | 14 ++++++++++++++ sdv/single_table/copulas.py | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/sdv/single_table/copulagan.py b/sdv/single_table/copulagan.py index 27ac770a3..12e46a211 100644 --- a/sdv/single_table/copulagan.py +++ b/sdv/single_table/copulagan.py @@ -3,6 +3,7 @@ import rdt +from sdv.errors import SynthesizerInputError from sdv.single_table.copulas import GaussianCopulaSynthesizer from sdv.single_table.ctgan import CTGANSynthesizer from sdv.single_table.utils import validate_numerical_distributions @@ -109,6 +110,19 @@ class CopulaGANSynthesizer(CTGANSynthesizer): _gaussian_normalizer_hyper_transformer = None + def _validate_numerical_distributions(self, numerical_distributions): + if numerical_distributions: + if not isinstance(numerical_distributions, dict): + raise TypeError('numerical_distributions can only be None or a dict instance.') + + invalid_columns = numerical_distributions.keys() - dict(self.metadata._columns) + if invalid_columns: + raise SynthesizerInputError( + 'Invalid column names found in the numerical_distributions dictionary ' + f'{invalid_columns}. The column names you provide must be present ' + 'in the metadata.' + ) + def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256), generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4, diff --git a/sdv/single_table/copulas.py b/sdv/single_table/copulas.py index 4dd13c88e..824438931 100644 --- a/sdv/single_table/copulas.py +++ b/sdv/single_table/copulas.py @@ -10,7 +10,7 @@ from copulas import multivariate from rdt.transformers import OneHotEncoder -from sdv.errors import NonParametricError +from sdv.errors import NonParametricError, SynthesizerInputError from sdv.single_table.base import BaseSingleTableSynthesizer from sdv.single_table.utils import flatten_dict, unflatten_dict, validate_numerical_distributions @@ -83,6 +83,19 @@ def get_distribution_class(cls, distribution): return cls._DISTRIBUTIONS[distribution] + def _validate_numerical_distributions(self, numerical_distributions): + if numerical_distributions: + if not isinstance(numerical_distributions, dict): + raise TypeError('numerical_distributions can only be None or a dict instance.') + + invalid_columns = numerical_distributions.keys() - dict(self.metadata._columns) + if invalid_columns: + raise SynthesizerInputError( + 'Invalid column names found in the numerical_distributions dictionary ' + f'{invalid_columns}. The column names you provide must be present ' + 'in the metadata.' + ) + def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True, numerical_distributions=None, default_distribution=None): super().__init__( From b27a46cc18350b998bf8c75ed39acc2e4158228f Mon Sep 17 00:00:00 2001 From: Felipe Date: Sun, 5 Feb 2023 19:45:51 -0800 Subject: [PATCH 2/7] Add LOGGER, weird bug --- sdv/single_table/copulagan.py | 14 ++++++++++++-- sdv/single_table/copulas.py | 15 +++++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/sdv/single_table/copulagan.py b/sdv/single_table/copulagan.py index 12e46a211..00e4a423b 100644 --- a/sdv/single_table/copulagan.py +++ b/sdv/single_table/copulagan.py @@ -1,5 +1,6 @@ """Combination of GaussianCopula transformation and GANs.""" from copy import deepcopy +import logging import rdt @@ -8,6 +9,7 @@ from sdv.single_table.ctgan import CTGANSynthesizer from sdv.single_table.utils import validate_numerical_distributions +LOGGER = logging.getLogger(__name__) class CopulaGANSynthesizer(CTGANSynthesizer): """Combination of GaussianCopula transformation and GANs. @@ -115,7 +117,7 @@ def _validate_numerical_distributions(self, numerical_distributions): if not isinstance(numerical_distributions, dict): raise TypeError('numerical_distributions can only be None or a dict instance.') - invalid_columns = numerical_distributions.keys() - dict(self.metadata._columns) + invalid_columns = numerical_distributions.keys() - set(self.metadata._columns) if invalid_columns: raise SynthesizerInputError( 'Invalid column names found in the numerical_distributions dictionary ' @@ -159,7 +161,7 @@ def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True, ) self._numerical_distributions = { field: GaussianCopulaSynthesizer.get_distribution_class(distribution) - for field, distribution in (numerical_distributions or {}).items() + for field, distribution in self.numerical_distributions.items() } def _create_gaussian_normalizer_config(self, processed_data): @@ -193,6 +195,14 @@ def _fit(self, processed_data): processed_data (pandas.DataFrame): Data to be learned. """ + unseen_columns = self._numerical_distributions.keys() - dict(processed_data.columns) + for column in unseen_columns: + LOGGER.info( + f"Requested distribution {self._numerical_distributions['column']} " + f'cannot be applied to column {column} because it no longer ' + 'exists after preprocessing.' + ) + gaussian_normalizer_config = self._create_gaussian_normalizer_config(processed_data) self._gaussian_normalizer_hyper_transformer = rdt.HyperTransformer() diff --git a/sdv/single_table/copulas.py b/sdv/single_table/copulas.py index 824438931..bb339aae4 100644 --- a/sdv/single_table/copulas.py +++ b/sdv/single_table/copulas.py @@ -1,4 +1,5 @@ """Wrappers around copulas models.""" +import logging import warnings from copy import deepcopy @@ -14,6 +15,7 @@ from sdv.single_table.base import BaseSingleTableSynthesizer from sdv.single_table.utils import flatten_dict, unflatten_dict, validate_numerical_distributions +LOGGER = logging.getLogger(__name__) class GaussianCopulaSynthesizer(BaseSingleTableSynthesizer): """Model wrapping ``copulas.multivariate.GaussianMultivariate`` copula. @@ -88,7 +90,7 @@ def _validate_numerical_distributions(self, numerical_distributions): if not isinstance(numerical_distributions, dict): raise TypeError('numerical_distributions can only be None or a dict instance.') - invalid_columns = numerical_distributions.keys() - dict(self.metadata._columns) + invalid_columns = numerical_distributions.keys() - set(self.metadata._columns) if invalid_columns: raise SynthesizerInputError( 'Invalid column names found in the numerical_distributions dictionary ' @@ -110,7 +112,7 @@ def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True, self._default_distribution = self.get_distribution_class(self.default_distribution) self._numerical_distributions = { field: self.get_distribution_class(distribution) - for field, distribution in (numerical_distributions or {}).items() + for field, distribution in self.numerical_distributions.items() } self._num_rows = None @@ -121,6 +123,14 @@ def _fit(self, processed_data): processed_data (pandas.DataFrame): Data to be learned. """ + unseen_columns = self._numerical_distributions.keys() - set(processed_data.columns) + for column in unseen_columns: + LOGGER.info( + f"Requested distribution {numerical_distributions['column']} " + f'cannot be applied to column {column} because it no longer ' + 'exists after preprocessing.' + ) + self._num_rows = len(processed_data) numerical_distributions = deepcopy(self._numerical_distributions) @@ -128,6 +138,7 @@ def _fit(self, processed_data): if column not in numerical_distributions: numerical_distributions[column] = self._numerical_distributions.get( column, self._default_distribution) + self._model = multivariate.GaussianMultivariate( distribution=numerical_distributions From a413db72b8cecb1db763877213f7149ee15c5af3 Mon Sep 17 00:00:00 2001 From: Felipe Date: Sun, 5 Feb 2023 19:57:10 -0800 Subject: [PATCH 3/7] Fix last dict->set --- sdv/single_table/copulagan.py | 7 ++++--- sdv/single_table/copulas.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sdv/single_table/copulagan.py b/sdv/single_table/copulagan.py index 00e4a423b..3237462b4 100644 --- a/sdv/single_table/copulagan.py +++ b/sdv/single_table/copulagan.py @@ -1,6 +1,6 @@ """Combination of GaussianCopula transformation and GANs.""" -from copy import deepcopy import logging +from copy import deepcopy import rdt @@ -11,6 +11,7 @@ LOGGER = logging.getLogger(__name__) + class CopulaGANSynthesizer(CTGANSynthesizer): """Combination of GaussianCopula transformation and GANs. @@ -195,10 +196,10 @@ def _fit(self, processed_data): processed_data (pandas.DataFrame): Data to be learned. """ - unseen_columns = self._numerical_distributions.keys() - dict(processed_data.columns) + unseen_columns = self._numerical_distributions.keys() - set(processed_data.columns) for column in unseen_columns: LOGGER.info( - f"Requested distribution {self._numerical_distributions['column']} " + f"Requested distribution {self.numerical_distributions['column']} " f'cannot be applied to column {column} because it no longer ' 'exists after preprocessing.' ) diff --git a/sdv/single_table/copulas.py b/sdv/single_table/copulas.py index bb339aae4..f1b282950 100644 --- a/sdv/single_table/copulas.py +++ b/sdv/single_table/copulas.py @@ -17,6 +17,7 @@ LOGGER = logging.getLogger(__name__) + class GaussianCopulaSynthesizer(BaseSingleTableSynthesizer): """Model wrapping ``copulas.multivariate.GaussianMultivariate`` copula. @@ -126,7 +127,7 @@ def _fit(self, processed_data): unseen_columns = self._numerical_distributions.keys() - set(processed_data.columns) for column in unseen_columns: LOGGER.info( - f"Requested distribution {numerical_distributions['column']} " + f"Requested distribution {self.numerical_distributions['column']} " f'cannot be applied to column {column} because it no longer ' 'exists after preprocessing.' ) @@ -138,7 +139,6 @@ def _fit(self, processed_data): if column not in numerical_distributions: numerical_distributions[column] = self._numerical_distributions.get( column, self._default_distribution) - self._model = multivariate.GaussianMultivariate( distribution=numerical_distributions From 73892bd693bdba2137eaac65fa7d88299ff2590c Mon Sep 17 00:00:00 2001 From: Felipe Date: Mon, 6 Feb 2023 06:53:18 -0800 Subject: [PATCH 4/7] Add bug fix --- tests/unit/single_table/test_copulagan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/single_table/test_copulagan.py b/tests/unit/single_table/test_copulagan.py index f3301c248..8741eecb9 100644 --- a/tests/unit/single_table/test_copulagan.py +++ b/tests/unit/single_table/test_copulagan.py @@ -248,7 +248,7 @@ def test__fit(self, mock_rdt, mock_ctgansynthesizer__fit): metadata = SingleTableMetadata() instance = CopulaGANSynthesizer(metadata) instance._create_gaussian_normalizer_config = Mock() - processed_data = Mock() + processed_data = pd.DataFrame() # Run instance._fit(processed_data) From 2fa55de00fea01c2719b2c901df1b4d84f3c9bf6 Mon Sep 17 00:00:00 2001 From: Felipe Date: Mon, 6 Feb 2023 07:21:51 -0800 Subject: [PATCH 5/7] Fix bugs + add tests --- sdv/single_table/copulagan.py | 4 ++-- sdv/single_table/copulas.py | 4 ++-- tests/unit/single_table/test_copulagan.py | 25 +++++++++++++++++++++++ tests/unit/single_table/test_copulas.py | 24 ++++++++++++++++++++++ 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/sdv/single_table/copulagan.py b/sdv/single_table/copulagan.py index 3237462b4..3ecab426d 100644 --- a/sdv/single_table/copulagan.py +++ b/sdv/single_table/copulagan.py @@ -199,8 +199,8 @@ def _fit(self, processed_data): unseen_columns = self._numerical_distributions.keys() - set(processed_data.columns) for column in unseen_columns: LOGGER.info( - f"Requested distribution {self.numerical_distributions['column']} " - f'cannot be applied to column {column} because it no longer ' + f"Requested distribution '{self.numerical_distributions[column]}' " + f"cannot be applied to column '{column}' because it no longer " 'exists after preprocessing.' ) diff --git a/sdv/single_table/copulas.py b/sdv/single_table/copulas.py index f1b282950..b1b0f804b 100644 --- a/sdv/single_table/copulas.py +++ b/sdv/single_table/copulas.py @@ -127,8 +127,8 @@ def _fit(self, processed_data): unseen_columns = self._numerical_distributions.keys() - set(processed_data.columns) for column in unseen_columns: LOGGER.info( - f"Requested distribution {self.numerical_distributions['column']} " - f'cannot be applied to column {column} because it no longer ' + f"Requested distribution '{self.numerical_distributions[column]}' " + f"cannot be applied to column '{column}' because it no longer " 'exists after preprocessing.' ) diff --git a/tests/unit/single_table/test_copulagan.py b/tests/unit/single_table/test_copulagan.py index 8741eecb9..e3df9d3f4 100644 --- a/tests/unit/single_table/test_copulagan.py +++ b/tests/unit/single_table/test_copulagan.py @@ -235,6 +235,31 @@ def test__create_gaussian_normalizer_config(self, mock_rdt): assert config == expected_config assert mock_rdt.transformers.GaussianNormalizer.call_args_list == expected_calls + @patch('sdv.single_table.copulagan.LOGGER') + @patch('sdv.single_table.copulagan.CTGANSynthesizer._fit') + @patch('sdv.single_table.copulagan.rdt') + def test__fit_logging(self, mock_rdt, mock_ctgansynthesizer__fit, mock_logger): + """Test a message is logged. + + A message should be logged if the columns passed in ``numerical_distributions`` + were renamed/dropped during preprocessing. + """ + # Setup + metadata = SingleTableMetadata() + metadata.add_column('col', sdtype='numerical') + numerical_distributions = {'col': 'gamma'} + instance = CopulaGANSynthesizer(metadata, numerical_distributions=numerical_distributions) + processed_data = pd.DataFrame() + + # Run + instance._fit(processed_data) + + # Assert + mock_logger.info.assert_called_once_with( + "Requested distribution 'gamma' cannot be applied to column 'col' " + 'because it no longer exists after preprocessing.' + ) + @patch('sdv.single_table.copulagan.CTGANSynthesizer._fit') @patch('sdv.single_table.copulagan.rdt') def test__fit(self, mock_rdt, mock_ctgansynthesizer__fit): diff --git a/tests/unit/single_table/test_copulas.py b/tests/unit/single_table/test_copulas.py index 61fcf7b4e..713a04317 100644 --- a/tests/unit/single_table/test_copulas.py +++ b/tests/unit/single_table/test_copulas.py @@ -131,6 +131,30 @@ def test_get_parameters(self): 'default_distribution': 'beta' } + @patch('sdv.single_table.copulas.LOGGER') + def test__fit_logging(self, mock_logger): + """Test a message is logged. + + A message should be logged if the columns passed in ``numerical_distributions`` + were renamed/dropped during preprocessing. + """ + # Setup + metadata = SingleTableMetadata() + metadata.add_column('col', sdtype='numerical') + numerical_distributions = {'col': 'gamma'} + instance = GaussianCopulaSynthesizer( + metadata, numerical_distributions=numerical_distributions) + processed_data = pd.DataFrame({'updated_col': [1, 2, 3]}) + + # Run + instance._fit(processed_data) + + # Assert + mock_logger.info.assert_called_once_with( + "Requested distribution 'gamma' cannot be applied to column 'col' " + 'because it no longer exists after preprocessing.' + ) + @patch('sdv.single_table.copulas.warnings') @patch('sdv.single_table.copulas.multivariate') def test__fit(self, mock_multivariate, mock_warnings): From ea3c13addce82f9b9c6a7e80e46adc6e4c6354a8 Mon Sep 17 00:00:00 2001 From: Felipe Date: Tue, 7 Feb 2023 09:12:36 -0800 Subject: [PATCH 6/7] Remove _validates added during rebase --- sdv/single_table/copulagan.py | 14 -------------- sdv/single_table/copulas.py | 15 +-------------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/sdv/single_table/copulagan.py b/sdv/single_table/copulagan.py index 3ecab426d..fa07f86b9 100644 --- a/sdv/single_table/copulagan.py +++ b/sdv/single_table/copulagan.py @@ -4,7 +4,6 @@ import rdt -from sdv.errors import SynthesizerInputError from sdv.single_table.copulas import GaussianCopulaSynthesizer from sdv.single_table.ctgan import CTGANSynthesizer from sdv.single_table.utils import validate_numerical_distributions @@ -113,19 +112,6 @@ class CopulaGANSynthesizer(CTGANSynthesizer): _gaussian_normalizer_hyper_transformer = None - def _validate_numerical_distributions(self, numerical_distributions): - if numerical_distributions: - if not isinstance(numerical_distributions, dict): - raise TypeError('numerical_distributions can only be None or a dict instance.') - - invalid_columns = numerical_distributions.keys() - set(self.metadata._columns) - if invalid_columns: - raise SynthesizerInputError( - 'Invalid column names found in the numerical_distributions dictionary ' - f'{invalid_columns}. The column names you provide must be present ' - 'in the metadata.' - ) - def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256), generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4, diff --git a/sdv/single_table/copulas.py b/sdv/single_table/copulas.py index b1b0f804b..6131c41d4 100644 --- a/sdv/single_table/copulas.py +++ b/sdv/single_table/copulas.py @@ -11,7 +11,7 @@ from copulas import multivariate from rdt.transformers import OneHotEncoder -from sdv.errors import NonParametricError, SynthesizerInputError +from sdv.errors import NonParametricError from sdv.single_table.base import BaseSingleTableSynthesizer from sdv.single_table.utils import flatten_dict, unflatten_dict, validate_numerical_distributions @@ -86,19 +86,6 @@ def get_distribution_class(cls, distribution): return cls._DISTRIBUTIONS[distribution] - def _validate_numerical_distributions(self, numerical_distributions): - if numerical_distributions: - if not isinstance(numerical_distributions, dict): - raise TypeError('numerical_distributions can only be None or a dict instance.') - - invalid_columns = numerical_distributions.keys() - set(self.metadata._columns) - if invalid_columns: - raise SynthesizerInputError( - 'Invalid column names found in the numerical_distributions dictionary ' - f'{invalid_columns}. The column names you provide must be present ' - 'in the metadata.' - ) - def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True, numerical_distributions=None, default_distribution=None): super().__init__( From 7ac90054bd0b5bee9cefa2b402474ba700c233f3 Mon Sep 17 00:00:00 2001 From: Felipe Date: Wed, 8 Feb 2023 07:02:40 -0800 Subject: [PATCH 7/7] Move logger to utils --- sdv/single_table/copulagan.py | 13 ++++--------- sdv/single_table/copulas.py | 16 ++++++---------- sdv/single_table/utils.py | 11 +++++++++++ 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/sdv/single_table/copulagan.py b/sdv/single_table/copulagan.py index fa07f86b9..f98aec237 100644 --- a/sdv/single_table/copulagan.py +++ b/sdv/single_table/copulagan.py @@ -6,7 +6,8 @@ from sdv.single_table.copulas import GaussianCopulaSynthesizer from sdv.single_table.ctgan import CTGANSynthesizer -from sdv.single_table.utils import validate_numerical_distributions +from sdv.single_table.utils import ( + log_numerical_distributions_error, validate_numerical_distributions) LOGGER = logging.getLogger(__name__) @@ -182,16 +183,10 @@ def _fit(self, processed_data): processed_data (pandas.DataFrame): Data to be learned. """ - unseen_columns = self._numerical_distributions.keys() - set(processed_data.columns) - for column in unseen_columns: - LOGGER.info( - f"Requested distribution '{self.numerical_distributions[column]}' " - f"cannot be applied to column '{column}' because it no longer " - 'exists after preprocessing.' - ) + log_numerical_distributions_error( + self.numerical_distributions, processed_data.columns, LOGGER) gaussian_normalizer_config = self._create_gaussian_normalizer_config(processed_data) - self._gaussian_normalizer_hyper_transformer = rdt.HyperTransformer() self._gaussian_normalizer_hyper_transformer.set_config(gaussian_normalizer_config) processed_data = self._gaussian_normalizer_hyper_transformer.fit_transform(processed_data) diff --git a/sdv/single_table/copulas.py b/sdv/single_table/copulas.py index 6131c41d4..f7d1aa4ad 100644 --- a/sdv/single_table/copulas.py +++ b/sdv/single_table/copulas.py @@ -13,7 +13,9 @@ from sdv.errors import NonParametricError from sdv.single_table.base import BaseSingleTableSynthesizer -from sdv.single_table.utils import flatten_dict, unflatten_dict, validate_numerical_distributions +from sdv.single_table.utils import ( + flatten_dict, log_numerical_distributions_error, unflatten_dict, + validate_numerical_distributions) LOGGER = logging.getLogger(__name__) @@ -111,17 +113,11 @@ def _fit(self, processed_data): processed_data (pandas.DataFrame): Data to be learned. """ - unseen_columns = self._numerical_distributions.keys() - set(processed_data.columns) - for column in unseen_columns: - LOGGER.info( - f"Requested distribution '{self.numerical_distributions[column]}' " - f"cannot be applied to column '{column}' because it no longer " - 'exists after preprocessing.' - ) - + log_numerical_distributions_error( + self.numerical_distributions, processed_data.columns, LOGGER) self._num_rows = len(processed_data) - numerical_distributions = deepcopy(self._numerical_distributions) + numerical_distributions = deepcopy(self._numerical_distributions) for column in processed_data.columns: if column not in numerical_distributions: numerical_distributions[column] = self._numerical_distributions.get( diff --git a/sdv/single_table/utils.py b/sdv/single_table/utils.py index 144eb3a68..081dca685 100644 --- a/sdv/single_table/utils.py +++ b/sdv/single_table/utils.py @@ -308,3 +308,14 @@ def validate_numerical_distributions(numerical_distributions, metadata_columns): f'{invalid_columns}. The column names you provide must be present ' 'in the metadata.' ) + + +def log_numerical_distributions_error(numerical_distributions, processed_data_columns, logger): + """Log error when numerical distributions columns don't exist anymore.""" + unseen_columns = numerical_distributions.keys() - set(processed_data_columns) + for column in unseen_columns: + logger.info( + f"Requested distribution '{numerical_distributions[column]}' " + f"cannot be applied to column '{column}' because it no longer " + 'exists after preprocessing.' + )