diff --git a/sdv/modeler.py b/sdv/modeler.py index 8e5145356..3fe74a434 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -6,6 +6,7 @@ from copulas import get_qualified_name from copulas.multivariate import GaussianMultivariate, TreeTypes from copulas.univariate import GaussianUnivariate +from rdt.transformers.positive_number import PositiveNumberTransformer # Configure logger logger = logging.getLogger(__name__) @@ -162,6 +163,15 @@ def flatten_model(self, model, name=''): values.append(row[:index + 1]) model.covariance = np.array(values) + if self.model_kwargs['distribution'] == get_qualified_name(DEFAULT_DISTRIBUTION): + transformer = PositiveNumberTransformer({ + 'name': 'field', + 'type': 'number' + }) + + for distribution in model.distribs.values(): + column = pd.DataFrame({'field': [distribution.std]}) + distribution.std = transformer.reverse_transform(column).loc[0, 'field'] return pd.Series(self._flatten_dict(model.to_dict(), name)) diff --git a/sdv/sampler.py b/sdv/sampler.py index 358d6e06b..b57f07c41 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -10,6 +10,18 @@ GAUSSIAN_COPULA = 'copulas.multivariate.gaussian.GaussianMultivariate' +MODEL_ERROR_MESSAGES = { + True: ( + 'There was an error recreating models from parameters. ' + 'Sampling could not continue.' + ), + False: ( + 'Modeler hasn\'t been fitted. ' + 'Please call Modeler.model_database() before sampling' + ) +} + + class Sampler: """Class to sample data from a model.""" @@ -51,6 +63,22 @@ def _square_matrix(triangular_matrix): return triangular_matrix + def _get_table_meta(self, metadata, table_name): + """Return the metadata of the table with the given name. + + Args: + metadata (dict): Metadata for dataset. + table_name (str): Name of table to get metadata from. + + Returns: + dict or None: Metadata for the given table; None if no table has that name. 
+ """ + for table in metadata['tables']: + if table['name'] == table_name: + return table + + return None + def _prepare_sampled_covariance(self, covariance): """ @@ -79,11 +107,11 @@ def reset_indices_tables(sampled_tables): return sampled_tables - def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): + def transform_synthesized_rows(self, synthesized, table_name, num_rows): """Add primary key and reverse transform synthetized data. Args: - synthesized_rows(pandas.DataFrame): Generated data from model + synthesized(pandas.DataFrame): Generated data from model table_name(str): Name of the table. num_rows(int): Number of rows sampled. @@ -114,25 +142,24 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): ' to generate {} samples.'.format(table_name, regex, num_rows) ) - synthesized_rows[primary_key] = pd.Series(values) + synthesized[primary_key] = pd.Series(values) if (node['type'] == 'number') and (node['subtype'] == 'integer'): - synthesized_rows[primary_key] = pd.to_numeric(synthesized_rows[primary_key]) - - sample_info = (primary_key, synthesized_rows) + synthesized[primary_key] = pd.to_numeric(synthesized[primary_key]) + sample_info = (primary_key, synthesized) self.sampled = self.update_mapping_list(self.sampled, table_name, sample_info) # filter out parameters labels = list(self.dn.tables[table_name].data) - synthesized_rows = self._fill_text_columns(synthesized_rows, labels, table_name) + text_filled = self._fill_text_columns(synthesized, labels, table_name) # reverse transform data - reversed_data = self.dn.ht.reverse_transform_table(synthesized_rows, orig_meta) + reversed_data = self.dn.ht.reverse_transform_table(text_filled, orig_meta) - synthesized_rows.update(reversed_data) - return synthesized_rows[labels] + synthesized.update(reversed_data) + return synthesized[labels] def _get_parent_row(self, table_name): parents = self.dn.get_parents(table_name) @@ -331,23 +358,19 @@ def 
_unflatten_gaussian_copula(self, model_parameters): 'fitted': True, 'type': distribution_name } + model_parameters['distribution'] = distribution_name distribs = model_parameters['distribs'] - if any([distribs[key]['std'] <= 0 for key in distribs]): - metadata = { - 'name': 'std', - 'type': 'number' - } - transformer = PositiveNumberTransformer(metadata) - - model_parameters['distribution'] = distribution_name - for key in distribs: - distribs[key].update(distribution_kwargs) + metadata = { + 'name': 'std', + 'type': 'number' + } + transformer = PositiveNumberTransformer(metadata) - distribution_std = distribs[key]['std'] - if distribution_std <= 0: - df = pd.DataFrame({'std': [distribution_std]}) - distribs[key]['std'] = transformer.fit_transform(df)['std'].values[0] + for distribution in distribs.values(): + distribution.update(distribution_kwargs) + df = pd.DataFrame({'std': [distribution['std']]}) + distribution['std'] = transformer.transform(df).loc[0, 'std'] covariance = model_parameters['covariance'] covariance = self._prepare_sampled_covariance(covariance) @@ -384,6 +407,75 @@ def unflatten_model(self, parent_row, table_name, parent_name): return self.modeler.model.from_dict(model_parameters) + def _get_missing_valid_rows(self, synthesized, drop_indices, valid_rows, num_rows): + """Collect the valid sampled rows and count how many are still missing. + + Args: + synthesized (pandas.DataFrame): Sampled rows to filter with `drop_indices`. + + Returns: + tuple[int, pandas.DataFrame]: Number of rows still missing and the valid rows so far. + """ + valid_rows = pd.concat([valid_rows, synthesized[~drop_indices].copy()]) + valid_rows = valid_rows.reset_index(drop=True) + + missing_rows = num_rows - valid_rows.shape[0] + + return missing_rows, valid_rows + + def _sample_valid_rows(self, model, num_rows, table_name): + """Sample using `model` and discard invalid values until having `num_rows`. + + Args: + model (copulas.multivariate.base): Fitted model. + num_rows (int): Number of rows to sample. + table_name (str): name of table to synthesize. 
+ + Returns: + pandas.DataFrame: Sampled rows, shape (num_rows, num_columns) + """ + + if model and model.fitted: + synthesized = model.sample(num_rows) + valid_rows = pd.DataFrame(columns=synthesized.columns) + drop_indices = pd.Series(False, index=synthesized.index) + + categorical_columns = [] + table_metadata = self._get_table_meta(self.dn.meta, table_name) + + for field in table_metadata['fields']: + if field['type'] == 'categorical': + column_name = field['name'] + categorical_columns.append(column_name) + column = synthesized[column_name] + filtered_values = ((column < 0) | (column > 1)) + + if filtered_values.any(): + drop_indices |= filtered_values + + missing_rows, valid_rows = self._get_missing_valid_rows( + synthesized, drop_indices, valid_rows, num_rows) + + while missing_rows: + synthesized = model.sample(missing_rows) + drop_indices = pd.Series(False, index=synthesized.index) + + for column_name in categorical_columns: + column = synthesized[column_name] + filtered_values = ((column < 0) | (column > 1)) + + if filtered_values.any(): + drop_indices |= filtered_values + + missing_rows, valid_rows = self._get_missing_valid_rows( + synthesized, drop_indices, valid_rows, num_rows) + + return valid_rows + + else: + parents = bool(self.dn.get_parents(table_name)) + raise ValueError(MODEL_ERROR_MESSAGES[parents]) + def sample_rows(self, table_name, num_rows): """Sample specified number of rows for specified table. @@ -406,14 +498,7 @@ def sample_rows(self, table_name, num_rows): # get parameters from parent to make model model = self.unflatten_model(parent_row, table_name, random_parent) - # sample from that model - if model is not None and model.fitted: - synthesized_rows = model.sample(num_rows) - else: - raise ValueError( - 'There was an error recreating models from parameters. ' - 'Sampling could not continue.' 
- ) + synthesized_rows = self._sample_valid_rows(model, num_rows, table_name) # add foreign key value to row fk_val = parent_row.loc[0, fk] @@ -426,13 +511,7 @@ def sample_rows(self, table_name, num_rows): else: # there is no parent model = self.modeler.models[table_name] - if model.fitted: - synthesized_rows = model.sample(num_rows) - else: - raise ValueError( - 'Modeler hasn\'t been fitted. ' - 'Please call Modeler.model_database() before sampling' - ) + synthesized_rows = self._sample_valid_rows(model, num_rows, table_name) return self.transform_synthesized_rows(synthesized_rows, table_name, num_rows) @@ -506,14 +585,6 @@ def sample_all(self, num_rows=5): return self.reset_indices_tables(sampled_data) - def _get_table_meta(self, meta, table_name): - """Return metadata get table meta for a given table name""" - for table in meta['tables']: - if table['name'] == table_name: - return table - - return None - def _fill_text_columns(self, row, labels, table_name): """Fill in the column values for every non numeric column that isn't the primary key. 
diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index cedcba425..9d04eabf4 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -36,9 +36,9 @@ def test__create_extension(self): 'covariance__1__0': 0.0, 'covariance__1__1': 1.4999999999999991, 'distribs__a__mean': 0.0, - 'distribs__a__std': 0.001, + 'distribs__a__std': -6.907755278982137, 'distribs__b__mean': 3.0, - 'distribs__b__std': 1.632993161855452 + 'distribs__b__std': 0.4904146265058631 }) # Run @@ -156,11 +156,11 @@ def test_flatten_model(self): 'covariance__2__1': -0.7500000000000003, 'covariance__2__2': 1.5000000000000007, 'distribs__0__mean': 0.33333333333333331, - 'distribs__0__std': 0.47140452079103168, + 'distribs__0__std': -0.7520386983881371, 'distribs__1__mean': 0.33333333333333331, - 'distribs__1__std': 0.47140452079103168, + 'distribs__1__std': -0.7520386983881371, 'distribs__2__mean': 0.33333333333333331, - 'distribs__2__std': 0.47140452079103168 + 'distribs__2__std': -0.7520386983881371 }) data_navigator = mock.MagicMock() modeler = Modeler(data_navigator) diff --git a/tests/sdv/test_sampler.py b/tests/sdv/test_sampler.py index b50fbbad5..1fefb881c 100644 --- a/tests/sdv/test_sampler.py +++ b/tests/sdv/test_sampler.py @@ -4,8 +4,8 @@ import numpy as np import pandas as pd -from sdv.data_navigator import CSVDataLoader -from sdv.modeler import Modeler +from sdv.data_navigator import CSVDataLoader, DataNavigator, Table +from sdv.modeler import GaussianMultivariate, Modeler from sdv.sampler import Sampler @@ -48,6 +48,86 @@ def test__square_matrix(self): # Check assert result == expected_result + @patch('sdv.sampler.Sampler._fill_text_columns', autospec=True) + @patch('sdv.sampler.Sampler.update_mapping_list') + @patch('sdv.sampler.Sampler._get_table_meta', autospec=True) + def test_transform_synthesized_rows_no_pk( + self, get_table_meta_mock, update_mock, fill_mock): + + """transform_synthesized_rows will update internal state and reverse transform rows.""" + 
# Setup - Class Instantiation + data_navigator = MagicMock() + modeler = MagicMock() + sampler = Sampler(data_navigator, modeler) + + # Setup - Mock configuration + table_metadata = { + 'fields': { + 'column_A': { + 'type': 'number', + 'subtype': 'integer' + }, + 'column_B': { + 'name': 'column', + 'type': 'number' + } + }, + 'primary_key': None + } + table_data = pd.DataFrame(columns=['column_A', 'column_B']) + test_table = Table(table_data, table_metadata) + data_navigator.tables = { + 'table': test_table + } + + data_navigator.ht.reverse_transform_table.return_value = pd.DataFrame({ + 'column_A': ['some', 'transformed values'], + 'column_B': ['another', 'transformed column'] + }) + + get_table_meta_mock.return_value = { + 'original': 'meta', + 'fields': [] + } + + fill_mock.return_value = pd.DataFrame({ + 'column_A': ['filled', 'text_values'], + 'column_B': ['nothing', 'numerical'] + }) + + # Setup - Method arguments / expected result + synthesized_rows = pd.DataFrame({ + 'column_A': [1.7, 2.5], + 'column_B': [4.7, 5.1], + 'model_parameters': ['some', 'parameters'] + }) + table_name = 'table' + num_rows = 2 + + expected_result = pd.DataFrame({ + 'column_A': ['some', 'transformed values'], + 'column_B': ['another', 'transformed column'] + }) + + # Run + result = sampler.transform_synthesized_rows(synthesized_rows, table_name, num_rows) + + # Check - Result + assert result.equals(expected_result) + + # Check - Class internal state + assert sampler.sampled == update_mock.return_value + + # Check - Mock calls + get_table_meta_mock.assert_called_once_with(sampler, data_navigator.meta, 'table') + update_mock.assert_called_once_with({}, 'table', (None, synthesized_rows)) + fill_mock.assert_called_once_with( + sampler, synthesized_rows, ['column_A', 'column_B'], 'table') + + data_navigator.ht.reverse_transform_table.assert_called_once_with( + fill_mock.return_value, get_table_meta_mock.return_value + ) + def test__prepare_sampled_covariance(self): """ """ # Setup @@ 
-141,7 +221,7 @@ def fake_dataframe(name, number): reset_mock.assert_called_once_with({'TABLE_A': 'concatenated_dataframe'}) def test__unflatten_dict(self): - """ """ + """unflatten_dict restructure flatten dicts.""" # Setup data_navigator = MagicMock() modeler = MagicMock() @@ -171,7 +251,7 @@ def test__unflatten_dict(self): modeler.assert_not_called() def test__unflatten_dict_mixed_array(self): - """unflatten_dict restruicture arrays""" + """unflatten_dict restructure arrays.""" # Setup data_navigator = MagicMock() modeler = MagicMock() @@ -297,11 +377,11 @@ def test__unflatten_gaussian_copula(self): 'distribs': { 0: { 'first': 'distribution', - 'std': 1 + 'std': 0 }, 1: { 'second': 'distribution', - 'std': 1 + 'std': 0 } } } @@ -395,3 +475,93 @@ def test__unflatten_gaussian_copula_negative_std(self): data_navigator.assert_not_called() modeler.assert_not_called() + + def test__sample_valid_rows_respect_categorical_values(self): + """_sample_valid_rows will return rows with valid values for categorical columns.""" + # Setup + data_navigator = MagicMock(spec=DataNavigator) + modeler = MagicMock(spec=Modeler) + sampler = Sampler(data_navigator, modeler) + + data_navigator.meta = { + 'tables': [ + { + 'name': 'table_name', + 'fields': [ + { + 'name': 'field_A', + 'type': 'categorical' + }, + { + 'name': 'field_B', + 'type': 'categorical' + } + ] + } + ] + } + + num_rows = 5 + table_name = 'table_name' + model = MagicMock(spec=GaussianMultivariate) + model.fitted = True + sample_dataframe = pd.DataFrame([ + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 1.5}, # Invalid field_B + {'field_A': 1.5, 'field_B': 0.5}, # Invalid field_A + ]) + + model.sample.side_effect = lambda x: sample_dataframe.iloc[:x].copy() + + expected_model_call_args_list = [ + ((5,), {}), + ((2,), {}) + ] + + expected_result = pd.DataFrame([ + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 
0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + ]) + + # Run + result = sampler._sample_valid_rows(model, num_rows, table_name) + + # Check + assert result.equals(expected_result) + + modeler.assert_not_called() + assert len(modeler.method_calls) == 0 + + data_navigator.assert_not_called() + assert len(data_navigator.method_calls) == 0 + + assert model.sample.call_args_list == expected_model_call_args_list + + def test__sample_valid_rows_raises_unfitted_model(self): + """_sample_valid_rows raise an exception for invalid models.""" + # Setup + data_navigator = MagicMock(spec=DataNavigator) + modeler = MagicMock(spec=Modeler) + sampler = Sampler(data_navigator, modeler) + + data_navigator.get_parents.return_value = set() + + num_rows = 5 + table_name = 'table_name' + model = None + + # Run + with self.assertRaises(ValueError): + sampler._sample_valid_rows(model, num_rows, table_name) + + # Check + modeler.assert_not_called() + assert len(modeler.method_calls) == 0 + + data_navigator.assert_not_called() + data_navigator.get_parents.assert_called_once_with('table_name')