From d326d00f599862fd100c734d448ee598f840dc08 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Mon, 4 Feb 2019 20:55:04 +0100 Subject: [PATCH 1/6] Scale sampled values for categorical columns --- sdv/sampler.py | 25 ++++++++++++++- tests/sdv/test_sampler.py | 64 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/sdv/sampler.py b/sdv/sampler.py index 32124fa24..5aa82dcb0 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -2,6 +2,7 @@ import pandas as pd from copulas import get_qualified_name +from sklearn.preprocessing import MinMaxScaler import exrex @@ -13,7 +14,6 @@ def __init__(self, data_navigator, modeler): """Instantiate a new object.""" self.dn = data_navigator self.modeler = modeler - self.been_sampled = set() # table_name -> if already sampled self.sampled = {} # table_name -> [(primary_key, generated_row)] self.primary_key = {} @@ -32,6 +32,20 @@ def reset_indices_tables(sampled_tables): return sampled_tables + @staticmethod + def _rescale_values(column): + """Scale an array of values in the interval [0, 1]. + + Args: + column (pandas.Series): Column values to scale + + Return: + pandas.Series: Column values scaled into [0,1] + """ + scaler = MinMaxScaler() + scaled = scaler.fit_transform(column.values.reshape(-1, 1)).ravel() + return pd.Series(scaled, name=column.name) + def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): """Add primary key and reverse transform synthetized data. @@ -47,6 +61,15 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): meta = self.dn.tables[table_name].meta orig_meta = self._get_table_meta(self.dn.meta, table_name) primary_key = meta.get('primary_key') + categorical_fields = [ + field['name'] for field in orig_meta['fields'] + if field['type'] == 'categorical' + ] + + if categorical_fields: + for field in categorical_fields: + if ((synthesized_rows[field] < 0) | (synthesized_rows[field] > 1)).any(): + synthesized_rows[field] = self._rescale_values(synthesized_rows[field]) if primary_key: node = meta['fields'][primary_key] diff --git a/tests/sdv/test_sampler.py b/tests/sdv/test_sampler.py index 9776046d4..3ec1b1a4b 100644 --- a/tests/sdv/test_sampler.py +++ b/tests/sdv/test_sampler.py @@ -1,6 +1,9 @@ from unittest import TestCase +from unittest.mock import MagicMock, patch -from sdv.data_navigator import CSVDataLoader +import pandas as pd + +from sdv.data_navigator import CSVDataLoader, Table from sdv.modeler import Modeler from sdv.sampler import Sampler @@ -19,6 +22,65 @@ def setUpClass(cls): def setUp(self): self.sampler = Sampler(self.data_navigator, self.modeler) + def test__rescale_values(self): + """_rescale_values return and array satisfying 0 < array < 1.""" + # Setup + data_navigator = MagicMock() + modeler = MagicMock() + sampler = Sampler(data_navigator, modeler) + + column = pd.Series([0.0, 5.0, 10], name='column') + expected_result = pd.Series([0.0, 0.5, 1.0], name='column') + + # Run + result = sampler._rescale_values(column) + + # Check + assert (result == expected_result).all().all() + assert len(data_navigator.call_args_list) == 0 + assert len(modeler.call_args_list) == 0 + + @patch('sdv.sampler.Sampler._get_table_meta') + def test_transform_synthesized_rows(self, get_table_meta_mock): + """t_s_r will add the primary key and reverse transform rows.""" + # Setup + data_navigator = MagicMock() + + table_metadata = { + 'fields': { + 'id': { + 'regex': '[0-9]{5}', + 'type': 'number', + 'subtype': 'integer' + + } + }, + 
'primary_key': 'id', + } + table_data = pd.DataFrame() + test_table = Table(table_data, table_metadata) + data_navigator.tables = { + 'table': test_table + } + modeler = MagicMock() + sampler = Sampler(data_navigator, modeler) + + synthesized_rows = pd.DataFrame({ + + }) + table_name = 'table' + num_rows = 2 + + expected_result = pd.DataFrame({ + + }) + + # Run + result = sampler.transform_synthesized_rows(synthesized_rows, table_name, num_rows) + + # Check + assert result.equals(expected_result) + def test_sample_rows_parent_table(self): """sample_rows samples new rows for the given table.""" # Setup From 1c170ee6664ea0322c131a163d0539b81c038a4f Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Tue, 5 Feb 2019 14:20:02 +0100 Subject: [PATCH 2/6] Add unittest for transform_synthesized_rows --- sdv/sampler.py | 23 +++--- tests/sdv/test_sampler.py | 149 ++++++++++++++++++++++++++++++++++---- 2 files changed, 146 insertions(+), 26 deletions(-) diff --git a/sdv/sampler.py b/sdv/sampler.py index 68f167e73..0ca6d4a51 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -48,11 +48,11 @@ def _rescale_values(column): scaled = scaler.fit_transform(column.values.reshape(-1, 1)).ravel() return pd.Series(scaled, name=column.name) - def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): + def transform_synthesized_rows(self, synthesized, table_name, num_rows): """Add primary key and reverse transform synthetized data. Args: - synthesized_rows(pandas.DataFrame): Generated data from model + synthesized(pandas.DataFrame): Generated data from model table_name(str): Name of the table. num_rows(int): Number of rows sampled. @@ -70,8 +70,8 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): if categorical_fields: for field in categorical_fields: - if ((synthesized_rows[field] < 0) | (synthesized_rows[field] > 1)).any(): - synthesized_rows[field] = self._rescale_values(synthesized_rows[field]) + if ((synthesized[field] < 0) | (synthesized[field] > 1)).any(): + synthesized[field] = self._rescale_values(synthesized[field]) if primary_key: node = meta['fields'][primary_key] @@ -92,25 +92,24 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): ' to generate {} samples.'.format(table_name, regex, num_rows) ) - synthesized_rows[primary_key] = pd.Series(values) + synthesized[primary_key] = pd.Series(values) if (node['type'] == 'number') and (node['subtype'] == 'integer'): - synthesized_rows[primary_key] = pd.to_numeric(synthesized_rows[primary_key]) - - sample_info = (primary_key, synthesized_rows) + synthesized[primary_key] = pd.to_numeric(synthesized[primary_key]) + sample_info = (primary_key, synthesized) self.sampled = self.update_mapping_list(self.sampled, table_name, sample_info) # filter out parameters labels = list(self.dn.tables[table_name].data) - synthesized_rows = self._fill_text_columns(synthesized_rows, labels, table_name) + text_filled = self._fill_text_columns(synthesized, labels, table_name) # reverse transform data - reversed_data = self.dn.ht.reverse_transform_table(synthesized_rows, orig_meta) + reversed_data = self.dn.ht.reverse_transform_table(text_filled, orig_meta) - synthesized_rows.update(reversed_data) - return synthesized_rows[labels] + synthesized.update(reversed_data) + return synthesized[labels] def _get_parent_row(self, table_name): parents = self.dn.get_parents(table_name) diff --git a/tests/sdv/test_sampler.py b/tests/sdv/test_sampler.py index b20791eac..48ecfbc8a 100644 --- a/tests/sdv/test_sampler.py 
+++ b/tests/sdv/test_sampler.py @@ -41,47 +41,168 @@ def test__rescale_values(self): assert len(data_navigator.call_args_list) == 0 assert len(modeler.call_args_list) == 0 - @patch('sdv.sampler.Sampler._get_table_meta') - def test_transform_synthesized_rows(self, get_table_meta_mock): - """t_s_r will add the primary key and reverse transform rows.""" - # Setup + @patch('sdv.sampler.Sampler._fill_text_columns', autospec=True) + @patch('sdv.sampler.Sampler.update_mapping_list', autospec=True) + @patch('sdv.sampler.Sampler._get_table_meta', autospec=True) + def test_transform_synthesized_rows_no_pk_no_categorical( + self, get_table_meta_mock, update_mock, fill_mock): + + """transform_synthesized_rows will update internal state and reverse transform rows.""" + # Setup - Class Instantiation data_navigator = MagicMock() + modeler = MagicMock() + sampler = Sampler(data_navigator, modeler) + # Setup - Mock configuration table_metadata = { 'fields': { - 'id': { - 'regex': '[0-9]{5}', + 'column_A': { 'type': 'number', 'subtype': 'integer' - + }, + 'column_B': { + 'name': 'column', + 'type': 'number' } }, - 'primary_key': 'id', + 'primary_key': None } - table_data = pd.DataFrame() + table_data = pd.DataFrame(columns=['column_A', 'column_B']) test_table = Table(table_data, table_metadata) data_navigator.tables = { 'table': test_table } + + data_navigator.ht.reverse_transform_table.return_value = pd.DataFrame({ + 'column_A': ['some', 'transformed values'], + 'column_B': ['another', 'transformed column'] + }) + + get_table_meta_mock.return_value = { + 'original': 'meta', + 'fields': [] + } + + fill_mock.return_value = pd.DataFrame({ + 'column_A': ['filled', 'text_values'], + 'column_B': ['nothing', 'numerical'] + }) + + # Setup - Method arguments / expected result + synthesized_rows = pd.DataFrame({ + 'column_A': [1.7, 2.5], + 'column_B': [4.7, 5.1], + 'model_parameters': ['some', 'parameters'] + }) + table_name = 'table' + num_rows = 2 + + expected_result = pd.DataFrame({ + 'column_A': ['some', 'transformed values'], + 'column_B': ['another', 'transformed column'] + }) + + # Run + result = sampler.transform_synthesized_rows(synthesized_rows, table_name, num_rows) + + # Check - Result + assert result.equals(expected_result) + + # Check - Class internal state + assert sampler.sampled == update_mock.return_value + + # Check - Mock calls + get_table_meta_mock.assert_called_once_with(sampler, data_navigator.meta, 'table') + update_mock.assert_called_once_with(sampler, {}, 'table', (None, synthesized_rows)) + fill_mock.assert_called_once_with( + sampler, synthesized_rows, ['column_A', 'column_B'], 'table') + + data_navigator.ht.reverse_transform_table.assert_called_once_with( + fill_mock.return_value, get_table_meta_mock.return_value + ) + + @patch('sdv.sampler.Sampler._fill_text_columns', autospec=True) + @patch('sdv.sampler.Sampler.update_mapping_list', autospec=True) + @patch('sdv.sampler.Sampler._rescale_values', autospec=True) + @patch('sdv.sampler.Sampler._get_table_meta', autospec=True) + def test_transform_synthesized_rows_no_pk_but_categorical( + self, get_table_meta_mock, rescale_mock, update_mock, fill_mock): + + """transform_synthesized_rows will update internal state and reverse transform rows.""" + # Setup - Class Instantiation + data_navigator = MagicMock() modeler = MagicMock() sampler = Sampler(data_navigator, modeler) - synthesized_rows = pd.DataFrame({ + # Setup - Mock configuration + table_metadata = { + 'fields': { + 'column_A': { + 'type': 'categorical', + }, + 'column_B': { + 'name': 
'column', + 'type': 'number' + } + }, + 'primary_key': None + } + table_data = pd.DataFrame(columns=['column_A', 'column_B']) + test_table = Table(table_data, table_metadata) + data_navigator.tables = { + 'table': test_table + } + data_navigator.ht.reverse_transform_table.return_value = pd.DataFrame({ + 'column_A': ['some', 'transformed values'], + 'column_B': ['another', 'transformed column'] + }) + + get_table_meta_mock.return_value = { + 'original': 'meta', + 'fields': [] + } + + rescale_mock.side_effect = lambda x: pd.Series([0.1, 0.8], name=x.name) + + fill_mock.return_value = pd.DataFrame({ + 'column_A': ['filled', 'text_values'], + 'column_B': ['nothing', 'numerical'] + }) + + # Setup - Method arguments / expected result + synthesized_rows = pd.DataFrame({ + 'column_A': [1.7, 2.5], + 'column_B': [4.7, 5.1], + 'model_parameters': ['some', 'parameters'] }) table_name = 'table' num_rows = 2 expected_result = pd.DataFrame({ - + 'column_A': ['some', 'transformed values'], + 'column_B': ['another', 'transformed column'] }) # Run result = sampler.transform_synthesized_rows(synthesized_rows, table_name, num_rows) - # Check + # Check - Result assert result.equals(expected_result) + # Check - Class internal state + assert sampler.sampled == update_mock.return_value + + # Check - Mock calls + get_table_meta_mock.assert_called_once_with(sampler, data_navigator.meta, 'table') + update_mock.assert_called_once_with(sampler, {}, 'table', (None, synthesized_rows)) + fill_mock.assert_called_once_with( + sampler, synthesized_rows, ['column_A', 'column_B'], 'table') + + data_navigator.ht.reverse_transform_table.assert_called_once_with( + fill_mock.return_value, get_table_meta_mock.return_value + ) + def test_sample_rows_parent_table(self): """sample_rows samples new rows for the given table.""" # Setup @@ -151,7 +272,7 @@ def fake_dataframe(name, number): reset_mock.assert_called_once_with({'TABLE_A': 'concatenated_dataframe'}) def test_unflatten_dict(self): - """ """ + """unflatten_dict restructure flatten dicts.""" # Setup data_navigator = MagicMock() modeler = MagicMock() @@ -181,7 +302,7 @@ def test_unflatten_dict(self): modeler.assert_not_called() def test_unflatten_dict_mixed_array(self): - """unflatten_dict restruicture arrays""" + """unflatten_dict restructure arrays.""" # Setup data_navigator = MagicMock() modeler = MagicMock() From 5494f03f889962f404378fb93f374a6cdeb31738 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Mon, 11 Feb 2019 13:44:56 +0100 Subject: [PATCH 3/6] Refactor double loop into nested one --- sdv/sampler.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sdv/sampler.py b/sdv/sampler.py index 0ca6d4a51..54aabf800 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -63,15 +63,13 @@ def transform_synthesized_rows(self, synthesized, table_name, num_rows): meta = self.dn.tables[table_name].meta orig_meta = self._get_table_meta(self.dn.meta, table_name) primary_key = meta.get('primary_key') - categorical_fields = [ - field['name'] for field in orig_meta['fields'] - if field['type'] == 'categorical' - ] - - if categorical_fields: - for field in categorical_fields: - if ((synthesized[field] < 0) | (synthesized[field] > 1)).any(): - synthesized[field] = self._rescale_values(synthesized[field]) + + for field in orig_meta['fields']: + if field['type'] == 'categorical': + column_name = field['name'] + column = synthesized[column_name] + if ((column < 0) | (column > 1)).any(): + synthesized[column_name] = self._rescale_values(column) if 
primary_key: node = meta['fields'][primary_key] From 927e2b140ed5c123790bbf019fbaf9a8893234d2 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Tue, 12 Feb 2019 13:46:49 +0100 Subject: [PATCH 4/6] Enforce valid categorical without altering distributions --- sdv/sampler.py | 144 +++++++++++++++++++--------- tests/sdv/test_sampler.py | 196 ++++++++++++++++++-------------------- 2 files changed, 192 insertions(+), 148 deletions(-) diff --git a/sdv/sampler.py b/sdv/sampler.py index f9e7f782c..96028a487 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -4,13 +4,24 @@ import pandas as pd from copulas import get_qualified_name from rdt.transformers.positive_number import PositiveNumberTransformer -from sklearn.preprocessing import MinMaxScaler import exrex GAUSSIAN_COPULA = 'copulas.multivariate.gaussian.GaussianMultivariate' +MODEL_ERROR_MESSAGES = { + True: ( + 'There was an error recreating models from parameters. ' + 'Sampling could not continue.' + ), + False: ( + 'Modeler hasn\'t been fitted. ' + 'Please call Modeler.model_database() before sampling' + ) +} + + class Sampler: """Class to sample data from a model.""" @@ -52,6 +63,22 @@ def _square_matrix(triangular_matrix): return triangular_matrix + def _get_table_meta(self, metadata, table_name): + """Return metadata get table meta for a given table name. + + Args: + metadata (dict): Metadata for dataset. + table_name (str): Name of table to get metadata from. + + Returns: + dict: Metadata for given table. + """ + for table in metadata['tables']: + if table['name'] == table_name: + return table + + return None + def _prepare_sampled_covariance(self, covariance): """ @@ -80,20 +107,6 @@ def reset_indices_tables(sampled_tables): return sampled_tables - @staticmethod - def _rescale_values(column): - """Scale an array of values in the interval [0, 1]. - - Args: - column (pandas.Series): Column values to scale - - Return: - pandas.Series: Column values scaled into [0,1] - """ - scaler = MinMaxScaler() - scaled = scaler.fit_transform(column.values.reshape(-1, 1)).ravel() - return pd.Series(scaled, name=column.name) - def transform_synthesized_rows(self, synthesized, table_name, num_rows): """Add primary key and reverse transform synthetized data. @@ -110,13 +123,6 @@ def transform_synthesized_rows(self, synthesized, table_name, num_rows): orig_meta = self._get_table_meta(self.dn.meta, table_name) primary_key = meta.get('primary_key') - for field in orig_meta['fields']: - if field['type'] == 'categorical': - column_name = field['name'] - column = synthesized[column_name] - if ((column < 0) | (column > 1)).any(): - synthesized[column_name] = self._rescale_values(column) - if primary_key: node = meta['fields'][primary_key] regex = node['regex'] @@ -405,6 +411,75 @@ def unflatten_model(self, parent_row, table_name, parent_name): return self.modeler.model.from_dict(model_parameters) + def __get_missing_valid_rows(self, synthesized, drop_indices, valid_rows, num_rows): + """ + + Args: + synthesized (pandas.DataFrame) + + Returns: + tuple[int, pandas.DataFrame]: Amount of missing values and actual valid rows + """ + valid_rows = pd.concat([valid_rows, synthesized[~drop_indices].copy()]) + valid_rows = valid_rows.reset_index(drop=True) + + missing_rows = num_rows - valid_rows.shape[0] + + return missing_rows, valid_rows + + def _sample_valid_rows(self, model, num_rows, table_name): + """Sample using `model` and discard invalid values until having `num_rows`. + + Args: + model (copula.multivariate.base): Fitted model. 
+ num_rows (int): Number of rows to sample. + table_name (str): name of table to synthesize. + + Returns: + pandas.DataFrame: Sampled rows, shape (, num_rows) + """ + + if model and model.fitted: + synthesized = model.sample(num_rows) + valid_rows = pd.DataFrame(columns=synthesized.columns) + drop_indices = pd.Series(False, index=synthesized.index) + + categorical_columns = [] + table_metadata = self._get_table_meta(self.dn.meta, table_name) + + for field in table_metadata['fields']: + if field['type'] == 'categorical': + column_name = field['name'] + categorical_columns.append(column_name) + column = synthesized[column_name] + filtered_values = ((column < 0) | (column > 1)) + + if filtered_values.any(): + drop_indices |= filtered_values + + missing_rows, valid_rows = self.__get_missing_valid_rows( + synthesized, drop_indices, valid_rows, num_rows) + + while missing_rows: + synthesized = model.sample(missing_rows) + drop_indices = pd.Series(False, index=synthesized.index) + + for column_name in categorical_columns: + column = synthesized[column_name] + filtered_values = ((column < 0) | (column > 1)) + + if filtered_values.any(): + drop_indices |= filtered_values + + missing_rows, valid_rows = self.__get_missing_valid_rows( + synthesized, drop_indices, valid_rows, num_rows) + + return valid_rows + + else: + parents = bool(self.dn.get_parents(table_name)) + raise ValueError(MODEL_ERROR_MESSAGES[parents]) + def sample_rows(self, table_name, num_rows): """Sample specified number of rows for specified table. @@ -427,14 +502,7 @@ def sample_rows(self, table_name, num_rows): # get parameters from parent to make model model = self.unflatten_model(parent_row, table_name, random_parent) - # sample from that model - if model is not None and model.fitted: - synthesized_rows = model.sample(num_rows) - else: - raise ValueError( - 'There was an error recreating models from parameters. ' - 'Sampling could not continue.' - ) + synthesized_rows = self._sample_valid_rows(model, num_rows, table_name) # add foreign key value to row fk_val = parent_row.loc[0, fk] @@ -447,13 +515,7 @@ def sample_rows(self, table_name, num_rows): else: # there is no parent model = self.modeler.models[table_name] - if model.fitted: - synthesized_rows = model.sample(num_rows) - else: - raise ValueError( - 'Modeler hasn\'t been fitted. ' - 'Please call Modeler.model_database() before sampling' - ) + synthesized_rows = self._sample_valid_rows(model, num_rows, table_name) return self.transform_synthesized_rows(synthesized_rows, table_name, num_rows) @@ -527,14 +589,6 @@ def sample_all(self, num_rows=5): return self.reset_indices_tables(sampled_data) - def _get_table_meta(self, meta, table_name): - """Return metadata get table meta for a given table name""" - for table in meta['tables']: - if table['name'] == table_name: - return table - - return None - def _fill_text_columns(self, row, labels, table_name): """Fill in the column values for every non numeric column that isn't the primary key. 
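
Note on the sampler change in PATCH 4/6 above: `_sample_valid_rows` replaces the earlier MinMaxScaler rescaling with rejection sampling — rows whose categorical columns fall outside [0, 1] are dropped and only the missing count is resampled until `num_rows` valid rows are collected, so the modelled distribution is not distorted by rescaling. The following is a minimal standalone sketch of that loop, not code from the patch; `sample_valid` is an illustrative helper, and the only assumption is that `model.sample(n)` returns a DataFrame of n rows, as the copulas models used here do.

    import pandas as pd

    def sample_valid(model, num_rows, categorical_columns):
        """Illustrative only: resample until num_rows rows pass the [0, 1] check."""
        batches = []
        missing = num_rows
        while missing > 0:
            batch = model.sample(missing)
            # Flag rows where any categorical column landed outside [0, 1].
            drop = pd.Series(False, index=batch.index)
            for name in categorical_columns:
                drop |= (batch[name] < 0) | (batch[name] > 1)
            batches.append(batch[~drop])
            missing = num_rows - sum(len(part) for part in batches)
        return pd.concat(batches, ignore_index=True)

Like the patched method, this keeps looping until enough valid rows exist, so it implicitly assumes the model produces in-range categorical values with non-zero probability.
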
diff --git a/tests/sdv/test_sampler.py b/tests/sdv/test_sampler.py index 2aae75326..4146f6f7b 100644 --- a/tests/sdv/test_sampler.py +++ b/tests/sdv/test_sampler.py @@ -4,8 +4,8 @@ import numpy as np import pandas as pd -from sdv.data_navigator import CSVDataLoader, Table -from sdv.modeler import Modeler +from sdv.data_navigator import CSVDataLoader, DataNavigator, Table +from sdv.modeler import GaussianMultivariate, Modeler from sdv.sampler import Sampler @@ -48,28 +48,10 @@ def test__square_matrix(self): # Check assert result == expected_result - def test__rescale_values(self): - """_rescale_values return and array satisfying 0 < array < 1.""" - # Setup - data_navigator = MagicMock() - modeler = MagicMock() - sampler = Sampler(data_navigator, modeler) - - column = pd.Series([0.0, 5.0, 10], name='column') - expected_result = pd.Series([0.0, 0.5, 1.0], name='column') - - # Run - result = sampler._rescale_values(column) - - # Check - assert (result == expected_result).all().all() - assert len(data_navigator.call_args_list) == 0 - assert len(modeler.call_args_list) == 0 - @patch('sdv.sampler.Sampler._fill_text_columns', autospec=True) @patch('sdv.sampler.Sampler.update_mapping_list') @patch('sdv.sampler.Sampler._get_table_meta', autospec=True) - def test_transform_synthesized_rows_no_pk_no_categorical( + def test_transform_synthesized_rows_no_pk( self, get_table_meta_mock, update_mock, fill_mock): """transform_synthesized_rows will update internal state and reverse transform rows.""" @@ -146,88 +128,6 @@ def test_transform_synthesized_rows_no_pk_no_categorical( fill_mock.return_value, get_table_meta_mock.return_value ) - @patch('sdv.sampler.Sampler._fill_text_columns', autospec=True) - @patch('sdv.sampler.Sampler.update_mapping_list') - @patch('sdv.sampler.Sampler._rescale_values', autospec=True) - @patch('sdv.sampler.Sampler._get_table_meta', autospec=True) - def test_transform_synthesized_rows_no_pk_but_categorical( - self, get_table_meta_mock, rescale_mock, update_mock, fill_mock): - - """transform_synthesized_rows will update internal state and reverse transform rows.""" - # Setup - Class Instantiation - data_navigator = MagicMock() - modeler = MagicMock() - sampler = Sampler(data_navigator, modeler) - - # Setup - Mock configuration - table_metadata = { - 'fields': { - 'column_A': { - 'type': 'categorical', - }, - 'column_B': { - 'name': 'column', - 'type': 'number' - } - }, - 'primary_key': None - } - table_data = pd.DataFrame(columns=['column_A', 'column_B']) - test_table = Table(table_data, table_metadata) - data_navigator.tables = { - 'table': test_table - } - - data_navigator.ht.reverse_transform_table.return_value = pd.DataFrame({ - 'column_A': ['some', 'transformed values'], - 'column_B': ['another', 'transformed column'] - }) - - get_table_meta_mock.return_value = { - 'original': 'meta', - 'fields': [] - } - - rescale_mock.side_effect = lambda x: pd.Series([0.1, 0.8], name=x.name) - - fill_mock.return_value = pd.DataFrame({ - 'column_A': ['filled', 'text_values'], - 'column_B': ['nothing', 'numerical'] - }) - - # Setup - Method arguments / expected result - synthesized_rows = pd.DataFrame({ - 'column_A': [1.7, 2.5], - 'column_B': [4.7, 5.1], - 'model_parameters': ['some', 'parameters'] - }) - table_name = 'table' - num_rows = 2 - - expected_result = pd.DataFrame({ - 'column_A': ['some', 'transformed values'], - 'column_B': ['another', 'transformed column'] - }) - - # Run - result = sampler.transform_synthesized_rows(synthesized_rows, table_name, num_rows) - - # Check - Result - 
assert result.equals(expected_result) - - # Check - Class internal state - assert sampler.sampled == update_mock.return_value - - # Check - Mock calls - get_table_meta_mock.assert_called_once_with(sampler, data_navigator.meta, 'table') - update_mock.assert_called_once_with({}, 'table', (None, synthesized_rows)) - fill_mock.assert_called_once_with( - sampler, synthesized_rows, ['column_A', 'column_B'], 'table') - - data_navigator.ht.reverse_transform_table.assert_called_once_with( - fill_mock.return_value, get_table_meta_mock.return_value - ) - def test__prepare_sampled_covariance(self): """ """ # Setup @@ -575,3 +475,93 @@ def test__unflatten_gaussian_copula_negative_std(self): data_navigator.assert_not_called() modeler.assert_not_called() + + def test__sample_valid_rows_respect_categorical_values(self): + """_sample_valid_rows will return rows with valid values for categorical columns.""" + # Setup + data_navigator = MagicMock(spec=DataNavigator) + modeler = MagicMock(spec=Modeler) + sampler = Sampler(data_navigator, modeler) + + data_navigator.meta = { + 'tables': [ + { + 'name': 'table_name', + 'fields': [ + { + 'name': 'field_A', + 'type': 'categorical' + }, + { + 'name': 'field_B', + 'type': 'categorical' + } + ] + } + ] + } + + num_rows = 5 + table_name = 'table_name' + model = MagicMock(spec=GaussianMultivariate) + model.fitted = True + sample_dataframe = pd.DataFrame([ + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 1.5}, # Invalid field_B + {'field_A': 1.5, 'field_B': 0.5}, # Invalid field_A + ]) + + model.sample.side_effect = lambda x: sample_dataframe.iloc[:x].copy() + + expected_model_call_args_list = [ + ((5,), {}), + ((2,), {}) + ] + + expected_result = pd.DataFrame([ + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + {'field_A': 0.5, 'field_B': 0.5}, + ]) + + # Run + result = sampler._sample_valid_rows(model, num_rows, table_name) + + # Check + assert result.equals(expected_result) + + modeler.assert_not_called() + assert len(modeler.method_calls) == 0 + + data_navigator.assert_not_called() + assert len(data_navigator.method_calls) == 0 + + assert model.sample.call_args_list == expected_model_call_args_list + + def test__sample_valid_rows_raises_unfitted_model(self): + """_sample_valid_rows raise an exception for invalid models.""" + # Setup + data_navigator = MagicMock(spec=DataNavigator) + modeler = MagicMock(spec=Modeler) + sampler = Sampler(data_navigator, modeler) + + data_navigator.get_parents.return_value = set() + + num_rows = 5 + table_name = 'table_name' + model = None + + # Run + with self.assertRaises(ValueError): + sampler._sample_valid_rows(model, num_rows, table_name) + + # Check + modeler.assert_not_called() + assert len(modeler.method_calls) == 0 + + data_navigator.assert_not_called() + data_navigator.get_parents.assert_called_once_with('table_name') From bb824fe9355cacb057d75aca5ca22907b9b72d9f Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Tue, 12 Feb 2019 14:51:03 +0100 Subject: [PATCH 5/6] Uniform std setting on gaussian modeler --- sdv/modeler.py | 10 ++++++++++ sdv/sampler.py | 24 ++++++++++-------------- tests/sdv/test_modeler.py | 10 +++++----- tests/sdv/test_sampler.py | 4 ++-- 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index 8e5145356..3fe74a434 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -6,6 +6,7 @@ 
from copulas import get_qualified_name from copulas.multivariate import GaussianMultivariate, TreeTypes from copulas.univariate import GaussianUnivariate +from rdt.transformers.positive_number import PositiveNumberTransformer # Configure logger logger = logging.getLogger(__name__) @@ -162,6 +163,15 @@ def flatten_model(self, model, name=''): values.append(row[:index + 1]) model.covariance = np.array(values) + if self.model_kwargs['distribution'] == get_qualified_name(DEFAULT_DISTRIBUTION): + transformer = PositiveNumberTransformer({ + 'name': 'field', + 'type': 'number' + }) + + for distribution in model.distribs.values(): + column = pd.DataFrame({'field': [distribution.std]}) + distribution.std = transformer.reverse_transform(column).loc[0, 'field'] return pd.Series(self._flatten_dict(model.to_dict(), name)) diff --git a/sdv/sampler.py b/sdv/sampler.py index 96028a487..fe741c84f 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -358,23 +358,19 @@ def _unflatten_gaussian_copula(self, model_parameters): 'fitted': True, 'type': distribution_name } + model_parameters['distribution'] = distribution_name distribs = model_parameters['distribs'] - if any([distribs[key]['std'] <= 0 for key in distribs]): - metadata = { - 'name': 'std', - 'type': 'number' - } - transformer = PositiveNumberTransformer(metadata) - - model_parameters['distribution'] = distribution_name - for key in distribs: - distribs[key].update(distribution_kwargs) + metadata = { + 'name': 'std', + 'type': 'number' + } + transformer = PositiveNumberTransformer(metadata) - distribution_std = distribs[key]['std'] - if distribution_std <= 0: - df = pd.DataFrame({'std': [distribution_std]}) - distribs[key]['std'] = transformer.fit_transform(df)['std'].values[0] + for distribution in distribs.values(): + distribution.update(distribution_kwargs) + df = pd.DataFrame({'std': [distribution['std']]}) + distribution['std'] = transformer.transform(df).loc[0, 'std'] covariance = model_parameters['covariance'] covariance = self._prepare_sampled_covariance(covariance) diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index cedcba425..9d04eabf4 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -36,9 +36,9 @@ def test__create_extension(self): 'covariance__1__0': 0.0, 'covariance__1__1': 1.4999999999999991, 'distribs__a__mean': 0.0, - 'distribs__a__std': 0.001, + 'distribs__a__std': -6.907755278982137, 'distribs__b__mean': 3.0, - 'distribs__b__std': 1.632993161855452 + 'distribs__b__std': 0.4904146265058631 }) # Run @@ -156,11 +156,11 @@ def test_flatten_model(self): 'covariance__2__1': -0.7500000000000003, 'covariance__2__2': 1.5000000000000007, 'distribs__0__mean': 0.33333333333333331, - 'distribs__0__std': 0.47140452079103168, + 'distribs__0__std': -0.7520386983881371, 'distribs__1__mean': 0.33333333333333331, - 'distribs__1__std': 0.47140452079103168, + 'distribs__1__std': -0.7520386983881371, 'distribs__2__mean': 0.33333333333333331, - 'distribs__2__std': 0.47140452079103168 + 'distribs__2__std': -0.7520386983881371 }) data_navigator = mock.MagicMock() modeler = Modeler(data_navigator) diff --git a/tests/sdv/test_sampler.py b/tests/sdv/test_sampler.py index 4146f6f7b..1fefb881c 100644 --- a/tests/sdv/test_sampler.py +++ b/tests/sdv/test_sampler.py @@ -377,11 +377,11 @@ def test__unflatten_gaussian_copula(self): 'distribs': { 0: { 'first': 'distribution', - 'std': 1 + 'std': 0 }, 1: { 'second': 'distribution', - 'std': 1 + 'std': 0 } } } From f97a4dfd588aaa8710ae7f25f268cc648727c282 Mon Sep 17 00:00:00 
2001 From: Manuel Alvarez Date: Wed, 13 Feb 2019 11:11:29 +0100 Subject: [PATCH 6/6] Rename __get_missing_valid_rows -> _get_missing_valid_rows --- sdv/sampler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdv/sampler.py b/sdv/sampler.py index fe741c84f..b57f07c41 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -407,7 +407,7 @@ def unflatten_model(self, parent_row, table_name, parent_name): return self.modeler.model.from_dict(model_parameters) - def __get_missing_valid_rows(self, synthesized, drop_indices, valid_rows, num_rows): + def _get_missing_valid_rows(self, synthesized, drop_indices, valid_rows, num_rows): """ Args: @@ -453,7 +453,7 @@ def _sample_valid_rows(self, model, num_rows, table_name): if filtered_values.any(): drop_indices |= filtered_values - missing_rows, valid_rows = self.__get_missing_valid_rows( + missing_rows, valid_rows = self._get_missing_valid_rows( synthesized, drop_indices, valid_rows, num_rows) while missing_rows: @@ -467,7 +467,7 @@ def _sample_valid_rows(self, model, num_rows, table_name): if filtered_values.any(): drop_indices |= filtered_values - missing_rows, valid_rows = self.__get_missing_valid_rows( + missing_rows, valid_rows = self._get_missing_valid_rows( synthesized, drop_indices, valid_rows, num_rows) return valid_rows
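
Note on PATCH 6/6: the rename from `__get_missing_valid_rows` to `_get_missing_valid_rows` is presumably about Python name mangling, which rewrites double-underscore attributes to `_ClassName__attribute` and makes such methods awkward to call or patch from tests. A small sketch of the behaviour, using an illustrative class rather than the real Sampler:

    class Example:
        def __hidden(self):       # stored as _Example__hidden
            return 'mangled'

        def _internal(self):      # single underscore: no mangling
            return 'plain'

    obj = Example()
    obj._internal()               # 'plain'
    obj._Example__hidden()        # 'mangled', only reachable via the mangled name
    # obj.__hidden()              # AttributeError outside the class body

With the double-underscore name, a test would have to patch `Sampler._Sampler__get_missing_valid_rows`; after the rename a plain `_get_missing_valid_rows` patch target works.
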