From 4c75cb8d7d02625ab812e299af15818422bc47ad Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Tue, 22 Jan 2019 16:50:58 +0100 Subject: [PATCH 01/11] Make modeler.model use distribution arg. --- sdv/modeler.py | 3 ++- tests/sdv/test_modeler.py | 29 ++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index 069d5bffb..b8702b4a9 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -2,6 +2,7 @@ import pickle import pandas as pd +from copulas import get_qualified_name from copulas.multivariate import GaussianMultivariate from copulas.univariate import GaussianUnivariate @@ -124,7 +125,7 @@ def fit_model(self, data): Returns: GaussianMultivariate: Fitted model. """ - model = self.model() + model = self.model(distribution=get_qualified_name(self.distribution)) model.fit(data) return model diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 410f78831..9a61677ca 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -1,7 +1,8 @@ -from unittest import TestCase +from unittest import TestCase, mock import numpy as np import pandas as pd +from copulas.univariate.kde import KDEUnivariate from sdv.data_navigator import CSVDataLoader from sdv.modeler import Modeler @@ -164,3 +165,29 @@ def test_get_foreign_key(self): # Check assert result == expected_result + + def test_fit_model_distribution_arg(self): + """fit_model will pass self.distribution FQN to modeler.""" + # Setup + model_mock = mock.MagicMock() + modeler = Modeler(data_navigator='navigator', model=model_mock, distribution=KDEUnivariate) + data = pd.DataFrame({ + 'column': [0, 1, 1, 1, 0], + }) + + # Run + modeler.fit_model(data) + + # Check + model_mock.assert_called_once_with(distribution='copulas.univariate.kde.KDEUnivariate') + + def test_model_database_distribution_arg(self): + """model_database will use self.distribution to model tables.""" + # Setup + modeler = Modeler(data_navigator=self.dn, distribution=KDEUnivariate) + + # Run + modeler.model_database() + + # Check + assert True From 0016afce26f375ab59cc3f70bfadb54909856fad Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Wed, 23 Jan 2019 18:02:00 +0100 Subject: [PATCH 02/11] Flattening of generic models WIP --- sdv/modeler.py | 74 +++++++++++++++++++++++++-- tests/sdv/test_modeler.py | 102 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 168 insertions(+), 8 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index b8702b4a9..871639774 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -1,6 +1,7 @@ import logging import pickle +import numpy as np import pandas as pd from copulas import get_qualified_name from copulas.multivariate import GaussianMultivariate @@ -11,6 +12,7 @@ DEFAULT_MODEL = GaussianMultivariate DEFAULT_DISTRIBUTION = GaussianUnivariate +IGNORED_DICT_KEYS = ['fitted', 'distribution', 'type'] class Modeler: @@ -61,6 +63,67 @@ def get_pk_value(self, pk, index, mapping): return val + @classmethod + def flatten_nested_array(cls, nested, prefix=''): + """Return a dictionary with the values of the given nested array. + + Args: + nested (list, np.array): Iterable to flatten. 
+ """ + result = {} + for index in range(len(nested)): + prefix_key = '__'.join([prefix, str(index)]) if len(prefix) else str(index) + + if isinstance(nested[index], (list, np.ndarray)): + result.update(cls.flatten_nested_array(nested[index], prefix=prefix_key)) + + else: + result[prefix_key] = nested[index] + + return result + + @classmethod + def flatten_nested_dict(cls, nested, prefix=''): + """Return a flatten dict from a nested one. + + This method returns a flatten version of a dictionary, concatenating key names with + double underscores, that is: + + >>> nested_dict = { + 'my_key':{ + 'a': 1, + 'b': 2 + } + } + >>> Modeler.flatten_nested_dict(nested_dict) + { + 'my_key__a': 1, + 'my_key__b': 2 + } + + Args: + nested (dict): Original dictionary to flatten. + prefix (str): Prefix to append to key name + """ + result = {} + + for key in nested.keys(): + prefix_key = '__'.join([prefix, str(key)]) if len(prefix) else key + + if key in IGNORED_DICT_KEYS: + continue + + elif isinstance(nested[key], dict): + result.update(cls.flatten_nested_dict(nested[key], prefix_key)) + + elif isinstance(nested[key], (np.ndarray, list)): + result.update(cls.flatten_nested_array(nested[key], prefix_key)) + + else: + result[prefix_key] = nested[key] + + return result + def flatten_model(self, model): """Flatten a model's parameters into an array. @@ -130,11 +193,12 @@ def fit_model(self, data): return model - def _create_extension(self, df, transformed_child_table): + def _create_extension(self, df, transformed_child_table, foreign_key): """Return the flattened model from a dataframe.""" - # remove column of foreign key try: conditional_data = transformed_child_table.loc[df.index] + conditional_data = conditional_data.drop(foreign_key, axis=1) + except KeyError: return None @@ -142,10 +206,10 @@ def _create_extension(self, df, transformed_child_table): return self.flatten_model(self.fit_model(clean_df)) - def _extension_from_group(self, transformed_child_table): + def _extension_from_group(self, transformed_child_table, foreign_key): """Wrapper around _create_extension to use it with pd.DataFrame.apply.""" def f(group): - return self._create_extension(group, transformed_child_table) + return self._create_extension(group, transformed_child_table, foreign_key) return f def _get_extensions(self, pk, children, table_name): @@ -177,7 +241,7 @@ def _get_extensions(self, pk, children, table_name): continue extension = child_table.groupby(fk) - extension = extension.apply(self._extension_from_group(transformed_child_table)) + extension = extension.apply(self._extension_from_group(transformed_child_table, fk)) if len(extension): # keep track of child column indices diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 9a61677ca..00a6fa78f 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -1,4 +1,4 @@ -from unittest import TestCase, mock +from unittest import TestCase, mock, skip import numpy as np import pandas as pd @@ -21,7 +21,8 @@ def test__create_extension(self): """Tests that the create extension method returns correct parameters.""" # Setup child_table = self.dn.get_data('DEMO_ORDERS') - user = child_table[child_table['CUSTOMER_ID'] == 50] + foreign_key = 'CUSTOMER_ID' + user = child_table[child_table[foreign_key] == 50] expected = pd.Series([ 1.500000e+00, 0.000000e+00, -1.269991e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, @@ -34,7 +35,7 @@ def test__create_extension(self): ]) # Run - parameters = self.modeler._create_extension(user, child_table) + parameters = 
self.modeler._create_extension(user, child_table, foreign_key) # Check assert expected.subtract(parameters).all() < 10E-3 @@ -191,3 +192,98 @@ def test_model_database_distribution_arg(self): # Check assert True + + def test_flatten_nested_dict_flat_dict(self): + """flatten_nested_dict don't modify flat dicts.""" + # Setup + nested_dict = { + 'a': 1, + 'b': 2 + } + expected_result = { + 'a': 1, + 'b': 2 + } + + # Run + result = Modeler.flatten_nested_dict(nested_dict) + + # Check + assert result == expected_result + + def test_flatten_nested_dict_nested_dict(self): + """flatten_nested_dict flatten nested dicts respecting the prefixes.""" + # Setup + nested_dict = { + 'first_key': { + 'a': 1, + 'b': 2 + }, + 'second_key': { + 'x': 0 + } + } + + expected_result = { + 'first_key__a': 1, + 'first_key__b': 2, + 'second_key__x': 0 + } + + # Run + result = Modeler.flatten_nested_dict(nested_dict) + + # Check + assert result == expected_result + + def test_flatten_nested_array_ndarray(self): + """flatten_nested_array_ return a dict formed from the input np.array""" + # Setup + nested = np.array([ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1] + ]) + expected_result = { + '0__0': 1, + '0__1': 0, + '0__2': 0, + '1__0': 0, + '1__1': 1, + '1__2': 0, + '2__0': 0, + '2__1': 0, + '2__2': 1 + } + + # Run + result = Modeler.flatten_nested_array(nested) + + # Check + assert result == expected_result + + def test_flatten_nested_array_LIST(self): + """flatten_nested_array_ return a dict formed from the input list""" + # Setup + nested = [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1] + ] + expected_result = { + '0__0': 1, + '0__1': 0, + '0__2': 0, + '1__0': 0, + '1__1': 1, + '1__2': 0, + '2__0': 0, + '2__1': 0, + '2__2': 1 + } + + # Run + result = Modeler.flatten_nested_array(nested) + + # Check + assert result == expected_result From 40819e49a29ed9eb9f8fc5dc803913955141b6a5 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Thu, 24 Jan 2019 14:23:42 +0100 Subject: [PATCH 03/11] Add Sampler.unflatten_dict method --- sdv/modeler.py | 21 +++++++------ sdv/sampler.py | 60 +++++++++++++++++++++++++++++++++++++ tests/sdv/test_modeler.py | 30 +++++++++---------- tests/sdv/test_sampler.py | 62 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+), 26 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index 871639774..fa9eb22cf 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -64,7 +64,7 @@ def get_pk_value(self, pk, index, mapping): return val @classmethod - def flatten_nested_array(cls, nested, prefix=''): + def _flatten_array(cls, nested, prefix=''): """Return a dictionary with the values of the given nested array. Args: @@ -75,7 +75,7 @@ def flatten_nested_array(cls, nested, prefix=''): prefix_key = '__'.join([prefix, str(index)]) if len(prefix) else str(index) if isinstance(nested[index], (list, np.ndarray)): - result.update(cls.flatten_nested_array(nested[index], prefix=prefix_key)) + result.update(cls._flatten_array(nested[index], prefix=prefix_key)) else: result[prefix_key] = nested[index] @@ -83,7 +83,7 @@ def flatten_nested_array(cls, nested, prefix=''): return result @classmethod - def flatten_nested_dict(cls, nested, prefix=''): + def _flatten_dict(cls, nested, prefix=''): """Return a flatten dict from a nested one. 
This method returns a flatten version of a dictionary, concatenating key names with @@ -95,7 +95,7 @@ def flatten_nested_dict(cls, nested, prefix=''): 'b': 2 } } - >>> Modeler.flatten_nested_dict(nested_dict) + >>> Modeler.flatten_dict(nested_dict) { 'my_key__a': 1, 'my_key__b': 2 @@ -114,10 +114,10 @@ def flatten_nested_dict(cls, nested, prefix=''): continue elif isinstance(nested[key], dict): - result.update(cls.flatten_nested_dict(nested[key], prefix_key)) + result.update(cls._flatten_dict(nested[key], prefix_key)) elif isinstance(nested[key], (np.ndarray, list)): - result.update(cls.flatten_nested_array(nested[key], prefix_key)) + result.update(cls._flatten_array(nested[key], prefix_key)) else: result[prefix_key] = nested[key] @@ -193,11 +193,10 @@ def fit_model(self, data): return model - def _create_extension(self, df, transformed_child_table, foreign_key): + def _create_extension(self, df, transformed_child_table): """Return the flattened model from a dataframe.""" try: conditional_data = transformed_child_table.loc[df.index] - conditional_data = conditional_data.drop(foreign_key, axis=1) except KeyError: return None @@ -206,10 +205,10 @@ def _create_extension(self, df, transformed_child_table, foreign_key): return self.flatten_model(self.fit_model(clean_df)) - def _extension_from_group(self, transformed_child_table, foreign_key): + def _extension_from_group(self, transformed_child_table): """Wrapper around _create_extension to use it with pd.DataFrame.apply.""" def f(group): - return self._create_extension(group, transformed_child_table, foreign_key) + return self._create_extension(group, transformed_child_table) return f def _get_extensions(self, pk, children, table_name): @@ -241,7 +240,7 @@ def _get_extensions(self, pk, children, table_name): continue extension = child_table.groupby(fk) - extension = extension.apply(self._extension_from_group(transformed_child_table, fk)) + extension = extension.apply(self._extension_from_group(transformed_child_table)) if len(extension): # keep track of child column indices diff --git a/sdv/sampler.py b/sdv/sampler.py index 1b8592684..4f104949f 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -337,3 +337,63 @@ def update_mapping_list(self, mapping, key, value): mapping[key] = [value] return mapping + + @classmethod + def _unflatten_dict(cls, flat): + """Transform a flattened dict into its original form. + + Works in exact opposite way that `sdv.Modeler._flatten_dict`. + + Args: + flat (dict): Flattened dict. 
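+
+        Returns:
+            dict: Nested dict, rebuilt by splitting the flat keys on '__'.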
+ + """ + + result = {} + + for key in sorted(flat.keys()): + path = key.split('__') + value = flat[key] + walked = result + for step, name in enumerate(path): + + if isinstance(walked, dict) and name in walked: + walked = walked[name] + continue + + elif isinstance(walked, list) and len(walked) and len(walked) - 1 >= int(name): + walked = walked[int(name)] + continue + + else: + if name.isdigit(): + name = int(name) + + if step == len(path) - 1: + if isinstance(walked, list): + walked.append(value) + + else: + walked[name] = value + + else: + next_step = path[step + 1] + if next_step.isdigit(): + if isinstance(name, int): + walked.append([]) + + else: + walked[name] = [] + + walked = walked[name] + + else: + if isinstance(name, int): + walked.append({}) + + else: + walked[name] = {} + + walked = walked[name] + + return result diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 00a6fa78f..3d9f1bfa5 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -21,8 +21,7 @@ def test__create_extension(self): """Tests that the create extension method returns correct parameters.""" # Setup child_table = self.dn.get_data('DEMO_ORDERS') - foreign_key = 'CUSTOMER_ID' - user = child_table[child_table[foreign_key] == 50] + user = child_table[child_table['CUSTOMER_ID'] == 50] expected = pd.Series([ 1.500000e+00, 0.000000e+00, -1.269991e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, @@ -35,7 +34,7 @@ def test__create_extension(self): ]) # Run - parameters = self.modeler._create_extension(user, child_table, foreign_key) + parameters = self.modeler._create_extension(user, child_table) # Check assert expected.subtract(parameters).all() < 10E-3 @@ -182,6 +181,7 @@ def test_fit_model_distribution_arg(self): # Check model_mock.assert_called_once_with(distribution='copulas.univariate.kde.KDEUnivariate') + @skip('Work in Progress') def test_model_database_distribution_arg(self): """model_database will use self.distribution to model tables.""" # Setup @@ -193,8 +193,8 @@ def test_model_database_distribution_arg(self): # Check assert True - def test_flatten_nested_dict_flat_dict(self): - """flatten_nested_dict don't modify flat dicts.""" + def test__flatten_dict_flat_dict(self): + """_flatten_dict don't modify flat dicts.""" # Setup nested_dict = { 'a': 1, @@ -206,13 +206,13 @@ def test_flatten_nested_dict_flat_dict(self): } # Run - result = Modeler.flatten_nested_dict(nested_dict) + result = Modeler._flatten_dict(nested_dict) # Check assert result == expected_result - def test_flatten_nested_dict_nested_dict(self): - """flatten_nested_dict flatten nested dicts respecting the prefixes.""" + def test__flatten_dict_nested_dict(self): + """_flatten_dict flatten nested dicts respecting the prefixes.""" # Setup nested_dict = { 'first_key': { @@ -231,13 +231,13 @@ def test_flatten_nested_dict_nested_dict(self): } # Run - result = Modeler.flatten_nested_dict(nested_dict) + result = Modeler._flatten_dict(nested_dict) # Check assert result == expected_result - def test_flatten_nested_array_ndarray(self): - """flatten_nested_array_ return a dict formed from the input np.array""" + def test__flatten_array_ndarray(self): + """_flatten_array return a dict formed from the input np.array""" # Setup nested = np.array([ [1, 0, 0], @@ -257,13 +257,13 @@ def test_flatten_nested_array_ndarray(self): } # Run - result = Modeler.flatten_nested_array(nested) + result = Modeler._flatten_array(nested) # Check assert result == expected_result - def test_flatten_nested_array_LIST(self): - 
"""flatten_nested_array_ return a dict formed from the input list""" + def test__flatten_array_list(self): + """_flatten_array return a dict formed from the input list""" # Setup nested = [ [1, 0, 0], @@ -283,7 +283,7 @@ def test_flatten_nested_array_LIST(self): } # Run - result = Modeler.flatten_nested_array(nested) + result = Modeler._flatten_array(nested) # Check assert result == expected_result diff --git a/tests/sdv/test_sampler.py b/tests/sdv/test_sampler.py index 9776046d4..8355882e6 100644 --- a/tests/sdv/test_sampler.py +++ b/tests/sdv/test_sampler.py @@ -72,3 +72,65 @@ def test_sample_all(self): primary_key = self.sampler.dn.get_meta_data(name)['primary_key'] assert len(table) == 5 assert len(table[primary_key].unique()) == 5 + + def test_unflatten_dict(self): + """ """ + # Setup + flat = { + 'first_key__a': 1, + 'first_key__b': 2, + 'second_key__x': 0 + } + + expected_result = { + 'first_key': { + 'a': 1, + 'b': 2 + }, + 'second_key': { + 'x': 0 + } + } + + # Run + result = Sampler._unflatten_dict(flat) + + # Check + assert result == expected_result + + def test_unflatten_dict_mixed_array(self): + """ """ + # Setup + flat = { + 'first_key__0__0': 1, + 'first_key__0__1': 0, + 'first_key__1__0': 0, + 'first_key__1__1': 1, + 'second_key__0__std': 0.5, + 'second_key__0__mean': 0.5, + 'second_key__1__std': 0.25, + 'second_key__1__mean': 0.25 + } + + expected_result = { + 'first_key': [ + [1, 0], + [0, 1] + ], + 'second_key': [ + { + 'std': 0.5, + 'mean': 0.5 + }, + { + 'std': 0.25, + 'mean': 0.25 + } + ] + } + + # Run + result = Sampler._unflatten_dict(flat) + + # Check + assert result == expected_result From 4d40448b2012ad5d8e7e386d1c78da0a4d492982 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Thu, 24 Jan 2019 17:22:37 +0100 Subject: [PATCH 04/11] Modeler.flatten_model using generic _flatten_dict --- sdv/modeler.py | 12 ++--- tests/sdv/test_modeler.py | 100 ++++++++++++++++++++++++-------------- 2 files changed, 67 insertions(+), 45 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index fa9eb22cf..226a63c4b 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -124,7 +124,8 @@ def _flatten_dict(cls, nested, prefix=''): return result - def flatten_model(self, model): + @classmethod + def flatten_model(cls, model): """Flatten a model's parameters into an array. Args: @@ -133,18 +134,13 @@ def flatten_model(self, model): Returns: pd.Series: parameters for model """ - params = list(model.covariance.flatten()) - - for col_model in model.distribs.values(): - params.extend([col_model.std, col_model.mean]) - - return pd.Series(params) + return pd.Series(cls._flatten_dict(model.to_dict())) def get_foreign_key(self, fields, primary): """Get foreign key from primary key. Args: - fields (dict): metadata's fields key for a given table. + fields (dict): metadata `fields` key for a given table. primary (str): Name of primary key in original table. 
Return: diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 3d9f1bfa5..4376dd385 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +from copulas.multivariate import GaussianMultivariate from copulas.univariate.kde import KDEUnivariate from sdv.data_navigator import CSVDataLoader @@ -20,24 +21,45 @@ def setUp(self): def test__create_extension(self): """Tests that the create extension method returns correct parameters.""" # Setup - child_table = self.dn.get_data('DEMO_ORDERS') - user = child_table[child_table['CUSTOMER_ID'] == 50] - expected = pd.Series([ - 1.500000e+00, 0.000000e+00, -1.269991e+00, - 0.000000e+00, 0.000000e+00, 0.000000e+00, - -1.269991e+00, 0.000000e+00, 1.500000e+00, - 0.000000e+00, 0.000000e+00, -7.401487e-17, - 1.000000e+00, 7.000000e+00, 2.449490e+00, - 4.000000e+00, 5.000000e+01, 5.000000e+01, - 1.000000e-03, 5.000000e+01, 7.300000e+02, - 2.380000e+03, 7.618545e+02, 1.806667e+03 - ]) + data_navigator = mock.MagicMock() + modeler = Modeler(data_navigator) + table = pd.DataFrame({ + 'a': [0, 1, 0, 1, 0, 1], + 'b': [1, 2, 3, 4, 5, 6] + }) + group = table[table.a == 0] + + expected_result = pd.Series({ + 'covariance__0__0': 0.0, + 'covariance__0__1': 0.0, + 'covariance__1__0': 0.0, + 'covariance__1__1': 1.4999999999999991, + 'distribs__a__mean': 0.0, + 'distribs__a__std': 0.001, + 'distribs__b__mean': 3.0, + 'distribs__b__std': 1.632993161855452 + }) # Run - parameters = self.modeler._create_extension(user, child_table) + result = modeler._create_extension(group, table) # Check - assert expected.subtract(parameters).all() < 10E-3 + assert (expected_result == result).all() + + def test__create_extension_wrong_index_return_none(self): + """_create_extension raises an exception if df.index not in transformed_child_table.""" + # Setup + data_navigator = mock.MagicMock() + modeler = Modeler(data_navigator) + transformed_child_table = pd.DataFrame(np.eye(3), columns=['A', 'B', 'C']) + df = pd.DataFrame(index=range(5, 10)) + + # Run + result = modeler._create_extension(df, transformed_child_table) + + # Check + assert result is None + def test__get_extensions(self): """_get_extensions returns a works for table with child""" @@ -93,29 +115,33 @@ def test_CPA(self): def test_flatten_model(self): """flatten_model returns a pandas.Series with all the params to recreate a model.""" # Setup - for data in self.dn.transformed_data.values(): - num_columns = data.shape[1] - model = self.modeler.model() - model.fit(data) - - # We generate it this way because RDT behavior is not fully deterministic - # and transformed data can change between test runs. 
- distribs_values = np.array([ - [col_model.std, col_model.mean] - for col_model in model.distribs.values() - ]).flatten() - - expected_result = pd.Series( - list(model.covariance.flatten()) + - list(distribs_values) - ) - - # Run - result = self.modeler.flatten_model(model) - - # Check - assert (result == expected_result).all() - assert len(result) == num_columns ** 2 + (2 * num_columns) + model = GaussianMultivariate() + X = np.eye(3) + model.fit(X) + + expected_result = pd.Series({ + 'covariance__0__0': 1.5000000000000004, + 'covariance__0__1': -0.7500000000000003, + 'covariance__0__2': -0.7500000000000003, + 'covariance__1__0': -0.7500000000000003, + 'covariance__1__1': 1.5000000000000004, + 'covariance__1__2': -0.7500000000000003, + 'covariance__2__0': -0.7500000000000003, + 'covariance__2__1': -0.7500000000000003, + 'covariance__2__2': 1.5000000000000007, + 'distribs__0__mean': 0.33333333333333331, + 'distribs__0__std': 0.47140452079103168, + 'distribs__1__mean': 0.33333333333333331, + 'distribs__1__std': 0.47140452079103168, + 'distribs__2__mean': 0.33333333333333331, + 'distribs__2__std': 0.47140452079103168 + }) + + # Run + result = Modeler.flatten_model(model) + + # Check + assert (result == expected_result).all() def test_impute_table(self): """impute_table fills all NaN values with 0 or the mean of values.""" From 6c7be1f8711ea9258eecbed9add83d7dadadc2e5 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Thu, 24 Jan 2019 17:23:32 +0100 Subject: [PATCH 05/11] Update tests for get_extensions --- sdv/sampler.py | 3 --- tests/sdv/test_modeler.py | 55 ++++++++++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/sdv/sampler.py b/sdv/sampler.py index 4f104949f..7252d52ec 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -372,7 +372,6 @@ def _unflatten_dict(cls, flat): if step == len(path) - 1: if isinstance(walked, list): walked.append(value) - else: walked[name] = value @@ -381,7 +380,6 @@ def _unflatten_dict(cls, flat): if next_step.isdigit(): if isinstance(name, int): walked.append([]) - else: walked[name] = [] @@ -390,7 +388,6 @@ def _unflatten_dict(cls, flat): else: if isinstance(name, int): walked.append({}) - else: walked[name] = {} diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 4376dd385..1e72e41ad 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -5,7 +5,7 @@ from copulas.multivariate import GaussianMultivariate from copulas.univariate.kde import KDEUnivariate -from sdv.data_navigator import CSVDataLoader +from sdv.data_navigator import CSVDataLoader, Table from sdv.modeler import Modeler @@ -60,31 +60,56 @@ def test__create_extension_wrong_index_return_none(self): # Check assert result is None - - def test__get_extensions(self): - """_get_extensions returns a works for table with child""" + @mock.patch('sdv.modeler.Modeler.get_foreign_key') + @mock.patch('sdv.modeler.pd.core.groupby.GroupBy.apply') + def test__get_extensions(self, apply_mock, get_foreign_mock): + """_get_extensions return the conditional modelling parameters for each children.""" # Setup - pk = 'ORDER_ID' - table = 'DEMO_ORDERS' - children = self.dn.get_children(table) + data_navigator = mock.MagicMock() + + first_table_data = pd.DataFrame({'foreign_key': [0, 1]}) + first_table_meta = {'fields': []} + + data_navigator.tables = { + 'first_children': Table(first_table_data, first_table_meta), + 'second_children': Table(first_table_data, first_table_meta), + } + data_navigator.get_children.return_value = {} + modeler 
= Modeler(data_navigator) + modeler.tables = {} + + apply_mock.side_effect = lambda x: pd.DataFrame([{ + 'column_1': 1, + 'column_2': 2 + }]) + get_foreign_mock.return_value = 'foreign_key' + + pk = 'primary_key' + table_name = 'table_name' + children = ['first_children', 'second_children'] + + expected_result = [ + pd.DataFrame([[1, 2]]), + pd.DataFrame([[1, 2]], columns=range(2, 4)) + ] # Run - result = self.modeler._get_extensions(pk, children, table) + result = modeler._get_extensions(pk, children, table_name) # Check - assert len(result) == 1 - assert result[0].shape == (10, 35) + assert all([(result[index] == expected_result[index]).all().all() for index in range(2)]) def test_get_extensions_no_children(self): - """Tests that get extensions works for table with no children.""" + """_get_extensions return an empty list if children is empty.""" # Setup - pk = 'ORDER_ITEM_ID' - table = 'DEMO_ORDER_ITEMS' - children = self.dn.get_children(table) + pk = 'primary_key' + children = {} + table_name = 'table_name' + expected_result = [] # Run - result = self.modeler._get_extensions(pk, children, table) + result = self.modeler._get_extensions(pk, children, table_name) # Check assert result == expected_result From 3b21f89c21d25d94e95a1c1b3d40273b5dc1661f Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Fri, 25 Jan 2019 09:59:45 +0100 Subject: [PATCH 06/11] unflattering of models using generic method --- sdv/modeler.py | 41 ++++------ sdv/sampler.py | 164 ++++++++++++++++++++++++-------------- tests/sdv/test_modeler.py | 21 +++-- tests/sdv/test_sampler.py | 51 +++++++++++- 4 files changed, 181 insertions(+), 96 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index 226a63c4b..2a33c3196 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -125,16 +125,17 @@ def _flatten_dict(cls, nested, prefix=''): return result @classmethod - def flatten_model(cls, model): + def flatten_model(cls, model, name=''): """Flatten a model's parameters into an array. Args: - model: a model object + model(self.model): Instance of model. + name (str): Returns: pd.Series: parameters for model """ - return pd.Series(cls._flatten_dict(model.to_dict())) + return pd.Series(cls._flatten_dict(model.to_dict(), name)) def get_foreign_key(self, fields, primary): """Get foreign key from primary key. 
@@ -189,7 +190,7 @@ def fit_model(self, data): return model - def _create_extension(self, df, transformed_child_table): + def _create_extension(self, df, transformed_child_table, child_name=''): """Return the flattened model from a dataframe.""" try: conditional_data = transformed_child_table.loc[df.index] @@ -199,19 +200,16 @@ def _create_extension(self, df, transformed_child_table): clean_df = self.impute_table(conditional_data) - return self.flatten_model(self.fit_model(clean_df)) + return self.flatten_model(self.fit_model(clean_df), child_name) - def _extension_from_group(self, transformed_child_table): + def _extension_from_group(self, transformed_child_table, child=''): """Wrapper around _create_extension to use it with pd.DataFrame.apply.""" def f(group): - return self._create_extension(group, transformed_child_table) + return self._create_extension(group, transformed_child_table, child) return f def _get_extensions(self, pk, children, table_name): """Generate list of extension for child tables.""" - # keep track of which columns belong to which child - start = 0 - end = 0 extensions = [] # make sure child_locs has value for table name @@ -222,6 +220,12 @@ def _get_extensions(self, pk, children, table_name): child_table = self.dn.tables[child].data child_meta = self.dn.tables[child].meta + fields = child_meta['fields'] + fk = self.get_foreign_key(fields, pk) + + if not fk: + continue + # check if leaf node if not self.dn.get_children(child): transformed_child_table = self.dn.transformed_data[child] @@ -229,25 +233,12 @@ def _get_extensions(self, pk, children, table_name): else: transformed_child_table = self.tables[child] - fields = child_meta['fields'] - fk = self.get_foreign_key(fields, pk) - - if not fk: - continue - extension = child_table.groupby(fk) - extension = extension.apply(self._extension_from_group(transformed_child_table)) + extension = extension.apply( + self._extension_from_group(transformed_child_table, '__' + child)) if len(extension): - # keep track of child column indices - end = max(end, start + extension.shape[1]) - - self.child_locs[table_name][child] = (start, end) - - # rename columns - extension.columns = range(start, end) extensions.append(extension) - start = end return extensions diff --git a/sdv/sampler.py b/sdv/sampler.py index 7252d52ec..98a942aef 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -5,6 +5,8 @@ import exrex +GAUSSIAN_COPULA = 'copulas.multivariate.gaussian.GaussianMultivariate' + class Sampler: """Class to sample data from a model.""" @@ -13,7 +15,6 @@ def __init__(self, data_navigator, modeler): """Instantiate a new object.""" self.dn = data_navigator self.modeler = modeler - self.been_sampled = set() # table_name -> if already sampled self.sampled = {} # table_name -> [(primary_key, generated_row)] self.primary_key = {} @@ -104,6 +105,103 @@ def _get_parent_row(self, table_name): return random_parent, foreign_key, parent_row + def _unflatten_dict(self, flat, table_name=''): + """Transform a flattened dict into its original form. + + Works in exact opposite way that `sdv.Modeler._flatten_dict`. + + Args: + flat (dict): Flattened dict. 
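+            table_name (str): Name of the table the parameters belong to. Its children
+                are looked up so that keys belonging to a child extension are kept
+                together instead of being split at every double underscore.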
+ + """ + result = {} + children = self.dn.get_children(table_name) + for key in sorted(flat.keys()): + path = key.split('__') + + if any(['____{}'.format(child) in key for child in children]): + path = [ + path[0], + '__'.join(path[2: -1]), + path[-1] + ] + + value = flat[key] + walked = result + for step, name in enumerate(path): + + if isinstance(walked, dict) and name in walked: + walked = walked[name] + continue + + elif isinstance(walked, list) and len(walked) and len(walked) - 1 >= int(name): + walked = walked[int(name)] + continue + + else: + if name.isdigit(): + name = int(name) + + if step == len(path) - 1: + if isinstance(walked, list): + walked.append(value) + else: + walked[name] = value + + else: + next_step = path[step + 1] + if next_step.isdigit(): + if isinstance(name, int): + walked.append([]) + while len(walked) < name + 1: + walked.append([]) + + else: + walked[name] = [] + + walked = walked[name] + + else: + if isinstance(name, int): + walked.append({}) + else: + walked[name] = {} + + walked = walked[name] + + return result + + def unflatten_model(self, parent_row, table_name, parent_name): + """ Takes the params from a generated parent row and creates a model from it. + + Args: + parent_row (dataframe): a generated parent row + table_name (string): name of table to make model for + parent_name (string): name of parent table + """ + + prefix = '__{}__'.format(table_name) + columns = [column for column in parent_row.columns if column.startswith(prefix)] + new_columns = {column: column.replace(prefix, '') for column in columns} + flat_parameters = parent_row.loc[:, columns] + flat_parameters = flat_parameters.rename(columns=new_columns).to_dict('records')[0] + + model_dict = self._unflatten_dict(flat_parameters, table_name) + model_name = get_qualified_name(self.modeler.model) + distribution_name = get_qualified_name(self.modeler.distribution) + model_dict['fitted'] = True + model_dict['type'] = model_name + + if model_name == GAUSSIAN_COPULA: + model_dict['distribution'] = distribution_name + for key in model_dict['distribs']: + model_dict['distribs'][key].update({ + 'fitted': True, + 'type': distribution_name + }) + + return self.modeler.model.from_dict(model_dict) + def sample_rows(self, table_name, num_rows): """Sample specified number of rows for specified table. @@ -124,11 +222,10 @@ def sample_rows(self, table_name, num_rows): parent_row = parent_row.loc[[0]] # get parameters from parent to make model - model = self._make_model_from_params( - parent_row, table_name, random_parent) + model = self.unflatten_model(parent_row, table_name, random_parent) # sample from that model - if model is not None and len(model.distribs) > 0: + if model is not None and model.fitted: synthesized_rows = model.sample(num_rows) else: raise ValueError( @@ -147,7 +244,7 @@ def sample_rows(self, table_name, num_rows): else: # there is no parent model = self.modeler.models[table_name] - if len(model.distribs): + if model.fitted: synthesized_rows = model.sample(num_rows) else: raise ValueError( @@ -337,60 +434,3 @@ def update_mapping_list(self, mapping, key, value): mapping[key] = [value] return mapping - - @classmethod - def _unflatten_dict(cls, flat): - """Transform a flattened dict into its original form. - - Works in exact opposite way that `sdv.Modeler._flatten_dict`. - - Args: - flat (dict): Flattened dict. 
- - """ - - result = {} - - for key in sorted(flat.keys()): - path = key.split('__') - value = flat[key] - walked = result - for step, name in enumerate(path): - - if isinstance(walked, dict) and name in walked: - walked = walked[name] - continue - - elif isinstance(walked, list) and len(walked) and len(walked) - 1 >= int(name): - walked = walked[int(name)] - continue - - else: - if name.isdigit(): - name = int(name) - - if step == len(path) - 1: - if isinstance(walked, list): - walked.append(value) - else: - walked[name] = value - - else: - next_step = path[step + 1] - if next_step.isdigit(): - if isinstance(name, int): - walked.append([]) - else: - walked[name] = [] - - walked = walked[name] - - else: - if isinstance(name, int): - walked.append({}) - else: - walked[name] = {} - - walked = walked[name] - - return result diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 1e72e41ad..cecd7aa82 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -60,9 +60,10 @@ def test__create_extension_wrong_index_return_none(self): # Check assert result is None + @mock.patch('sdv.modeler.Modeler._extension_from_group') @mock.patch('sdv.modeler.Modeler.get_foreign_key') @mock.patch('sdv.modeler.pd.core.groupby.GroupBy.apply') - def test__get_extensions(self, apply_mock, get_foreign_mock): + def test__get_extensions(self, apply_mock, get_foreign_mock, extension_mock): """_get_extensions return the conditional modelling parameters for each children.""" # Setup data_navigator = mock.MagicMock() @@ -78,9 +79,11 @@ def test__get_extensions(self, apply_mock, get_foreign_mock): modeler = Modeler(data_navigator) modeler.tables = {} + extension_mock.side_effect = lambda x, y: y + apply_mock.side_effect = lambda x: pd.DataFrame([{ - 'column_1': 1, - 'column_2': 2 + '{}_column_1'.format(x): 1, + '{}_column_2'.format(x): 2 }]) get_foreign_mock.return_value = 'foreign_key' @@ -89,15 +92,21 @@ def test__get_extensions(self, apply_mock, get_foreign_mock): children = ['first_children', 'second_children'] expected_result = [ - pd.DataFrame([[1, 2]]), - pd.DataFrame([[1, 2]], columns=range(2, 4)) + pd.DataFrame([{ + '__first_children_column_1': 1, + '__first_children_column_2': 2 + }]), + pd.DataFrame([{ + '__second_children_column_1': 1, + '__second_children_column_2': 2 + }]) ] # Run result = modeler._get_extensions(pk, children, table_name) # Check - assert all([(result[index] == expected_result[index]).all().all() for index in range(2)]) + assert all([result[index].equals(expected_result[index]) for index in range(len(result))]) def test_get_extensions_no_children(self): """_get_extensions return an empty list if children is empty.""" diff --git a/tests/sdv/test_sampler.py b/tests/sdv/test_sampler.py index 8355882e6..e0aaa6415 100644 --- a/tests/sdv/test_sampler.py +++ b/tests/sdv/test_sampler.py @@ -1,4 +1,5 @@ from unittest import TestCase +from unittest.mock import MagicMock from sdv.data_navigator import CSVDataLoader from sdv.modeler import Modeler @@ -76,6 +77,9 @@ def test_sample_all(self): def test_unflatten_dict(self): """ """ # Setup + data_navigator = MagicMock() + modeler = MagicMock() + sampler = Sampler(data_navigator, modeler) flat = { 'first_key__a': 1, 'first_key__b': 2, @@ -93,14 +97,19 @@ def test_unflatten_dict(self): } # Run - result = Sampler._unflatten_dict(flat) + result = sampler._unflatten_dict(flat) # Check assert result == expected_result + data_navigator.assert_not_called() + modeler.assert_not_called() def test_unflatten_dict_mixed_array(self): - 
""" """ + """unflatten_dict restruicture arrays""" # Setup + data_navigator = MagicMock() + modeler = MagicMock() + sampler = Sampler(data_navigator, modeler) flat = { 'first_key__0__0': 1, 'first_key__0__1': 0, @@ -130,7 +139,43 @@ def test_unflatten_dict_mixed_array(self): } # Run - result = Sampler._unflatten_dict(flat) + + result = sampler._unflatten_dict(flat) + + # Check + assert result == expected_result + data_navigator.assert_not_called() + modeler.assert_not_called() + + def test_unflatten_dict_extension(self): + """ """ + # Setup + data_navigator = MagicMock() + data_navigator.get_children.return_value = ['CHILD_TABLE'] + modeler = MagicMock() + sampler = Sampler(data_navigator, modeler) + + flat = { + 'first_key__a': 1, + 'first_key____CHILD_TABLE__model_param': 0, + 'distribs____CHILD_TABLE__distribs__UNIT_PRICE__std__mean': 0 + } + expected_result = { + 'first_key': { + 'a': 1, + 'CHILD_TABLE': { + 'model_param': 0 + } + }, + 'distribs': { + 'CHILD_TABLE__distribs__UNIT_PRICE__std': { + 'mean': 0 + } + } + } + + # Run + result = sampler._unflatten_dict(flat) # Check assert result == expected_result From cf4962f2e501308e387afe99569ae2bc24ed84ad Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Fri, 25 Jan 2019 19:33:49 +0100 Subject: [PATCH 07/11] Improved model_unflatten --- sdv/sampler.py | 155 ++++++++++++++++++-------------------- tests/sdv/test_sampler.py | 85 ++++++++++++++++----- 2 files changed, 141 insertions(+), 99 deletions(-) diff --git a/sdv/sampler.py b/sdv/sampler.py index 98a942aef..8e176e3f2 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -79,6 +79,7 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): # filter out parameters labels = list(self.dn.tables[table_name].data) + synthesized_rows = self._fill_text_columns(synthesized_rows, labels, table_name) # reverse transform data @@ -105,6 +106,49 @@ def _get_parent_row(self, table_name): return random_parent, foreign_key, parent_row + @staticmethod + def generate_keys(prefix=''): + def f(row): + parts = [str(row[key]) for key in row.keys() if row[key] is not None] + if prefix: + parts = [prefix] + parts + + return '__'.join(parts) + + return f + + @classmethod + def _get_sorted_keys(cls, _dict): + result = [] + keys = list(_dict.keys()) + + if not keys: + return [] + + serie = pd.Series(keys) + df = pd.DataFrame(serie.str.split('__').values.tolist()) + uniques = df[0].unique() + + for value in uniques: + index = df[df[0] == value].index + _slice = df.loc[index, range(1, df.shape[1])].copy() + + try: + for column in _slice.columns: + _slice[column] = _slice[column].astype(int) + + except (ValueError, TypeError): + pass + + df.drop(index, inplace=True) + _slice = _slice.sort_values(list(range(1, df.shape[1]))) + result += _slice.apply(cls.generate_keys(value), axis=1).values.tolist() + + df = df.sort_values(list(range(df.shape[1]))) + result += df.apply(cls.generate_keys(), axis=1).values.tolist() + + return result + def _unflatten_dict(self, flat, table_name=''): """Transform a flattened dict into its original form. 
@@ -116,13 +160,15 @@ def _unflatten_dict(self, flat, table_name=''): """ result = {} children = self.dn.get_children(table_name) - for key in sorted(flat.keys()): + keys = self._get_sorted_keys(flat) + + for key in keys: path = key.split('__') - if any(['____{}'.format(child) in key for child in children]): + if any(['__{}__'.format(child) in key for child in children]): path = [ path[0], - '__'.join(path[2: -1]), + '__'.join(path[1: -1]), path[-1] ] @@ -267,6 +313,30 @@ def sample_table(self, table_name): num_rows = orig_table.shape[0] return self.sample_rows(table_name, num_rows) + def _sample_child_rows(self, parent_name, parent_row, sampled_data, num_rows=5): + """Uses parameters from parent row to synthesize child rows. + + Args: + parent_name (str): name of parent table + parent_row (dataframe): synthesized parent row + sample_data (dict): maps table name to sampled data + num_rows (int): number of rows to synthesize per parent row + + Returns: + synthesized children rows + """ + + children = self.dn.get_children(parent_name) + for child in children: + rows = self.sample_rows(child, num_rows) + + if child in sampled_data: + sampled_data[child] = pd.concat([sampled_data[child], rows]) + else: + sampled_data[child] = rows + + self._sample_child_rows(child, rows.iloc[0:1, :], sampled_data) + def sample_all(self, num_rows=5): """Samples the entire database. @@ -283,6 +353,7 @@ def sample_all(self, num_rows=5): This is this way because the children tables are created modelling the relation thet have with their parent tables, so it's behavior may change from one table to another. """ + tables = self.dn.tables sampled_data = {} @@ -300,84 +371,6 @@ def sample_all(self, num_rows=5): return self.reset_indices_tables(sampled_data) - def _sample_child_rows(self, parent_name, parent_row, sampled_data, num_rows=5): - """Uses parameters from parent row to synthesize child rows. - - Args: - parent_name (str): name of parent table - parent_row (dataframe): synthesized parent row - sample_data (dict): maps table name to sampled data - num_rows (int): number of rows to synthesize per parent row - - Returns: - synthesized children rows - """ - children = self.dn.get_children(parent_name) - for child in children: - rows = self.sample_rows(child, num_rows) - - if child in sampled_data: - sampled_data[child] = pd.concat([sampled_data[child], rows]) - else: - sampled_data[child] = rows - - self._sample_child_rows(child, rows.iloc[0:1, :], sampled_data) - - def _make_model_from_params(self, parent_row, table_name, parent_name): - """ Takes the params from a generated parent row and creates a model from it. 
- - Args: - parent_row (dataframe): a generated parent row - table_name (string): name of table to make model for - parent_name (string): name of parent table - """ - # get parameters - child_range = self.modeler.child_locs.get(parent_name, {}).get(table_name, {}) - - if not child_range: - return None - - param_indices = list(range(child_range[0], child_range[1])) - params = parent_row.loc[:, param_indices] - totalcols = params.shape[1] - num_cols = self.modeler.tables[table_name].shape[1] - - # get labels for dataframe - labels = list(self.modeler.tables[table_name].columns) - - # parent_meta = self.dn.tables[parent_name].meta - # fk = parent_meta['primary_key'] - - # if fk in labels: - # labels.remove(fk) - # num_cols -= 1 - - cov_size = num_cols ** 2 - - # Covariance matrix - covariance = params.iloc[:, 0:cov_size] - covariance = covariance.values.reshape((num_cols, num_cols)) - - # Distributions - distributions = {} - for label_index, i in enumerate(range(cov_size, totalcols, 2)): - distributions[labels[label_index]] = { - 'type': get_qualified_name(self.modeler.distribution), - 'fitted': True, - 'std': abs(params.iloc[:, i]), # Pending for issue - 'mean': params.iloc[:, i + 1], # https://github.com/HDI-Project/SDV/issues/58 - } - - model_params = { - 'covariance': covariance, - 'distribs': distributions, - 'type': get_qualified_name(self.modeler.model), - 'fitted': True, - 'distribution': get_qualified_name(self.modeler.distribution) - } - - return self.modeler.model.from_dict(model_params) - def _get_table_meta(self, meta, table_name): """Return metadata get table meta for a given table name""" for table in meta['tables']: diff --git a/tests/sdv/test_sampler.py b/tests/sdv/test_sampler.py index e0aaa6415..c795ed3cb 100644 --- a/tests/sdv/test_sampler.py +++ b/tests/sdv/test_sampler.py @@ -1,5 +1,8 @@ from unittest import TestCase -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch + +import numpy as np +import pandas as pd from sdv.data_navigator import CSVDataLoader from sdv.modeler import Modeler @@ -55,24 +58,38 @@ def test_sample_rows_children_table(self): assert len(unique_foreign_keys) == 1 assert unique_foreign_keys[0] in sampled_parent['CUSTOMER_ID'].values - def test_sample_all(self): + @patch('sdv.sampler.pd.concat') + @patch('sdv.sampler.Sampler.reset_indices_tables') + @patch('sdv.sampler.Sampler._sample_child_rows') + @patch('sdv.sampler.Sampler.sample_rows') + def test_sample_all(self, rows_mock, child_mock, reset_mock, concat_mock): """Check sample_all and returns some value.""" + # Setup + data_navigator = MagicMock() + data_navigator.tables = ['TABLE_A', 'TABLE_B'] + data_navigator.get_parents.side_effect = lambda x: x != 'TABLE_A' + modeler = MagicMock() + sampler = Sampler(data_navigator, modeler) + + def fake_dataframe(name, number): + return pd.DataFrame([{name: 0} for i in range(number)], index=[0]*number) + + rows_mock.side_effect = fake_dataframe + concat_mock.return_value = 'concatenated_dataframe' + + expected_get_parents_call_list = [(('TABLE_A',), {}), (('TABLE_B',), {})] + expected_rows_mock_call_list = [(('TABLE_A', 1), {}) for i in range(5)] # Run - result = self.sampler.sample_all(num_rows=5) + result = sampler.sample_all(num_rows=5) # Check - assert result.keys() == self.sampler.dn.tables.keys() + assert data_navigator.get_parents.call_args_list == expected_get_parents_call_list + assert result == reset_mock.return_value - for name, table in result.items(): - with self.subTest(table=name): - raw_data = 
self.modeler.dn.tables[name].data - assert (table.columns == raw_data.columns).all() - - if not self.sampler.dn.get_parents(name): - primary_key = self.sampler.dn.get_meta_data(name)['primary_key'] - assert len(table) == 5 - assert len(table[primary_key].unique()) == 5 + assert rows_mock.call_args_list == expected_rows_mock_call_list + assert child_mock.call_count == 5 + reset_mock.assert_called_once_with({'TABLE_A': 'concatenated_dataframe'}) def test_unflatten_dict(self): """ """ @@ -147,8 +164,8 @@ def test_unflatten_dict_mixed_array(self): data_navigator.assert_not_called() modeler.assert_not_called() - def test_unflatten_dict_extension(self): - """ """ + def test_unflatten_dict_child_name(self): + """unflatten_dict will respect the name of child tables.""" # Setup data_navigator = MagicMock() data_navigator.get_children.return_value = ['CHILD_TABLE'] @@ -160,22 +177,54 @@ def test_unflatten_dict_extension(self): 'first_key____CHILD_TABLE__model_param': 0, 'distribs____CHILD_TABLE__distribs__UNIT_PRICE__std__mean': 0 } + table_name = 'TABLE_NAME' expected_result = { 'first_key': { 'a': 1, - 'CHILD_TABLE': { + '__CHILD_TABLE': { 'model_param': 0 } }, 'distribs': { - 'CHILD_TABLE__distribs__UNIT_PRICE__std': { + '__CHILD_TABLE__distribs__UNIT_PRICE__std': { 'mean': 0 } } } # Run - result = sampler._unflatten_dict(flat) + result = sampler._unflatten_dict(flat, table_name) + + # Check + assert result == expected_result + modeler.assert_not_called() + data_navigator.get_children.assert_called_once_with('TABLE_NAME') + + def test_unflatten_respect_covariance_matrix(self): + """unflatten_dict restructures the covariance matrix into an square matrix.""" + # Setup + data_navigator = MagicMock() + modeler = MagicMock() + sampler = Sampler(data_navigator, modeler) + + def fake_values(i, j): + return '{}, {}'.format(i, j) + + expected_result = { + 'covariance': np.array([ + [fake_values(i, j) for j in range(40)] + for i in range(40) + ]).tolist() + } + + flat = { + 'covariance__{}__{}'.format(i, j): fake_values(i, j) + for i in range(40) for j in range(40) + } + table_name = 'TABLE_NAME' + + # Run + result = sampler._unflatten_dict(flat, table_name) # Check assert result == expected_result From 62b05444201aa2db0fe2a2c7929f8ced0ef4a8ca Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Tue, 29 Jan 2019 16:59:32 +0100 Subject: [PATCH 08/11] Remove foreign_key column when creating extensions --- sdv/modeler.py | 12 ++++++++---- tests/sdv/test_modeler.py | 12 +++++++----- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index 2a33c3196..3a32071d8 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -190,10 +190,13 @@ def fit_model(self, data): return model - def _create_extension(self, df, transformed_child_table, child_name=''): + def _create_extension(self, df, transformed_child_table, table_info): """Return the flattened model from a dataframe.""" + + foreign_key, child_name = table_info try: conditional_data = transformed_child_table.loc[df.index] + conditional_data = conditional_data.drop(foreign_key, axis=1) except KeyError: return None @@ -202,10 +205,10 @@ def _create_extension(self, df, transformed_child_table, child_name=''): return self.flatten_model(self.fit_model(clean_df), child_name) - def _extension_from_group(self, transformed_child_table, child=''): + def _extension_from_group(self, transformed_child_table, table_info): """Wrapper around _create_extension to use it with pd.DataFrame.apply.""" def f(group): - return 
self._create_extension(group, transformed_child_table, child) + return self._create_extension(group, transformed_child_table, table_info) return f def _get_extensions(self, pk, children, table_name): @@ -233,9 +236,10 @@ def _get_extensions(self, pk, children, table_name): else: transformed_child_table = self.tables[child] + table_info = (fk, '__' + child) extension = child_table.groupby(fk) extension = extension.apply( - self._extension_from_group(transformed_child_table, '__' + child)) + self._extension_from_group(transformed_child_table, table_info)) if len(extension): extensions.append(extension) diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index cecd7aa82..fca014d8e 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -1,4 +1,4 @@ -from unittest import TestCase, mock, skip +from unittest import TestCase, mock import numpy as np import pandas as pd @@ -24,10 +24,12 @@ def test__create_extension(self): data_navigator = mock.MagicMock() modeler = Modeler(data_navigator) table = pd.DataFrame({ + 'foreign': [0, 1, 0, 1, 0, 1], 'a': [0, 1, 0, 1, 0, 1], 'b': [1, 2, 3, 4, 5, 6] }) group = table[table.a == 0] + table_info = ('foreign', '') expected_result = pd.Series({ 'covariance__0__0': 0.0, @@ -41,7 +43,7 @@ def test__create_extension(self): }) # Run - result = modeler._create_extension(group, table) + result = modeler._create_extension(group, table, table_info) # Check assert (expected_result == result).all() @@ -52,10 +54,11 @@ def test__create_extension_wrong_index_return_none(self): data_navigator = mock.MagicMock() modeler = Modeler(data_navigator) transformed_child_table = pd.DataFrame(np.eye(3), columns=['A', 'B', 'C']) + table_info = ('', '') df = pd.DataFrame(index=range(5, 10)) # Run - result = modeler._create_extension(df, transformed_child_table) + result = modeler._create_extension(df, transformed_child_table, table_info) # Check assert result is None @@ -79,7 +82,7 @@ def test__get_extensions(self, apply_mock, get_foreign_mock, extension_mock): modeler = Modeler(data_navigator) modeler.tables = {} - extension_mock.side_effect = lambda x, y: y + extension_mock.side_effect = lambda x, y: y[1] apply_mock.side_effect = lambda x: pd.DataFrame([{ '{}_column_1'.format(x): 1, @@ -241,7 +244,6 @@ def test_fit_model_distribution_arg(self): # Check model_mock.assert_called_once_with(distribution='copulas.univariate.kde.KDEUnivariate') - @skip('Work in Progress') def test_model_database_distribution_arg(self): """model_database will use self.distribution to model tables.""" # Setup From 046b4a4807040e4cec6880a99c8f8ae562955b07 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Wed, 30 Jan 2019 09:38:02 +0100 Subject: [PATCH 09/11] Invoke _create_extensions without groupby --- sdv/modeler.py | 38 ++++++++++++++++++++++++++++++++------ tests/sdv/test_modeler.py | 17 ++++++----------- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index 3a32071d8..1e4dfbed1 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -135,6 +135,7 @@ def flatten_model(cls, model, name=''): Returns: pd.Series: parameters for model """ + return pd.Series(cls._flatten_dict(model.to_dict(), name)) def get_foreign_key(self, fields, primary): @@ -190,12 +191,24 @@ def fit_model(self, data): return model - def _create_extension(self, df, transformed_child_table, table_info): - """Return the flattened model from a dataframe.""" + def _create_extension(self, foreign, transformed_child_table, table_info): + """Return the flattened 
model from a dataframe. + + Args: + foreign(pandas.DataFrame): Object with Index of elements from children table elements + of a given foreign_key. + transformed_child_table(pandas.DataFrame): Table of data to fil + table_info (tuple(str, str)): foreign_key and child table names. + + Returns: + pd.DataFrame : Parameter extension + + + """ foreign_key, child_name = table_info try: - conditional_data = transformed_child_table.loc[df.index] + conditional_data = transformed_child_table.loc[foreign.index].copy() conditional_data = conditional_data.drop(foreign_key, axis=1) except KeyError: @@ -203,6 +216,8 @@ def _create_extension(self, df, transformed_child_table, table_info): clean_df = self.impute_table(conditional_data) + # if min([len(clean_df[column].unique()) for column in clean_df.columns]) == 1: + # return None return self.flatten_model(self.fit_model(clean_df), child_name) def _extension_from_group(self, transformed_child_table, table_info): @@ -237,9 +252,20 @@ def _get_extensions(self, pk, children, table_name): transformed_child_table = self.tables[child] table_info = (fk, '__' + child) - extension = child_table.groupby(fk) - extension = extension.apply( - self._extension_from_group(transformed_child_table, table_info)) + + foreign_key_values = child_table[fk].unique() + parameters = {} + + for foreign_key in foreign_key_values: + foreign_index = child_table[child_table[fk] == foreign_key] + parameter = self._create_extension( + foreign_index, transformed_child_table, table_info) + + if parameter is not None: + parameters[foreign_key] = parameter.to_dict() + + extension = pd.DataFrame(parameters).T + extension.index.name = fk if len(extension): extensions.append(extension) diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index fca014d8e..02b933ca6 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -46,10 +46,10 @@ def test__create_extension(self): result = modeler._create_extension(group, table, table_info) # Check - assert (expected_result == result).all() + assert result.equals(expected_result) def test__create_extension_wrong_index_return_none(self): - """_create_extension raises an exception if df.index not in transformed_child_table.""" + """_create_extension return None if transformed_child_table can't be indexed by df.""" # Setup data_navigator = mock.MagicMock() modeler = Modeler(data_navigator) @@ -63,10 +63,9 @@ def test__create_extension_wrong_index_return_none(self): # Check assert result is None - @mock.patch('sdv.modeler.Modeler._extension_from_group') + @mock.patch('sdv.modeler.Modeler._create_extension') @mock.patch('sdv.modeler.Modeler.get_foreign_key') - @mock.patch('sdv.modeler.pd.core.groupby.GroupBy.apply') - def test__get_extensions(self, apply_mock, get_foreign_mock, extension_mock): + def test__get_extensions(self, get_foreign_mock, extension_mock): """_get_extensions return the conditional modelling parameters for each children.""" # Setup data_navigator = mock.MagicMock() @@ -82,12 +81,8 @@ def test__get_extensions(self, apply_mock, get_foreign_mock, extension_mock): modeler = Modeler(data_navigator) modeler.tables = {} - extension_mock.side_effect = lambda x, y: y[1] + extension_mock.side_effect = lambda x, y, z: None - apply_mock.side_effect = lambda x: pd.DataFrame([{ - '{}_column_1'.format(x): 1, - '{}_column_2'.format(x): 2 - }]) get_foreign_mock.return_value = 'foreign_key' pk = 'primary_key' @@ -178,7 +173,7 @@ def test_flatten_model(self): result = Modeler.flatten_model(model) # Check - assert (result == 
+        assert np.isclose(result, expected_result).all()
 
     def test_impute_table(self):
         """impute_table fills all NaN values with 0 or the mean of values."""

From 4fce91893b7910a267b49165c2f0ac860b86e950 Mon Sep 17 00:00:00 2001
From: Manuel Alvarez
Date: Wed, 30 Jan 2019 15:09:43 +0100
Subject: [PATCH 10/11] Update docstrings, remove unused methods, raise errors
 when required.

---
 sdv/modeler.py            | 125 +++++++++++++++++++++++---------------
 sdv/sampler.py            |   3 +-
 tests/sdv/test_modeler.py |  25 +++++---
 3 files changed, 92 insertions(+), 61 deletions(-)

diff --git a/sdv/modeler.py b/sdv/modeler.py
index 1e4dfbed1..2d2c9fa9e 100644
--- a/sdv/modeler.py
+++ b/sdv/modeler.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd
 from copulas import get_qualified_name
-from copulas.multivariate import GaussianMultivariate
+from copulas.multivariate import GaussianMultivariate, TreeTypes
 from copulas.univariate import GaussianUnivariate
 
 # Configure logger
@@ -14,27 +14,52 @@
 DEFAULT_DISTRIBUTION = GaussianUnivariate
 IGNORED_DICT_KEYS = ['fitted', 'distribution', 'type']
 
+MODELLING_ERROR_MESSAGE = (
+    'There was an error while trying to model the database. If you are using a custom '
+    'distribution or model, please try again using the default ones. If the problem persists, '
+    'please report it here: https://github.com/HDI-Project/SDV/issues'
+)
+
 
 class Modeler:
-    """Class responsible for modeling database."""
+    """Class responsible for modeling database.
+
+    Args:
+        data_navigator (DataNavigator): object for the dataset.
+        model (type): Class of model to use.
+        distribution (type): Class of distribution to use. Will be deprecated shortly.
+        model_kwargs (dict): Keyword arguments to pass to model.
+    """
 
     DEFAULT_PRIMARY_KEY = 'GENERATED_PRIMARY_KEY'
 
-    def __init__(self, data_navigator, model=DEFAULT_MODEL, distribution=DEFAULT_DISTRIBUTION):
+    def __init__(self, data_navigator, model=DEFAULT_MODEL, distribution=None, model_kwargs=None):
         """Instantiates a modeler object.
-
-        Args:
-            data_navigator (DataNavigator): object for the dataset.
-            transformed_data (dict): transformed tables {table_name:dataframe}.
-            model (type): Class of model to use.
-            distribution (type): Class of model to use.
-        """
+        """
         self.tables = {}
         self.models = {}
         self.child_locs = {}  # maps table->{child: col #}
         self.dn = data_navigator
         self.model = model
-        self.distribution = distribution
+
+        if distribution and model != DEFAULT_MODEL:
+            raise ValueError(
+                '`distribution` argument is only supported for `GaussianMultivariate` model.')
+
+        if distribution:
+            distribution = get_qualified_name(distribution)
+        else:
+            distribution = get_qualified_name(DEFAULT_DISTRIBUTION)
+
+        if not model_kwargs:
+            if model == DEFAULT_MODEL:
+                model_kwargs = {'distribution': distribution}
+
+            else:
+                model_kwargs = {'vine_type': TreeTypes.REGULAR}
+
+        self.model_kwargs = model_kwargs
 
     def save(self, file_name):
         """Saves model to file destination.
@@ -69,6 +94,10 @@ def _flatten_array(cls, nested, prefix=''):
 
         Args:
             nested (list, np.array): Iterable to flatten.
+            prefix (str): Prefix to prepend to the array indices.
+
+        Returns:
+            dict
         """
         result = {}
         for index in range(len(nested)):
@@ -89,21 +118,12 @@ def _flatten_dict(cls, nested, prefix=''):
         This method returns a flatten version of a dictionary, concatenating key names with
         double underscores, that is:
 
-        >>> nested_dict = {
-            'my_key':{
-                'a': 1,
-                'b': 2
-            }
-        }
-        >>> Modeler.flatten_nested_dict(nested_dict)
-        {
-            'my_key__a': 1,
-            'my_key__b': 2
-        }
-
         Args:
             nested (dict): Original dictionary to flatten.
             prefix (str): Prefix to append to key name
+
+        Returns:
+            dict: Flattened dictionary. That is, all its keys hold a primitive value.
         """
         result = {}
 
@@ -130,7 +150,7 @@ def flatten_model(cls, model, name=''):
 
         Args:
             model(self.model): Instance of model.
-            name (str):
+            name (str): Prefix to the parameter name.
 
         Returns:
             pd.Series: parameters for model
@@ -160,7 +180,7 @@ def impute_table(table):
         """Fill in any NaN values in a table.
 
         Args:
-            table(pandas.DataFrame):
+            table(pandas.DataFrame): Table whose NaN values will be filled in.
 
         Returns:
             pandas.DataFrame
@@ -184,9 +204,9 @@ def fit_model(self, data):
             data (pandas.DataFrame): Data to train the model with.
 
         Returns:
-            GaussianMultivariate: Fitted model.
+            model: Instance of self.model fitted with data.
         """
-        model = self.model(distribution=get_qualified_name(self.distribution))
+        model = self.model(**self.model_kwargs)
         model.fit(data)
 
         return model
@@ -201,9 +221,7 @@ def _create_extension(self, foreign, transformed_child_table, table_info):
             table_info (tuple(str, str)): foreign_key and child table names.
 
         Returns:
-            pd.DataFrame : Parameter extension
-
-
+            pd.Series : Parameter extension
         """
         foreign_key, child_name = table_info
 
@@ -215,23 +233,24 @@ def _create_extension(self, foreign, transformed_child_table, table_info):
             return None
 
         clean_df = self.impute_table(conditional_data)
-
-        # if min([len(clean_df[column].unique()) for column in clean_df.columns]) == 1:
-        #     return None
         return self.flatten_model(self.fit_model(clean_df), child_name)
 
-    def _extension_from_group(self, transformed_child_table, table_info):
-        """Wrapper around _create_extension to use it with pd.DataFrame.apply."""
-        def f(group):
-            return self._create_extension(group, transformed_child_table, table_info)
-        return f
+    def _get_extensions(self, pk, children):
+        """Generate list of extension for child tables.
 
-    def _get_extensions(self, pk, children, table_name):
-        """Generate list of extension for child tables."""
-        extensions = []
+        Args:
+            pk (str): Name of the primary_key column in the parent table.
+            children (set[str]): Names of the children.
+
+        Returns: list(pandas.DataFrame)
 
-        # make sure child_locs has value for table name
-        self.child_locs[table_name] = self.child_locs.get(table_name, {})
+        Each element of the list is generated for one single child.
+        That dataframe should have as index.name the `foreign_key` name, and as index
+        its values.
+        The values for a given index are generated by flattening a model fitted with the
+        data related to that index in the child table.
+        """
+        extensions = []
 
         # find children that ref primary key
         for child in children:
@@ -275,13 +294,15 @@ def CPA(self, table):
     def CPA(self, table):
         """Run CPA algorithm on a table.
 
-        Conditional Parameter Aggregation. It will take the tab
+        Conditional Parameter Aggregation. It will take the table's children and generate
+        extensions (parameters from modelling the related children for each foreign key)
+        and merge them into the original `table`.
 
         Args:
             table (string): name of table.
 
         Returns:
-            None:
+            None
         """
         logger.info('Modeling %s', table)
         # Grab table
@@ -295,7 +316,7 @@ def CPA(self, table):
 
         # start with transformed table
         extended_table = self.dn.transformed_data[table]
-        extensions = self._get_extensions(pk, children, table)
+        extensions = self._get_extensions(pk, children)
 
         # add extensions
         for extension in extensions:
@@ -318,12 +339,16 @@ def RCPA(self, table):
 
     def model_database(self):
         """Use RCPA and store model for database."""
-        for table in self.dn.tables:
-            if not self.dn.get_parents(table):
-                self.RCPA(table)
+        try:
+            for table in self.dn.tables:
+                if not self.dn.get_parents(table):
+                    self.RCPA(table)
+
+            for table in self.tables:
+                clean_table = self.impute_table(self.tables[table])
+                self.models[table] = self.fit_model(clean_table)
 
-        for table in self.tables:
-            clean_table = self.impute_table(self.tables[table])
-            self.models[table] = self.fit_model(clean_table)
+        except (ValueError, np.linalg.linalg.LinAlgError):
+            raise ValueError(MODELLING_ERROR_MESSAGE)
 
         logger.info('Modeling Complete')
diff --git a/sdv/sampler.py b/sdv/sampler.py
index 8e176e3f2..d87b08f15 100644
--- a/sdv/sampler.py
+++ b/sdv/sampler.py
@@ -234,11 +234,12 @@ def unflatten_model(self, parent_row, table_name, parent_name):
         model_dict = self._unflatten_dict(flat_parameters, table_name)
 
         model_name = get_qualified_name(self.modeler.model)
-        distribution_name = get_qualified_name(self.modeler.distribution)
+
         model_dict['fitted'] = True
         model_dict['type'] = model_name
 
         if model_name == GAUSSIAN_COPULA:
+            distribution_name = self.modeler.model_kwargs['distribution']
             model_dict['distribution'] = distribution_name
 
             for key in model_dict['distribs']:
                 model_dict['distribs'][key].update({
diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py
index 02b933ca6..28a23682c 100644
--- a/tests/sdv/test_modeler.py
+++ b/tests/sdv/test_modeler.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import pandas as pd
-from copulas.multivariate import GaussianMultivariate
+from copulas.multivariate import GaussianMultivariate, VineCopula
 from copulas.univariate.kde import KDEUnivariate
 
 from sdv.data_navigator import CSVDataLoader, Table
@@ -86,7 +86,6 @@ def test__get_extensions(self, get_foreign_mock, extension_mock):
         get_foreign_mock.return_value = 'foreign_key'
 
         pk = 'primary_key'
-        table_name = 'table_name'
         children = ['first_children', 'second_children']
 
         expected_result = [
@@ -101,7 +100,7 @@ def test__get_extensions(self, get_foreign_mock, extension_mock):
         ]
 
         # Run
-        result = modeler._get_extensions(pk, children, table_name)
+        result = modeler._get_extensions(pk, children)
 
         # Check
         assert all([result[index].equals(expected_result[index]) for index in range(len(result))])
@@ -111,18 +110,17 @@ def test_get_extensions_no_children(self):
         # Setup
         pk = 'primary_key'
         children = {}
-        table_name = 'table_name'
 
         expected_result = []
 
         # Run
-        result = self.modeler._get_extensions(pk, children, table_name)
+        result = self.modeler._get_extensions(pk, children)
 
         # Check
         assert result == expected_result
 
     def test_CPA(self):
-        """ """
+        """CPA will append extensions to the original table."""
         # Setup
         self.modeler.model_database()
         table_name = 'DEMO_CUSTOMERS'
@@ -228,6 +226,8 @@ def test_fit_model_distribution_arg(self):
         """fit_model will pass self.distribution FQN to modeler."""
         # Setup
         model_mock = mock.MagicMock()
+        model_mock.__eq__.return_value = True
+        model_mock.__ne__.return_value = False
         modeler = Modeler(data_navigator='navigator', model=model_mock, distribution=KDEUnivariate)
         data = pd.DataFrame({
             'column': [0, 1, 1,
1, 0], @@ -239,16 +239,21 @@ def test_fit_model_distribution_arg(self): # Check model_mock.assert_called_once_with(distribution='copulas.univariate.kde.KDEUnivariate') - def test_model_database_distribution_arg(self): - """model_database will use self.distribution to model tables.""" + def test_model_database_kde_distribution(self): + """model_database works fine with kde distribution.""" # Setup modeler = Modeler(data_navigator=self.dn, distribution=KDEUnivariate) # Run modeler.model_database() - # Check - assert True + def test_model_database_vine_modeler(self): + """model_database works fine with vine modeler.""" + # Setup + modeler = Modeler(data_navigator=self.dn, model=VineCopula) + + # Run + modeler.model_database() def test__flatten_dict_flat_dict(self): """_flatten_dict don't modify flat dicts.""" From c811c0e6b94ffd5dfbff2a7e1e637310762e502b Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Tue, 5 Feb 2019 11:52:38 +0100 Subject: [PATCH 11/11] Update README snippets --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 31805193c..8e775ad7e 100644 --- a/README.md +++ b/README.md @@ -248,14 +248,14 @@ The modeler can also be saved to a file using the `save()` method. This will sav on the specified path. ```python ->>> modeler.save('models/demo_model.pkl') +>>> modeler.save('demo_model.pkl') ``` If you have stored a model in a previous session using the command above, you can load the model using the `load()` method: ```python ->>> modeler = Modeler.load('models/demo_model.pkl') +>>> modeler = Modeler.load('demo_model.pkl') ``` ### Using the Sampler
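
Taken together, these patches change how a non-default distribution or multivariate model is passed to the `Modeler`. The sketch below mirrors what `test_model_database_kde_distribution` and `test_model_database_vine_modeler` exercise. It is only an illustration of the intended API, not part of the patches: the metadata path is a placeholder, and the `CSVDataLoader` / `load_data` / `transform_data` calls are assumed to behave as in the project README.

```python
from copulas.multivariate import VineCopula
from copulas.univariate.kde import KDEUnivariate

from sdv.data_navigator import CSVDataLoader
from sdv.modeler import Modeler

# Placeholder path: point this at the demo metadata shipped with the test suite.
data_navigator = CSVDataLoader('tests/data/meta.json').load_data()
data_navigator.transform_data()

# Default GaussianMultivariate model with a non-default univariate distribution.
kde_modeler = Modeler(data_navigator, distribution=KDEUnivariate)
kde_modeler.model_database()

# Non-default multivariate model; combining `model=VineCopula` with `distribution`
# would raise the new ValueError, so only the model is passed here.
vine_modeler = Modeler(data_navigator, model=VineCopula)
vine_modeler.model_database()
```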
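The flattening convention these patches build on, visible in the `covariance__0__0`-style keys used by the tests, can also be tried in isolation. The following snippet is a sketch only: the nested dictionary is made up rather than taken from a real fitted copula, and it calls the private `Modeler._flatten_dict` classmethod directly, which is an internal detail.

```python
from sdv.modeler import Modeler

# Made-up stand-in for a model.to_dict() result; real fitted models carry more keys.
nested = {
    'covariance': [[1.0, 0.0], [0.0, 1.0]],
    'distribs': {'amount': {'mean': 0.0, 'std': 1.0, 'fitted': True}},
    'type': 'copulas.multivariate.gaussian.GaussianMultivariate',
}

flat = Modeler._flatten_dict(nested)
# Nested keys and array indices are joined with double underscores; 'type' and the
# nested 'fitted' flag are skipped because they appear in IGNORED_DICT_KEYS:
# {'covariance__0__0': 1.0, 'covariance__0__1': 0.0,
#  'covariance__1__0': 0.0, 'covariance__1__1': 1.0,
#  'distribs__amount__mean': 0.0, 'distribs__amount__std': 1.0}
```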