diff --git a/README.md b/README.md
index 31805193c..8e775ad7e 100644
--- a/README.md
+++ b/README.md
@@ -248,14 +248,14 @@ The modeler can also be saved to a file using the `save()` method. This will sav
 on the specified path.
 
 ```python
->>> modeler.save('models/demo_model.pkl')
+>>> modeler.save('demo_model.pkl')
 ```
 
 If you have stored a model in a previous session using the command above, you can
 load the model using the `load()` method:
 
 ```python
->>> modeler = Modeler.load('models/demo_model.pkl')
+>>> modeler = Modeler.load('demo_model.pkl')
 ```
 
 ### Using the Sampler
diff --git a/sdv/modeler.py b/sdv/modeler.py
index 069d5bffb..2d2c9fa9e 100644
--- a/sdv/modeler.py
+++ b/sdv/modeler.py
@@ -1,8 +1,10 @@
 import logging
 import pickle
 
+import numpy as np
 import pandas as pd
-from copulas.multivariate import GaussianMultivariate
+from copulas import get_qualified_name
+from copulas.multivariate import GaussianMultivariate, TreeTypes
 from copulas.univariate import GaussianUnivariate
 
 # Configure logger
@@ -10,28 +12,54 @@
 DEFAULT_MODEL = GaussianMultivariate
 DEFAULT_DISTRIBUTION = GaussianUnivariate
 
+IGNORED_DICT_KEYS = ['fitted', 'distribution', 'type']
+
+MODELLING_ERROR_MESSAGE = (
+    'There was an error while trying to model the database. If you are using a custom '
+    'distribution or model, please try again using the default ones. If the problem '
+    'persists, please report it here: https://github.com/HDI-Project/SDV/issues'
+)
+
 
 class Modeler:
-    """Class responsible for modeling database."""
+    """Class responsible for modeling database.
+
+    Args:
+        data_navigator (DataNavigator): object for the dataset.
+        model (type): Class of model to use.
+        distribution (type): Class of distribution to use. Will be deprecated shortly.
+        model_kwargs (dict): Keyword arguments to pass to model.
+    """
 
     DEFAULT_PRIMARY_KEY = 'GENERATED_PRIMARY_KEY'
 
-    def __init__(self, data_navigator, model=DEFAULT_MODEL, distribution=DEFAULT_DISTRIBUTION):
+    def __init__(self, data_navigator, model=DEFAULT_MODEL, distribution=None, model_kwargs=None):
         """Instantiates a modeler object.
 
-        Args:
-            data_navigator (DataNavigator): object for the dataset.
-            transformed_data (dict): transformed tables {table_name:dataframe}.
-            model (type): Class of model to use.
-            distribution (type): Class of model to use.
         """
         self.tables = {}
         self.models = {}
         self.child_locs = {}  # maps table->{child: col #}
         self.dn = data_navigator
         self.model = model
-        self.distribution = distribution
+
+        if distribution and model != DEFAULT_MODEL:
+            raise ValueError(
+                '`distribution` argument is only supported for `GaussianMultivariate` model.')
+
+        if distribution:
+            distribution = get_qualified_name(distribution)
+        else:
+            distribution = get_qualified_name(DEFAULT_DISTRIBUTION)
+
+        if not model_kwargs:
+            if model == DEFAULT_MODEL:
+                model_kwargs = {'distribution': distribution}
+
+            else:
+                model_kwargs = {'vine_type': TreeTypes.REGULAR}
+
+        self.model_kwargs = model_kwargs
 
     def save(self, file_name):
         """Saves model to file destination.
@@ -60,27 +88,81 @@ def get_pk_value(self, pk, index, mapping):
 
         return val
 
-    def flatten_model(self, model):
+    @classmethod
+    def _flatten_array(cls, nested, prefix=''):
+        """Return a dictionary with the values of the given nested array.
+
+        Args:
+            nested (list, np.array): Iterable to flatten.
+            prefix (str): Name to append to the array indices.
+
+        Returns:
+            dict
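+
+        Example:
+            An illustrative call (values chosen only for this sketch):
+
+            >>> Modeler._flatten_array([['a', 'b'], ['c']], prefix='x')
+            {'x__0__0': 'a', 'x__0__1': 'b', 'x__1__0': 'c'}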
+        """
+        result = {}
+        for index in range(len(nested)):
+            prefix_key = '__'.join([prefix, str(index)]) if len(prefix) else str(index)
+
+            if isinstance(nested[index], (list, np.ndarray)):
+                result.update(cls._flatten_array(nested[index], prefix=prefix_key))
+
+            else:
+                result[prefix_key] = nested[index]
+
+        return result
+
+    @classmethod
+    def _flatten_dict(cls, nested, prefix=''):
+        """Return a flattened dict from a nested one.
+
+        This method returns a flattened version of the given dictionary, concatenating
+        nested key names with double underscores and skipping the keys listed in
+        `IGNORED_DICT_KEYS`.
+
+        Args:
+            nested (dict): Original dictionary to flatten.
+            prefix (str): Prefix to append to key name.
+
+        Returns:
+            dict: Flattened dictionary. That is, all its keys hold a primitive value.
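+
+        Example:
+            An illustrative call (values chosen only for this sketch; note that
+            keys listed in `IGNORED_DICT_KEYS` are dropped):
+
+            >>> Modeler._flatten_dict({'fitted': True, 'a': {'b': 1}, 'c': [1, 2]})
+            {'a__b': 1, 'c__0': 1, 'c__1': 2}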
+        """
+        result = {}
+
+        for key in nested.keys():
+            prefix_key = '__'.join([prefix, str(key)]) if len(prefix) else key
+
+            if key in IGNORED_DICT_KEYS:
+                continue
+
+            elif isinstance(nested[key], dict):
+                result.update(cls._flatten_dict(nested[key], prefix_key))
+
+            elif isinstance(nested[key], (np.ndarray, list)):
+                result.update(cls._flatten_array(nested[key], prefix_key))
+
+            else:
+                result[prefix_key] = nested[key]
+
+        return result
+
+    @classmethod
+    def flatten_model(cls, model, name=''):
         """Flatten a model's parameters into an array.
 
         Args:
-            model: a model object
+            model (self.model): Instance of model.
+            name (str): Prefix to the parameter name.
 
         Returns:
             pd.Series: parameters for model
         """
-        params = list(model.covariance.flatten())
-        for col_model in model.distribs.values():
-            params.extend([col_model.std, col_model.mean])
-
-        return pd.Series(params)
+        return pd.Series(cls._flatten_dict(model.to_dict(), name))
 
     def get_foreign_key(self, fields, primary):
         """Get foreign key from primary key.
 
         Args:
-            fields (dict): metadata's fields key for a given table.
+            fields (dict): metadata `fields` key for a given table.
             primary (str): Name of primary key in original table.
 
         Return:
@@ -98,7 +180,7 @@ def impute_table(table):
         """Fill in any NaN values in a table.
 
         Args:
-            table(pandas.DataFrame):
+            table (pandas.DataFrame): Table to fill NaN values in.
 
         Returns:
             pandas.DataFrame
@@ -122,46 +204,65 @@
         data (pandas.DataFrame): Data to train the model with.
 
         Returns:
-            GaussianMultivariate: Fitted model.
+            model: Instance of self.model fitted with data.
         """
-        model = self.model()
+        model = self.model(**self.model_kwargs)
         model.fit(data)
 
         return model
 
-    def _create_extension(self, df, transformed_child_table):
-        """Return the flattened model from a dataframe."""
-        # remove column of foreign key
+    def _create_extension(self, foreign, transformed_child_table, table_info):
+        """Return the flattened model from a dataframe.
+
+        Args:
+            foreign (pandas.DataFrame): Rows of the child table whose foreign key
+                holds a given value; its index selects the matching rows in
+                `transformed_child_table`.
+            transformed_child_table (pandas.DataFrame): Transformed child table.
+            table_info (tuple(str, str)): foreign_key and child table names.
+
+        Returns:
+            pd.Series: Parameter extension.
+        """
+
+        foreign_key, child_name = table_info
         try:
-            conditional_data = transformed_child_table.loc[df.index]
+            conditional_data = transformed_child_table.loc[foreign.index].copy()
+            conditional_data = conditional_data.drop(foreign_key, axis=1)
+
         except KeyError:
             return None
 
         clean_df = self.impute_table(conditional_data)
+        return self.flatten_model(self.fit_model(clean_df), child_name)
 
-        return self.flatten_model(self.fit_model(clean_df))
+    def _get_extensions(self, pk, children):
+        """Generate list of extension for child tables.
 
-    def _extension_from_group(self, transformed_child_table):
-        """Wrapper around _create_extension to use it with pd.DataFrame.apply."""
-        def f(group):
-            return self._create_extension(group, transformed_child_table)
-        return f
+        Args:
+            pk (str): Name of the primary_key column in the parent table.
+            children (set[str]): Names of the children.
 
-    def _get_extensions(self, pk, children, table_name):
-        """Generate list of extension for child tables."""
-        # keep track of which columns belong to which child
-        start = 0
-        end = 0
-        extensions = []
+        Returns:
+            list(pandas.DataFrame)
 
-        # make sure child_locs has value for table name
-        self.child_locs[table_name] = self.child_locs.get(table_name, {})
+        Each element of the list is generated for a single child table.
+        Each dataframe has the `foreign_key` name as its index.name and the foreign
+        key values as its index.
+        The row for a given index is generated by flattening a model fitted with
+        the rows related to that index in the child table.
+        """
+        extensions = []
 
         # find children that ref primary key
         for child in children:
             child_table = self.dn.tables[child].data
             child_meta = self.dn.tables[child].meta
 
+            fields = child_meta['fields']
+            fk = self.get_foreign_key(fields, pk)
+
+            if not fk:
+                continue
+
             # check if leaf node
             if not self.dn.get_children(child):
                 transformed_child_table = self.dn.transformed_data[child]
@@ -169,38 +270,39 @@
             else:
                 transformed_child_table = self.tables[child]
 
-            fields = child_meta['fields']
-            fk = self.get_foreign_key(fields, pk)
+            table_info = (fk, '__' + child)
 
-            if not fk:
-                continue
+            foreign_key_values = child_table[fk].unique()
+            parameters = {}
 
-            extension = child_table.groupby(fk)
-            extension = extension.apply(self._extension_from_group(transformed_child_table))
+            for foreign_key in foreign_key_values:
+                foreign_index = child_table[child_table[fk] == foreign_key]
+                parameter = self._create_extension(
+                    foreign_index, transformed_child_table, table_info)
 
-            if len(extension):
-                # keep track of child column indices
-                end = max(end, start + extension.shape[1])
+                if parameter is not None:
+                    parameters[foreign_key] = parameter.to_dict()
 
-                self.child_locs[table_name][child] = (start, end)
+            extension = pd.DataFrame(parameters).T
+            extension.index.name = fk
 
-                # rename columns
-                extension.columns = range(start, end)
+            if len(extension):
                 extensions.append(extension)
-                start = end
 
         return extensions
 
     def CPA(self, table):
         """Run CPA algorithm on a table.
 
-        Conditional Parameter Aggregation. It will take the tab
+        Conditional Parameter Aggregation. It will take the table's children, generate
+        extensions (parameters from modelling the related children rows for each foreign
+        key value) and merge them into the original `table`.
 
         Args:
             table (string): name of table.
 
         Returns:
-            None:
+            None
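+
+        Example:
+            Illustrative naming only (table and field names are invented for this
+            sketch): a parent table with primary key `user_id` and a child table
+            `orders` would gain extension columns such as
+            `__orders__covariance__0__0` and `__orders__distribs__amount__mean`.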
         """
         logger.info('Modeling %s', table)
         # Grab table
@@ -214,7 +316,7 @@
 
         # start with transformed table
         extended_table = self.dn.transformed_data[table]
-        extensions = self._get_extensions(pk, children, table)
+        extensions = self._get_extensions(pk, children)
 
         # add extensions
         for extension in extensions:
@@ -237,12 +339,16 @@ def RCPA(self, table):
 
     def model_database(self):
         """Use RCPA and store model for database."""
-        for table in self.dn.tables:
-            if not self.dn.get_parents(table):
-                self.RCPA(table)
+        try:
+            for table in self.dn.tables:
+                if not self.dn.get_parents(table):
+                    self.RCPA(table)
+
+            for table in self.tables:
+                clean_table = self.impute_table(self.tables[table])
+                self.models[table] = self.fit_model(clean_table)
 
-        for table in self.tables:
-            clean_table = self.impute_table(self.tables[table])
-            self.models[table] = self.fit_model(clean_table)
+        except (ValueError, np.linalg.LinAlgError):
+            raise ValueError(MODELLING_ERROR_MESSAGE)
 
         logger.info('Modeling Complete')
diff --git a/sdv/sampler.py b/sdv/sampler.py
index 32124fa24..4246ea40c 100644
--- a/sdv/sampler.py
+++ b/sdv/sampler.py
@@ -5,6 +5,8 @@
 
 import exrex
 
+GAUSSIAN_COPULA = 'copulas.multivariate.gaussian.GaussianMultivariate'
+
 
 class Sampler:
     """Class to sample data from a model."""
@@ -13,7 +15,6 @@ def __init__(self, data_navigator, modeler):
         """Instantiate a new object."""
         self.dn = data_navigator
         self.modeler = modeler
-        self.been_sampled = set()  # table_name -> if already sampled
         self.sampled = {}  # table_name -> [(primary_key, generated_row)]
         self.primary_key = {}
 
@@ -78,6 +79,7 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows):
 
         # filter out parameters
         labels = list(self.dn.tables[table_name].data)
+
         synthesized_rows = self._fill_text_columns(synthesized_rows, labels, table_name)
 
         # reverse transform data
@@ -103,6 +105,149 @@ def _get_parent_row(self, table_name):
 
         return random_parent, foreign_key, parent_row
 
+    @staticmethod
+    def generate_keys(prefix=''):
+        """Return a function that joins the non-null values of a row with `__`."""
+        def f(row):
+            parts = [str(row[key]) for key in row.keys() if row[key] is not None]
+            if prefix:
+                parts = [prefix] + parts
+
+            return '__'.join(parts)
+
+        return f
+
+    @classmethod
+    def _get_sorted_keys(cls, _dict):
+        """Sort the keys of a flattened dict, grouping them by their first prefix
+        and ordering numeric segments numerically where possible."""
+        result = []
+        keys = list(_dict.keys())
+
+        if not keys:
+            return []
+
+        serie = pd.Series(keys)
+        df = pd.DataFrame(serie.str.split('__').values.tolist())
+        uniques = df[0].unique()
+
+        for value in uniques:
+            index = df[df[0] == value].index
+            _slice = df.loc[index, range(1, df.shape[1])].copy()
+
+            try:
+                for column in _slice.columns:
+                    _slice[column] = _slice[column].astype(int)
+
+            except (ValueError, TypeError):
+                pass
+
+            df.drop(index, inplace=True)
+            _slice = _slice.sort_values(list(range(1, df.shape[1])))
+            result += _slice.apply(cls.generate_keys(value), axis=1).values.tolist()
+
+        df = df.sort_values(list(range(df.shape[1])))
+        result += df.apply(cls.generate_keys(), axis=1).values.tolist()
+
+        return result
+
+    def _unflatten_dict(self, flat, table_name=''):
+        """Transform a flattened dict into its original form.
+
+        Works in the exact opposite way to `sdv.Modeler._flatten_dict`.
+
+        Args:
+            flat (dict): Flattened dict.
+            table_name (str): Name of the table the params belong to; used to keep
+                children table prefixes together while walking the keys.
+
+        Returns:
+            dict: Unflattened dict.
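+
+        Example:
+            An illustrative call (values chosen only for this sketch, assuming
+            `sampler` is a `Sampler` instance):
+
+            >>> sampler._unflatten_dict({'a__0': 1, 'a__1': 2, 'b__x': 0})
+            {'a': [1, 2], 'b': {'x': 0}}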
+        """
+        result = {}
+        children = self.dn.get_children(table_name)
+        keys = self._get_sorted_keys(flat)
+
+        for key in keys:
+            path = key.split('__')
+
+            if any(['__{}__'.format(child) in key for child in children]):
+                path = [
+                    path[0],
+                    '__'.join(path[1: -1]),
+                    path[-1]
+                ]
+
+            value = flat[key]
+            walked = result
+            for step, name in enumerate(path):
+
+                if isinstance(walked, dict) and name in walked:
+                    walked = walked[name]
+                    continue
+
+                elif isinstance(walked, list) and len(walked) and len(walked) - 1 >= int(name):
+                    walked = walked[int(name)]
+                    continue
+
+                else:
+                    if name.isdigit():
+                        name = int(name)
+
+                    if step == len(path) - 1:
+                        if isinstance(walked, list):
+                            walked.append(value)
+                        else:
+                            walked[name] = value
+
+                    else:
+                        next_step = path[step + 1]
+                        if next_step.isdigit():
+                            if isinstance(name, int):
+                                walked.append([])
+                                while len(walked) < name + 1:
+                                    walked.append([])
+
+                            else:
+                                walked[name] = []
+
+                            walked = walked[name]
+
+                        else:
+                            if isinstance(name, int):
+                                walked.append({})
+                            else:
+                                walked[name] = {}
+
+                            walked = walked[name]
+
+        return result
+
+    def unflatten_model(self, parent_row, table_name, parent_name):
+        """Take the params from a generated parent row and create a model from it.
+
+        Args:
+            parent_row (dataframe): a generated parent row
+            table_name (string): name of table to make model for
+            parent_name (string): name of parent table
+
+        Returns:
+            A fitted instance of `self.modeler.model`, rebuilt via `from_dict`.
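+
+        Example:
+            Illustrative behaviour (table name invented for this sketch): with
+            `table_name='orders'`, the columns of `parent_row` starting with
+            `__orders__` are selected, the prefix is stripped, and the resulting
+            dict is unflattened and passed to `self.modeler.model.from_dict`.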
+        """
+
+        prefix = '__{}__'.format(table_name)
+        columns = [column for column in parent_row.columns if column.startswith(prefix)]
+        new_columns = {column: column.replace(prefix, '') for column in columns}
+        flat_parameters = parent_row.loc[:, columns]
+        flat_parameters = flat_parameters.rename(columns=new_columns).to_dict('records')[0]
+
+        model_dict = self._unflatten_dict(flat_parameters, table_name)
+        model_name = get_qualified_name(self.modeler.model)
+
+        model_dict['fitted'] = True
+        model_dict['type'] = model_name
+
+        if model_name == GAUSSIAN_COPULA:
+            distribution_name = self.modeler.model_kwargs['distribution']
+            model_dict['distribution'] = distribution_name
+            for key in model_dict['distribs']:
+                model_dict['distribs'][key].update({
+                    'fitted': True,
+                    'type': distribution_name
+                })
+
+        return self.modeler.model.from_dict(model_dict)
+
     def sample_rows(self, table_name, num_rows):
         """Sample specified number of rows for specified table.
 
@@ -123,11 +268,10 @@
             parent_row = parent_row.loc[[0]]
 
             # get parameters from parent to make model
-            model = self._make_model_from_params(
-                parent_row, table_name, random_parent)
+            model = self.unflatten_model(parent_row, table_name, random_parent)
 
             # sample from that model
-            if model is not None and len(model.distribs) > 0:
+            if model is not None and model.fitted:
                 synthesized_rows = model.sample(num_rows)
             else:
                 raise ValueError(
@@ -146,7 +290,7 @@
 
         else:  # there is no parent
             model = self.modeler.models[table_name]
-            if len(model.distribs):
+            if model.fitted:
                 synthesized_rows = model.sample(num_rows)
             else:
                 raise ValueError(
@@ -168,6 +312,30 @@ def sample_table(self, table_name):
         num_rows = self.dn.tables[table_name].data.shape[0]
         return self.sample_rows(table_name, num_rows)
 
+    def _sample_child_rows(self, parent_name, parent_row, sampled_data, num_rows=5):
+        """Use parameters from parent row to synthesize child rows.
+
+        Args:
+            parent_name (str): name of parent table
+            parent_row (dataframe): synthesized parent row
+            sampled_data (dict): maps table name to sampled data
+            num_rows (int): number of rows to synthesize per parent row
+
+        Returns:
+            None. Synthesized child rows are stored into `sampled_data`.
+        """
+
+        children = self.dn.get_children(parent_name)
+        for child in children:
+            rows = self.sample_rows(child, num_rows)
+
+            if child in sampled_data:
+                sampled_data[child] = pd.concat([sampled_data[child], rows])
+            else:
+                sampled_data[child] = rows
+
+            self._sample_child_rows(child, rows.iloc[0:1, :], sampled_data)
+
     def sample_all(self, num_rows=5):
         """Samples the entire database.
 
@@ -184,6 +352,7 @@
         This is this way because the children tables are created modelling the relation
         thet have with their parent tables, so it's behavior may change from one table to
         another.
         """
+
         tables = self.dn.tables
 
         sampled_data = {}
@@ -201,84 +370,6 @@
 
         return self.reset_indices_tables(sampled_data)
 
-    def _sample_child_rows(self, parent_name, parent_row, sampled_data, num_rows=5):
-        """Uses parameters from parent row to synthesize child rows.
-
-        Args:
-            parent_name (str): name of parent table
-            parent_row (dataframe): synthesized parent row
-            sample_data (dict): maps table name to sampled data
-            num_rows (int): number of rows to synthesize per parent row
-
-        Returns:
-            synthesized children rows
-        """
-        children = self.dn.get_children(parent_name)
-        for child in children:
-            rows = self.sample_rows(child, num_rows)
-
-            if child in sampled_data:
-                sampled_data[child] = pd.concat([sampled_data[child], rows])
-            else:
-                sampled_data[child] = rows
-
-            self._sample_child_rows(child, rows.iloc[0:1, :], sampled_data)
-
-    def _make_model_from_params(self, parent_row, table_name, parent_name):
-        """ Takes the params from a generated parent row and creates a model from it.
-
-        Args:
-            parent_row (dataframe): a generated parent row
-            table_name (string): name of table to make model for
-            parent_name (string): name of parent table
-        """
-        # get parameters
-        child_range = self.modeler.child_locs.get(parent_name, {}).get(table_name, {})
-
-        if not child_range:
-            return None
-
-        param_indices = list(range(child_range[0], child_range[1]))
-        params = parent_row.loc[:, param_indices]
-        totalcols = params.shape[1]
-        num_cols = self.modeler.tables[table_name].shape[1]
-
-        # get labels for dataframe
-        labels = list(self.modeler.tables[table_name].columns)
-
-        # parent_meta = self.dn.tables[parent_name].meta
-        # fk = parent_meta['primary_key']
-
-        # if fk in labels:
-        #     labels.remove(fk)
-        #     num_cols -= 1
-
-        cov_size = num_cols ** 2
-
-        # Covariance matrix
-        covariance = params.iloc[:, 0:cov_size]
-        covariance = covariance.values.reshape((num_cols, num_cols))
-
-        # Distributions
-        distributions = {}
-        for label_index, i in enumerate(range(cov_size, totalcols, 2)):
-            distributions[labels[label_index]] = {
-                'type': get_qualified_name(self.modeler.distribution),
-                'fitted': True,
-                'std': abs(params.iloc[:, i]),  # Pending for issue
-                'mean': params.iloc[:, i + 1],  # https://github.com/HDI-Project/SDV/issues/58
-            }
-
-        model_params = {
-            'covariance': covariance,
-            'distribs': distributions,
-            'type': get_qualified_name(self.modeler.model),
-            'fitted': True,
-            'distribution': get_qualified_name(self.modeler.distribution)
-        }
-
-        return self.modeler.model.from_dict(model_params)
-
     def _get_table_meta(self, meta, table_name):
         """Return metadata get table meta for a given table name"""
         for table in meta['tables']:
diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py
index 410f78831..28a23682c 100644
--- a/tests/sdv/test_modeler.py
+++ b/tests/sdv/test_modeler.py
@@ -1,9 +1,11 @@
-from unittest import TestCase
+from unittest import TestCase, mock
 
 import numpy as np
 import pandas as pd
+from copulas.multivariate import GaussianMultivariate, VineCopula
+from copulas.univariate.kde import KDEUnivariate
 
-from sdv.data_navigator import CSVDataLoader
+from sdv.data_navigator import CSVDataLoader, Table
 from sdv.modeler import Modeler
 
 
@@ -19,55 +21,106 @@ def setUp(self):
 
     def test__create_extension(self):
         """Tests that the create extension method returns correct parameters."""
         # Setup
-        child_table = self.dn.get_data('DEMO_ORDERS')
-        user = child_table[child_table['CUSTOMER_ID'] == 50]
-        expected = pd.Series([
-            1.500000e+00, 0.000000e+00, -1.269991e+00,
-            0.000000e+00, 0.000000e+00, 0.000000e+00,
-            -1.269991e+00, 0.000000e+00, 1.500000e+00,
-            0.000000e+00, 0.000000e+00, -7.401487e-17,
-            1.000000e+00, 7.000000e+00, 2.449490e+00,
-            4.000000e+00, 5.000000e+01, 5.000000e+01,
-            1.000000e-03, 5.000000e+01, 7.300000e+02,
-            2.380000e+03, 7.618545e+02, 1.806667e+03
-        ])
+        data_navigator = mock.MagicMock()
+        modeler = Modeler(data_navigator)
+        table = pd.DataFrame({
+            'foreign': [0, 1, 0, 1, 0, 1],
+            'a': [0, 1, 0, 1, 0, 1],
+            'b': [1, 2, 3, 4, 5, 6]
+        })
+        group = table[table.a == 0]
+        table_info = ('foreign', '')
+
+        expected_result = pd.Series({
+            'covariance__0__0': 0.0,
+            'covariance__0__1': 0.0,
+            'covariance__1__0': 0.0,
+            'covariance__1__1': 1.4999999999999991,
+            'distribs__a__mean': 0.0,
+            'distribs__a__std': 0.001,
+            'distribs__b__mean': 3.0,
+            'distribs__b__std': 1.632993161855452
+        })
+
+        # Run
+        result = modeler._create_extension(group, table, table_info)
+
+        # Check
+        assert result.equals(expected_result)
+
+    def test__create_extension_wrong_index_return_none(self):
+        """_create_extension returns None if transformed_child_table can't be indexed by df."""
+        # Setup
+        data_navigator = mock.MagicMock()
+        modeler = Modeler(data_navigator)
+        transformed_child_table = pd.DataFrame(np.eye(3), columns=['A', 'B', 'C'])
+        table_info = ('', '')
+        df = pd.DataFrame(index=range(5, 10))
 
         # Run
-        parameters = self.modeler._create_extension(user, child_table)
+        result = modeler._create_extension(df, transformed_child_table, table_info)
 
         # Check
-        assert expected.subtract(parameters).all() < 10E-3
+        assert result is None
 
-    def test__get_extensions(self):
-        """_get_extensions returns a works for table with child"""
+    @mock.patch('sdv.modeler.Modeler._create_extension')
+    @mock.patch('sdv.modeler.Modeler.get_foreign_key')
+    def test__get_extensions(self, get_foreign_mock, extension_mock):
+        """_get_extensions returns the conditional modelling parameters for each child."""
         # Setup
-        pk = 'ORDER_ID'
-        table = 'DEMO_ORDERS'
-        children = self.dn.get_children(table)
+        data_navigator = mock.MagicMock()
+
+        first_table_data = pd.DataFrame({'foreign_key': [0, 1]})
+        first_table_meta = {'fields': []}
+
+        data_navigator.tables = {
+            'first_children': Table(first_table_data, first_table_meta),
+            'second_children': Table(first_table_data, first_table_meta),
+        }
+        data_navigator.get_children.return_value = {}
+        modeler = Modeler(data_navigator)
+        modeler.tables = {}
+
+        extension_mock.side_effect = lambda x, y, z: None
+
+        get_foreign_mock.return_value = 'foreign_key'
+
+        pk = 'primary_key'
+        children = ['first_children', 'second_children']
+
+        expected_result = [
+            pd.DataFrame([{
+                '__first_children_column_1': 1,
+                '__first_children_column_2': 2
+            }]),
+            pd.DataFrame([{
+                '__second_children_column_1': 1,
+                '__second_children_column_2': 2
+            }])
+        ]
 
         # Run
-        result = self.modeler._get_extensions(pk, children, table)
+        result = modeler._get_extensions(pk, children)
 
         # Check
-        assert len(result) == 1
-        assert result[0].shape == (10, 35)
+        assert all([result[index].equals(expected_result[index]) for index in range(len(result))])
 
     def test_get_extensions_no_children(self):
-        """Tests that get extensions works for table with no children."""
+        """_get_extensions returns an empty list if children is empty."""
         # Setup
-        pk = 'ORDER_ITEM_ID'
-        table = 'DEMO_ORDER_ITEMS'
-        children = self.dn.get_children(table)
+        pk = 'primary_key'
+        children = {}
+        expected_result = []
 
         # Run
-        result = self.modeler._get_extensions(pk, children, table)
+        result = self.modeler._get_extensions(pk, children)
 
         # Check
         assert result == expected_result
 
     def test_CPA(self):
-        """ """
+        """CPA will append extensions to the original table."""
         # Setup
         self.modeler.model_database()
         table_name = 'DEMO_CUSTOMERS'
@@ -92,29 +145,33 @@ def test_CPA(self):
     def test_flatten_model(self):
         """flatten_model returns a pandas.Series with all the params to recreate a model."""
         # Setup
-        for data in self.dn.transformed_data.values():
-            num_columns = data.shape[1]
-            model = self.modeler.model()
-            model.fit(data)
-
-            # We generate it this way because RDT behavior is not fully deterministic
-            # and transformed data can change between test runs.
-            distribs_values = np.array([
-                [col_model.std, col_model.mean]
-                for col_model in model.distribs.values()
-            ]).flatten()
-
-            expected_result = pd.Series(
-                list(model.covariance.flatten()) +
-                list(distribs_values)
-            )
-
-            # Run
-            result = self.modeler.flatten_model(model)
-
-            # Check
-            assert (result == expected_result).all()
-            assert len(result) == num_columns ** 2 + (2 * num_columns)
+        model = GaussianMultivariate()
+        X = np.eye(3)
+        model.fit(X)
+
+        expected_result = pd.Series({
+            'covariance__0__0': 1.5000000000000004,
+            'covariance__0__1': -0.7500000000000003,
+            'covariance__0__2': -0.7500000000000003,
+            'covariance__1__0': -0.7500000000000003,
+            'covariance__1__1': 1.5000000000000004,
+            'covariance__1__2': -0.7500000000000003,
+            'covariance__2__0': -0.7500000000000003,
+            'covariance__2__1': -0.7500000000000003,
+            'covariance__2__2': 1.5000000000000007,
+            'distribs__0__mean': 0.33333333333333331,
+            'distribs__0__std': 0.47140452079103168,
+            'distribs__1__mean': 0.33333333333333331,
+            'distribs__1__std': 0.47140452079103168,
+            'distribs__2__mean': 0.33333333333333331,
+            'distribs__2__std': 0.47140452079103168
+        })
+
+        # Run
+        result = Modeler.flatten_model(model)
+
+        # Check
+        assert np.isclose(result, expected_result).all()
 
     def test_impute_table(self):
         """impute_table fills all NaN values with 0 or the mean of values."""
@@ -164,3 +221,131 @@ def test_get_foreign_key(self):
 
         # Check
         assert result == expected_result
+
+    def test_fit_model_distribution_arg(self):
+        """fit_model will pass the distribution FQN to the model as a keyword argument."""
+        # Setup
+        model_mock = mock.MagicMock()
+        model_mock.__eq__.return_value = True
+        model_mock.__ne__.return_value = False
+        modeler = Modeler(data_navigator='navigator', model=model_mock, distribution=KDEUnivariate)
+        data = pd.DataFrame({
+            'column': [0, 1, 1, 1, 0],
+        })
+
+        # Run
+        modeler.fit_model(data)
+
+        # Check
+        model_mock.assert_called_once_with(distribution='copulas.univariate.kde.KDEUnivariate')
+
+    def test_model_database_kde_distribution(self):
+        """model_database works fine with kde distribution."""
+        # Setup
+        modeler = Modeler(data_navigator=self.dn, distribution=KDEUnivariate)
+
+        # Run
+        modeler.model_database()
+
+    def test_model_database_vine_modeler(self):
+        """model_database works fine with vine modeler."""
+        # Setup
+        modeler = Modeler(data_navigator=self.dn, model=VineCopula)
+
+        # Run
+        modeler.model_database()
+
+    def test__flatten_dict_flat_dict(self):
+        """_flatten_dict doesn't modify flat dicts."""
+        # Setup
+        nested_dict = {
+            'a': 1,
+            'b': 2
+        }
+        expected_result = {
+            'a': 1,
+            'b': 2
+        }
+
+        # Run
+        result = Modeler._flatten_dict(nested_dict)
+
+        # Check
+        assert result == expected_result
+
+    def test__flatten_dict_nested_dict(self):
+        """_flatten_dict flattens nested dicts respecting the prefixes."""
+        # Setup
+        nested_dict = {
+            'first_key': {
+                'a': 1,
+                'b': 2
+            },
+            'second_key': {
+                'x': 0
+            }
+        }
+
+        expected_result = {
+            'first_key__a': 1,
+            'first_key__b': 2,
+            'second_key__x': 0
+        }
+
+        # Run
+        result = Modeler._flatten_dict(nested_dict)
+
+        # Check
+        assert result == expected_result
+
+    def test__flatten_array_ndarray(self):
+        """_flatten_array returns a dict formed from the input np.array."""
+        # Setup
+        nested = np.array([
+            [1, 0, 0],
+            [0, 1, 0],
+            [0, 0, 1]
+        ])
+        expected_result = {
+            '0__0': 1,
+            '0__1': 0,
+            '0__2': 0,
+            '1__0': 0,
+            '1__1': 1,
+            '1__2': 0,
+            '2__0': 0,
+            '2__1': 0,
+            '2__2': 1
+        }
+
+        # Run
+        result = Modeler._flatten_array(nested)
+
+        # Check
+        assert result == expected_result
+
+    def test__flatten_array_list(self):
+        """_flatten_array returns a dict formed from the input list."""
+        # Setup
+        nested = [
+            [1, 0, 0],
+            [0, 1, 0],
+            [0, 0, 1]
+        ]
+        expected_result = {
+            '0__0': 1,
+            '0__1': 0,
+            '0__2': 0,
+            '1__0': 0,
+            '1__1': 1,
+            '1__2': 0,
+            '2__0': 0,
+            '2__1': 0,
+            '2__2': 1
+        }
+
+        # Run
+        result = Modeler._flatten_array(nested)
+
+        # Check
+        assert result == expected_result
diff --git a/tests/sdv/test_sampler.py b/tests/sdv/test_sampler.py
index 9776046d4..c795ed3cb 100644
--- a/tests/sdv/test_sampler.py
+++ b/tests/sdv/test_sampler.py
@@ -1,4 +1,8 @@
 from unittest import TestCase
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import pandas as pd
 
 from sdv.data_navigator import CSVDataLoader
 from sdv.modeler import Modeler
@@ -54,21 +58,173 @@ def test_sample_rows_children_table(self):
         assert len(unique_foreign_keys) == 1
         assert unique_foreign_keys[0] in sampled_parent['CUSTOMER_ID'].values
 
-    def test_sample_all(self):
+    @patch('sdv.sampler.pd.concat')
+    @patch('sdv.sampler.Sampler.reset_indices_tables')
+    @patch('sdv.sampler.Sampler._sample_child_rows')
+    @patch('sdv.sampler.Sampler.sample_rows')
+    def test_sample_all(self, rows_mock, child_mock, reset_mock, concat_mock):
         """Check sample_all and returns some value."""
+        # Setup
+        data_navigator = MagicMock()
+        data_navigator.tables = ['TABLE_A', 'TABLE_B']
+        data_navigator.get_parents.side_effect = lambda x: x != 'TABLE_A'
+        modeler = MagicMock()
+        sampler = Sampler(data_navigator, modeler)
+
+        def fake_dataframe(name, number):
+            return pd.DataFrame([{name: 0} for i in range(number)], index=[0] * number)
+
+        rows_mock.side_effect = fake_dataframe
+        concat_mock.return_value = 'concatenated_dataframe'
+
+        expected_get_parents_call_list = [(('TABLE_A',), {}), (('TABLE_B',), {})]
+        expected_rows_mock_call_list = [(('TABLE_A', 1), {}) for i in range(5)]
 
         # Run
-        result = self.sampler.sample_all(num_rows=5)
+        result = sampler.sample_all(num_rows=5)
 
         # Check
-        assert result.keys() == self.sampler.dn.tables.keys()
+        assert data_navigator.get_parents.call_args_list == expected_get_parents_call_list
+        assert result == reset_mock.return_value
+
+        assert rows_mock.call_args_list == expected_rows_mock_call_list
+        assert child_mock.call_count == 5
+        reset_mock.assert_called_once_with({'TABLE_A': 'concatenated_dataframe'})
+
+    def test_unflatten_dict(self):
+        """_unflatten_dict restores a flattened dict to its original nested form."""
+        # Setup
+        data_navigator = MagicMock()
+        modeler = MagicMock()
+        sampler = Sampler(data_navigator, modeler)
+        flat = {
+            'first_key__a': 1,
+            'first_key__b': 2,
+            'second_key__x': 0
+        }
+
+        expected_result = {
+            'first_key': {
+                'a': 1,
+                'b': 2
+            },
+            'second_key': {
+                'x': 0
+            }
+        }
 
-        for name, table in result.items():
-            with self.subTest(table=name):
-                raw_data = self.modeler.dn.tables[name].data
-                assert (table.columns == raw_data.columns).all()
+        # Run
+        result = sampler._unflatten_dict(flat)
+
+        # Check
+        assert result == expected_result
+        data_navigator.assert_not_called()
+        modeler.assert_not_called()
+
+    def test_unflatten_dict_mixed_array(self):
+        """_unflatten_dict restructures mixed arrays."""
+        # Setup
+        data_navigator = MagicMock()
+        modeler = MagicMock()
+        sampler = Sampler(data_navigator, modeler)
+        flat = {
+            'first_key__0__0': 1,
+            'first_key__0__1': 0,
+            'first_key__1__0': 0,
+            'first_key__1__1': 1,
+            'second_key__0__std': 0.5,
+            'second_key__0__mean': 0.5,
+            'second_key__1__std': 0.25,
+            'second_key__1__mean': 0.25
+        }
+
+        expected_result = {
+            'first_key': [
+                [1, 0],
+                [0, 1]
+            ],
+            'second_key': [
+                {
+                    'std': 0.5,
+                    'mean': 0.5
+                },
+                {
+                    'std': 0.25,
+                    'mean': 0.25
+                }
+            ]
+        }
+
+        # Run
-                if not self.sampler.dn.get_parents(name):
-                    primary_key = self.sampler.dn.get_meta_data(name)['primary_key']
-                    assert len(table) == 5
-                    assert len(table[primary_key].unique()) == 5
+        result = sampler._unflatten_dict(flat)
+
+        # Check
+        assert result == expected_result
+        data_navigator.assert_not_called()
+        modeler.assert_not_called()
+
+    def test_unflatten_dict_child_name(self):
+        """_unflatten_dict will respect the name of child tables."""
+        # Setup
+        data_navigator = MagicMock()
+        data_navigator.get_children.return_value = ['CHILD_TABLE']
+        modeler = MagicMock()
+        sampler = Sampler(data_navigator, modeler)
+
+        flat = {
+            'first_key__a': 1,
+            'first_key____CHILD_TABLE__model_param': 0,
+            'distribs____CHILD_TABLE__distribs__UNIT_PRICE__std__mean': 0
+        }
+        table_name = 'TABLE_NAME'
+        expected_result = {
+            'first_key': {
+                'a': 1,
+                '__CHILD_TABLE': {
+                    'model_param': 0
+                }
+            },
+            'distribs': {
+                '__CHILD_TABLE__distribs__UNIT_PRICE__std': {
+                    'mean': 0
+                }
+            }
+        }
+
+        # Run
+        result = sampler._unflatten_dict(flat, table_name)
+
+        # Check
+        assert result == expected_result
+        modeler.assert_not_called()
+        data_navigator.get_children.assert_called_once_with('TABLE_NAME')
+
+    def test_unflatten_respect_covariance_matrix(self):
+        """_unflatten_dict restructures the covariance matrix into a square matrix."""
+        # Setup
+        data_navigator = MagicMock()
+        modeler = MagicMock()
+        sampler = Sampler(data_navigator, modeler)
+
+        def fake_values(i, j):
+            return '{}, {}'.format(i, j)
+
+        expected_result = {
+            'covariance': np.array([
+                [fake_values(i, j) for j in range(40)]
+                for i in range(40)
+            ]).tolist()
+        }
+
+        flat = {
+            'covariance__{}__{}'.format(i, j): fake_values(i, j)
+            for i in range(40) for j in range(40)
+        }
+        table_name = 'TABLE_NAME'
+
+        # Run
+        result = sampler._unflatten_dict(flat, table_name)
+
+        # Check
+        assert result == expected_result