From 7b3dd7a5053afb650ebbb54967d2a3ce1c5ae9ec Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Wed, 6 Feb 2019 14:40:50 +0100 Subject: [PATCH 1/9] Update Makefile, add examples to lint --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 99f759541..9ac8d9b8d 100644 --- a/Makefile +++ b/Makefile @@ -51,8 +51,8 @@ clean-test: ## remove test and coverage artifacts rm -fr .pytest_cache lint: ## check style with flake8 and isort - flake8 sdv tests - isort -c --recursive sdv tests + flake8 sdv tests examples + isort -c --recursive sdv tests examples fixlint: ## fix lint issues using autoflake, autopep8, and isort find sdv -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables From 760c3bc9ee04caff8d1d753f6ef4eac79dbf449c Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Wed, 6 Feb 2019 14:44:52 +0100 Subject: [PATCH 2/9] Sample examples all at once, skip repeated downloads, fix linting issues --- examples/demo.py | 19 ++++++++------- .../multiparent_example.py | 5 ++-- examples/utils.py | 23 +++++++++++++------ 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/examples/demo.py b/examples/demo.py index 4da909164..8f93c436b 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -3,8 +3,8 @@ import sys from timeit import default_timer as timer -from sdv.sdv import SDV from examples.utils import download_folder +from sdv.sdv import SDV def get_logger(): @@ -51,15 +51,14 @@ def run_demo(folder_name): 'demo', folder_name, folder_name.capitalize() + '_manual_meta.json') sdv = SDV(meta_file) sdv.fit() - sampled_rows = {} - LOGGER.info('Parent map: %s', - sdv.dn.parent_map) - LOGGER.info('Transformed data: %s', - sdv.dn.transformed_data) - table_list = table_dict[folder_name] - for table in table_list: - sampled_rows[table] = sdv.sample_rows(table, 1) - LOGGER.info('Sampled row from %s: %s', table, sampled_rows[table]) + sampled = sdv.sample_all() + + LOGGER.info('Parent map: %s', sdv.dn.parent_map) + LOGGER.info('Transformed data: %s', sdv.dn.transformed_data) + + for name, table in sampled.items(): + LOGGER.info('Sampled row from %s: %s', name, table.head(3).T) + end = timer() LOGGER.info('Total time: %s seconds', round(end-start)) diff --git a/examples/multiparent_example/multiparent_example.py b/examples/multiparent_example/multiparent_example.py index 23bd9b757..cb8fa8613 100644 --- a/examples/multiparent_example/multiparent_example.py +++ b/examples/multiparent_example/multiparent_example.py @@ -22,12 +22,13 @@ def run_example(): # Setup vault = SDV('data/meta.json') vault.fit() - + # Run result = vault.sample_all() for name, table in result.items(): print('Samples generated for table {}:\n{}\n'.format(name, table.head(5))) + if __name__ == '__main__': - run_example() \ No newline at end of file + run_example() diff --git a/examples/utils.py b/examples/utils.py index 92c1d3a53..b980a1785 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -1,11 +1,13 @@ +import logging import os -import boto3 -import botocore import zipfile +import boto3 from botocore import UNSIGNED from botocore.client import Config +from botocore.exceptions import ClientError +LOGGER = logging.getLogger(__name__) BUCKET_NAME = 'hdi-demos' SDV_NAME = 'sdv-demo' @@ -14,20 +16,27 @@ def download_folder(folder_name): """Downloads and extracts demo folder from S3""" - s3 = boto3.resource('s3', region_name='us-east-1', - config=Config(signature_version=UNSIGNED)) + s3 = boto3.resource('s3', region_name='us-east-1', config=Config(signature_version=UNSIGNED)) zip_name = folder_name + SUFFIX zip_destination = os.path.join('demo', zip_name) key = os.path.join(SDV_NAME, zip_name) - # make sure directory exists + + # If the directory doesn't exist , we create it + # If it exists, we check for the folder_name for early exit if not os.path.exists('demo'): os.makedirs('demo') + + else: + if os.path.exists(os.path.join('demo', folder_name)): + LOGGER.info('Folder %s found, skipping download', folder_name) + return + # try to download files from s3 try: s3.Bucket(BUCKET_NAME).download_file(key, zip_destination) - except botocore.exceptions.ClientError as e: + except ClientError as e: if e.response['Error']['Code'] == "404": - print("The object does not exist.") + LOGGER.error("The object does not exist.") else: raise From 8deb632275c5262ff091115e9b8582fc8647bb02 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Wed, 6 Feb 2019 18:31:00 +0100 Subject: [PATCH 3/9] Fix issues when fitting model with empty or constant dataframes --- sdv/modeler.py | 36 ++++++--- sdv/sampler.py | 22 +++--- tests/sdv/test_modeler.py | 149 +++++++++++++++++++++++++++++++------- 3 files changed, 158 insertions(+), 49 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index 2d2c9fa9e..ea9aae7eb 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from copulas import get_qualified_name +from copulas import EPSILON, get_qualified_name from copulas.multivariate import GaussianMultivariate, TreeTypes from copulas.univariate import GaussianUnivariate @@ -15,9 +15,9 @@ IGNORED_DICT_KEYS = ['fitted', 'distribution', 'type'] MODELLING_ERROR_MESSAGE = ( - 'There was an error while trying to model the database. If you are using a custom' - 'distribution or model, please try again using the default ones. If the problem persist,' - 'please report it here: https://github.com/HDI-Project/SDV/issues' + 'There was an error while trying to model the database. If you are using a custom ' + 'distribution or model, please try again using the default ones. If the problem persist, ' + 'please report it here:\nhttps://github.com/HDI-Project/SDV/issues.\n' ) @@ -194,8 +194,17 @@ def impute_table(table): values[label] = value else: values[label] = 0 + table = table.fillna(values) - return table.fillna(values) + # There is an issue when using KDEUnivariate modeler in tables with childs + # As the extension columns would have constant values, that make it crash + # This is a temporary fix while https://github.com/DAI-Lab/Copulas/issues/82 is solved. + first_index = table.index[0] + constant_columns = table.loc[:, (table == table.loc[first_index]).all()].columns + for column in constant_columns: + table.loc[first_index, column] = table.loc[first_index, column] + EPSILON + + return table def fit_model(self, data): """Returns an instance of self.model fitted with the given data. @@ -218,10 +227,10 @@ def _create_extension(self, foreign, transformed_child_table, table_info): foreign(pandas.DataFrame): Object with Index of elements from children table elements of a given foreign_key. transformed_child_table(pandas.DataFrame): Table of data to fil - table_info (tuple(str, str)): foreign_key and child table names. + table_info (tuple[str, str]): foreign_key and child table names. Returns: - pd.Series : Parameter extension + pd.Series or None : Parameter extension if it can be generated, None elsewhere. """ foreign_key, child_name = table_info @@ -232,8 +241,11 @@ def _create_extension(self, foreign, transformed_child_table, table_info): except KeyError: return None - clean_df = self.impute_table(conditional_data) - return self.flatten_model(self.fit_model(clean_df), child_name) + if len(conditional_data): + clean_df = self.impute_table(conditional_data) + return self.flatten_model(self.fit_model(clean_df), child_name) + + return None def _get_extensions(self, pk, children): """Generate list of extension for child tables. @@ -284,7 +296,7 @@ def _get_extensions(self, pk, children): parameters[foreign_key] = parameter.to_dict() extension = pd.DataFrame(parameters).T - extension.index.name = fk + extension.index.name = pk if len(extension): extensions.append(extension) @@ -348,7 +360,7 @@ def model_database(self): clean_table = self.impute_table(self.tables[table]) self.models[table] = self.fit_model(clean_table) - except (ValueError, np.linalg.linalg.LinAlgError): - ValueError(MODELLING_ERROR_MESSAGE) + except (ValueError, np.linalg.linalg.LinAlgError) as error: + raise ValueError(MODELLING_ERROR_MESSAGE).with_traceback(error.__traceback__) logger.info('Modeling Complete') diff --git a/sdv/sampler.py b/sdv/sampler.py index 4246ea40c..b67674a66 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -33,11 +33,11 @@ def reset_indices_tables(sampled_tables): return sampled_tables - def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): + def transform_synthesized_rows(self, synthesized, table_name, num_rows): """Add primary key and reverse transform synthetized data. Args: - synthesized_rows(pandas.DataFrame): Generated data from model + synthesized(pandas.DataFrame): Generated data from model table_name(str): Name of the table. num_rows(int): Number of rows sampled. @@ -68,25 +68,29 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows): ' to generate {} samples.'.format(table_name, regex, num_rows) ) - synthesized_rows[primary_key] = pd.Series(values) + synthesized[primary_key] = pd.Series(values) if (node['type'] == 'number') and (node['subtype'] == 'integer'): - synthesized_rows[primary_key] = pd.to_numeric(synthesized_rows[primary_key]) + synthesized[primary_key] = pd.to_numeric(synthesized[primary_key]) - sample_info = (primary_key, synthesized_rows) + sample_info = (primary_key, synthesized) self.sampled = self.update_mapping_list(self.sampled, table_name, sample_info) # filter out parameters labels = list(self.dn.tables[table_name].data) + reverse_columns = [ + transformer[1] for transformer in self.dn.ht.transformers + if table_name in transformer + ] - synthesized_rows = self._fill_text_columns(synthesized_rows, labels, table_name) + synthesized = self._fill_text_columns(synthesized, labels, table_name) # reverse transform data - reversed_data = self.dn.ht.reverse_transform_table(synthesized_rows, orig_meta) + reversed_data = self.dn.ht.reverse_transform_table(synthesized[reverse_columns], orig_meta) - synthesized_rows.update(reversed_data) - return synthesized_rows[labels] + synthesized.update(reversed_data) + return synthesized[labels] def _get_parent_row(self, table_name): parents = self.dn.get_parents(table_name) diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 28a23682c..93eab1b1b 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -1,7 +1,9 @@ -from unittest import TestCase, mock +from unittest import TestCase +from unittest.mock import MagicMock, patch import numpy as np import pandas as pd +from copulas import EPSILON from copulas.multivariate import GaussianMultivariate, VineCopula from copulas.univariate.kde import KDEUnivariate @@ -9,7 +11,7 @@ from sdv.modeler import Modeler -class ModelerTest(TestCase): +class TestModeler(TestCase): def setUp(self): """Set up test fixtures, if any.""" @@ -18,20 +20,30 @@ def setUp(self): self.dn.transform_data() self.modeler = Modeler(self.dn) - def test__create_extension(self): + @patch('sdv.modeler.Modeler.flatten_model') + @patch('sdv.modeler.Modeler.fit_model') + @patch('sdv.modeler.Modeler.impute_table') + def test__create_extension(self, impute_mock, fit_mock, flatten_mock): """Tests that the create extension method returns correct parameters.""" # Setup - data_navigator = mock.MagicMock() + data_navigator = MagicMock() modeler = Modeler(data_navigator) table = pd.DataFrame({ 'foreign': [0, 1, 0, 1, 0, 1], 'a': [0, 1, 0, 1, 0, 1], 'b': [1, 2, 3, 4, 5, 6] }) - group = table[table.a == 0] - table_info = ('foreign', '') + foreign = table[table.a == 0] + table_info = ('foreign', 'child') - expected_result = pd.Series({ + impute_mock.return_value = pd.DataFrame({ + 'A': [1, 2], + 'B': [3, 4] + }) + + fit_mock.return_value = 'fitted model' + + flatten_mock.return_value = pd.Series({ 'covariance__0__0': 0.0, 'covariance__0__1': 0.0, 'covariance__1__0': 0.0, @@ -43,15 +55,30 @@ def test__create_extension(self): }) # Run - result = modeler._create_extension(group, table, table_info) + result = modeler._create_extension(foreign, table, table_info) # Check - assert result.equals(expected_result) + assert result.equals(flatten_mock.return_value) + + df = pd.DataFrame({ + 'a': [0, 1, 0, 1, 0, 1], + 'b': [1, 2, 3, 4, 5, 6] + }) + df = df.loc[foreign.index] + + assert len(impute_mock.call_args_list) + call_args = impute_mock.call_args_list[0] + assert len(call_args[0]) == 1 + assert call_args[0][0].equals(df) + assert call_args[1] == {} + + fit_mock.assert_called_once_with(impute_mock.return_value) + flatten_mock.assert_called_once_with('fitted model', 'child') def test__create_extension_wrong_index_return_none(self): """_create_extension return None if transformed_child_table can't be indexed by df.""" # Setup - data_navigator = mock.MagicMock() + data_navigator = MagicMock() modeler = Modeler(data_navigator) transformed_child_table = pd.DataFrame(np.eye(3), columns=['A', 'B', 'C']) table_info = ('', '') @@ -63,12 +90,12 @@ def test__create_extension_wrong_index_return_none(self): # Check assert result is None - @mock.patch('sdv.modeler.Modeler._create_extension') - @mock.patch('sdv.modeler.Modeler.get_foreign_key') + @patch('sdv.modeler.Modeler._create_extension') + @patch('sdv.modeler.Modeler.get_foreign_key') def test__get_extensions(self, get_foreign_mock, extension_mock): """_get_extensions return the conditional modelling parameters for each children.""" # Setup - data_navigator = mock.MagicMock() + data_navigator = MagicMock() first_table_data = pd.DataFrame({'foreign_key': [0, 1]}) first_table_meta = {'fields': []} @@ -173,18 +200,18 @@ def test_flatten_model(self): # Check assert np.isclose(result, expected_result).all() - def test_impute_table(self): - """impute_table fills all NaN values with 0 or the mean of values.""" + def test_impute_table_with_mean(self): + """impute_table fills all NaN values the mean of values when possible.""" # Setup table = pd.DataFrame([ - {'A': np.nan, 'B': 10., 'C': 20.}, - {'A': 5., 'B': np.nan, 'C': 20.}, - {'A': 5., 'B': 10., 'C': np.nan}, + {'A': np.nan, 'B': 2., 'C': 4.}, + {'A': 4., 'B': np.nan, 'C': 2.}, + {'A': 2., 'B': 4., 'C': np.nan}, ]) expected_result = pd.DataFrame([ - {'A': 5., 'B': 10., 'C': 20.}, - {'A': 5., 'B': 10., 'C': 20.}, - {'A': 5., 'B': 10., 'C': 20.}, + {'A': 3., 'B': 2., 'C': 4.}, + {'A': 4., 'B': 3., 'C': 2.}, + {'A': 2., 'B': 4., 'C': 3.}, ]) # Run @@ -200,14 +227,56 @@ def test_impute_table(self): for column in result: assert 0 not in result[column].values - def test_model_database(self): - """model_database computes conditions between tables and models them.""" + def test_impute_table_with_mean_default(self): + """If a column only has NaN, impute_table fills it with 0.(+EPSILON). + + If a column has no mean (all values are null), then the NaN values are replaced with 0. + Then, it will transform like a constant column, adding copulas.EPSILON at the + first element. + """ + # Setup + table = pd.DataFrame([ + {'A': np.nan, 'B': 2., 'C': 2.}, + {'A': np.nan, 'B': 3., 'C': 3.}, + {'A': np.nan, 'B': 4., 'C': 4.}, + ]) + expected_result = pd.DataFrame([ + {'A': EPSILON, 'B': 2., 'C': 2.}, + {'A': 0., 'B': 3., 'C': 3.}, + {'A': 0., 'B': 4., 'C': 4.}, + ]) # Run - self.modeler.model_database() + result = self.modeler.impute_table(table) # Check - assert self.modeler.tables.keys() == self.modeler.models.keys() + assert result.equals(expected_result) + + # No null values are left + assert not result.isnull().all().all() + + def test_impute_table_constant_column(self): + """impute_table adds EPSILON at the first element of a constant column.""" + # Setup + table = pd.DataFrame([ + {'A': np.nan, 'B': 10., 'C': 20.}, + {'A': 5., 'B': np.nan, 'C': 20.}, + {'A': 5., 'B': 10., 'C': np.nan}, + ]) + expected_result = pd.DataFrame([ + {'A': 5. + EPSILON, 'B': 10. + EPSILON, 'C': 20. + EPSILON}, + {'A': 5., 'B': 10., 'C': 20.}, + {'A': 5., 'B': 10., 'C': 20.}, + ]) + + # Run + result = self.modeler.impute_table(table) + + # Check + assert result.equals(expected_result) + + # No null values are left + assert not result.isnull().all().all() def test_get_foreign_key(self): """get_foreign_key returns the foreign key from a metadata and a primary key.""" @@ -225,7 +294,7 @@ def test_get_foreign_key(self): def test_fit_model_distribution_arg(self): """fit_model will pass self.distribution FQN to modeler.""" # Setup - model_mock = mock.MagicMock() + model_mock = MagicMock() model_mock.__eq__.return_value = True model_mock.__ne__.return_value = False modeler = Modeler(data_navigator='navigator', model=model_mock, distribution=KDEUnivariate) @@ -239,6 +308,29 @@ def test_fit_model_distribution_arg(self): # Check model_mock.assert_called_once_with(distribution='copulas.univariate.kde.KDEUnivariate') + def test_model_database(self): + """model_database computes conditions between tables and models them.""" + # Run + self.modeler.model_database() + + # Check + assert self.modeler.tables.keys() == self.modeler.models.keys() + + @patch('sdv.modeler.Modeler.RCPA') + def test_model_database_raises(self, rcpa_mock): + """If the models raise an exception, it prints a custom message.""" + # Setup + data_navigator = MagicMock() + modeler = Modeler(data_navigator) + + data_navigator.tables = ['table_1', 'table_2'] + data_navigator.get_parents.return_value = False + rcpa_mock.side_effect = ValueError('value error!') + + # Run / Check + with self.assertRaises(ValueError): + modeler.model_database() + def test_model_database_kde_distribution(self): """model_database works fine with kde distribution.""" # Setup @@ -247,10 +339,11 @@ def test_model_database_kde_distribution(self): # Run modeler.model_database() - def test_model_database_vine_modeler(self): + def test_model_database_vine_modeler_single_table(self): """model_database works fine with vine modeler.""" # Setup - modeler = Modeler(data_navigator=self.dn, model=VineCopula) + data_navigator = MagicMock() + modeler = Modeler(data_navigator=data_navigator, model=VineCopula) # Run modeler.model_database() From 7091fc920751a4a6444c61796aeb1c5fb7bcd90a Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Thu, 7 Feb 2019 01:25:42 +0100 Subject: [PATCH 4/9] Add deterministic tests for model_database with different models --- tests/sdv/test_modeler.py | 121 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 117 insertions(+), 4 deletions(-) diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 93eab1b1b..65ae140e5 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -1,14 +1,15 @@ -from unittest import TestCase +from unittest import TestCase, skip from unittest.mock import MagicMock, patch import numpy as np import pandas as pd from copulas import EPSILON from copulas.multivariate import GaussianMultivariate, VineCopula -from copulas.univariate.kde import KDEUnivariate +from copulas.univariate import KDEUnivariate -from sdv.data_navigator import CSVDataLoader, Table +from sdv.data_navigator import CSVDataLoader, DataNavigator, Table from sdv.modeler import Modeler +from sdv.sampler import Sampler class TestModeler(TestCase): @@ -316,6 +317,66 @@ def test_model_database(self): # Check assert self.modeler.tables.keys() == self.modeler.models.keys() + def test_model_database_gaussian_copula_single_table(self): + """model_database can model a single table using the gausian copula model.""" + # Setup + data_navigator = MagicMock(spec=DataNavigator) + modeler = Modeler(data_navigator=data_navigator, model=GaussianMultivariate) + + # Setup - Mocks - DataNavigator + data = pd.DataFrame({ + 'column_A': list('abdc'), + 'column_B': range(4) + }) + meta = { + 'name': 'table_name', + 'fields': { + 'column_A': { + 'name': 'A', + 'type': 'categorical' + }, + 'column_B': { + 'name': 'B', + 'type': 'number', + 'subtype': 'integer' + } + } + } + + data_navigator.tables = { + 'table_name': Table(data, meta) + } + data_navigator.get_parents.return_value = set() + data_navigator.get_children.return_value = set() + data_navigator.transformed_data = { + 'table_name': pd.DataFrame({ + 'column_A': [0.1, 0.2, 0.5, 1.0], + 'column_B': range(4) + }) + } + data_navigator.meta = { + 'tables': [ + { + 'name': meta + } + ] + } + data_navigator.ht = MagicMock() + data_navigator.ht.transformers = { + ('table_name', 'column_A'): None, + ('table_name', 'column_B'): None + } + + # Run + modeler.model_database() + + # Check + assert 'table_name' in modeler.models + + sampler = Sampler(data_navigator, modeler) + samples = sampler.sample_all() + assert 'table_name' in samples + @patch('sdv.modeler.Modeler.RCPA') def test_model_database_raises(self, rcpa_mock): """If the models raise an exception, it prints a custom message.""" @@ -339,15 +400,67 @@ def test_model_database_kde_distribution(self): # Run modeler.model_database() + @skip('s') def test_model_database_vine_modeler_single_table(self): """model_database works fine with vine modeler.""" # Setup - data_navigator = MagicMock() + data_navigator = MagicMock(spec=DataNavigator) modeler = Modeler(data_navigator=data_navigator, model=VineCopula) + # Setup - Mock + data = pd.DataFrame({ + 'column_A': list('abdc'), + 'column_B': range(4) + }) + meta = { + 'name': 'table_name', + 'fields': { + 'column_A': { + 'name': 'A', + 'type': 'categorical' + }, + 'column_B': { + 'name': 'B', + 'type': 'number', + 'subtype': 'integer' + } + } + } + + data_navigator.tables = { + 'table_name': Table(data, meta) + } + data_navigator.get_parents.return_value = set() + data_navigator.get_children.return_value = set() + data_navigator.transformed_data = { + 'table_name': pd.DataFrame({ + 'column_A': [0.1, 0.2, 0.5, 1.0], + 'column_B': range(4) + }) + } + data_navigator.meta = { + 'tables': [ + { + 'name': meta + } + ] + } + data_navigator.ht = MagicMock() + data_navigator.ht.transformers = { + ('table_name', 'column_A'): None, + ('table_name', 'column_B'): None + } + # Run modeler.model_database() + # Check + assert 'table_name' in modeler.models + + sampler = Sampler(data_navigator, modeler) + samples = sampler.sample_all() + assert 'table_name' in samples + def test__flatten_dict_flat_dict(self): """_flatten_dict don't modify flat dicts.""" # Setup From 0451bd3f8c7b1afac428c90265d0d7e1145cad2b Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Thu, 7 Feb 2019 12:00:31 +0100 Subject: [PATCH 5/9] Add support for categorical index on modeler.CPA --- sdv/modeler.py | 16 ++++- tests/sdv/test_modeler.py | 126 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index ea9aae7eb..d08bfacc0 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -330,9 +330,19 @@ def CPA(self, table): extended_table = self.dn.transformed_data[table] extensions = self._get_extensions(pk, children) - # add extensions - for extension in extensions: - extended_table = extended_table.merge(extension.reset_index(), how='left', on=pk) + if extensions: + original_pk = tables[table].data[pk] + transformed_pk = None + if not extended_table[pk].equals(original_pk): + transformed_pk = extended_table[pk].copy() + extended_table[pk] = original_pk + + # add extensions + for extension in extensions: + extended_table = extended_table.merge(extension.reset_index(), how='left', on=pk) + + if transformed_pk is not None: + extended_table[pk] = transformed_pk self.tables[table] = extended_table diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 65ae140e5..ebf057fd2 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -170,6 +170,132 @@ def test_CPA(self): assert (raw_table.index == table.index).all() assert all([column in table.columns for column in raw_table.columns]) + @patch('sdv.modeler.Modeler._get_extensions') + def test_CPA_transformed_index(self, extension_mock): + """CPA is able to merge extensions in tables with transformed index. """ + # Setup + data_navigator = MagicMock(spec=DataNavigator) + modeler = Modeler(data_navigator) + + # Setup - Mock + parent_data = pd.DataFrame([ + {'parent_id': 'A', 'values': 1}, + {'parent_id': 'B', 'values': 2}, + {'parent_id': 'C', 'values': 3}, + ]) + parent_meta = { + 'name': 'parent', + 'primary_key': 'parent_id', + 'fields': { + 'parent_id': { + 'name': 'parent_id', + 'type': 'categorical', + 'regex': '^[A-Z]$' + }, + 'values': { + 'name': 'values', + 'type': 'number', + 'subtype': 'integer' + } + } + } + + child_data = pd.DataFrame([ + {'child_id': 1, 'parent_id': 'A', 'value': 0.1}, + {'child_id': 2, 'parent_id': 'A', 'value': 0.2}, + {'child_id': 3, 'parent_id': 'A', 'value': 0.3}, + {'child_id': 4, 'parent_id': 'B', 'value': 0.4}, + {'child_id': 5, 'parent_id': 'B', 'value': 0.5}, + {'child_id': 6, 'parent_id': 'B', 'value': 0.6}, + {'child_id': 7, 'parent_id': 'C', 'value': 0.7}, + {'child_id': 8, 'parent_id': 'C', 'value': 0.8}, + {'child_id': 9, 'parent_id': 'C', 'value': 0.9}, + ]) + child_meta = { + 'name': 'child', + 'primary_key': 'child_id', + 'fields': { + 'child_id': { + 'name': 'child_id', + 'type': 'number' + }, + 'parent_id': { + 'name': 'parent_id', + 'type': 'category', + 'ref': { + 'table': 'parent', + 'field': 'parent_id' + } + }, + 'value': { + 'name': 'value', + 'type': 'number' + } + } + } + + data_navigator.tables = { + 'parent': Table(parent_data, parent_meta), + 'child': Table(child_data, child_meta) + } + + children_map = {'parent': {'child'}} + parent_map = {'child': {'parent'}} + + data_navigator.get_children.side_effect = lambda x: children_map.get(x, set()) + data_navigator.get_parents.side_effect = lambda x: parent_map.get(x, set()) + + transformed_parent = pd.DataFrame([ + {'parent_id': 0.1, 'values': 1}, + {'parent_id': 0.4, 'values': 2}, + {'parent_id': 0.8, 'values': 3}, + ]) + transformed_child = pd.DataFrame([ + {'child_id': 1, 'parent_id': 0.15, 'value': 0.1}, + {'child_id': 2, 'parent_id': 0.10, 'value': 0.2}, + {'child_id': 3, 'parent_id': 0.20, 'value': 0.3}, + {'child_id': 4, 'parent_id': 0.35, 'value': 0.4}, + {'child_id': 5, 'parent_id': 0.50, 'value': 0.5}, + {'child_id': 6, 'parent_id': 0.55, 'value': 0.6}, + {'child_id': 7, 'parent_id': 0.70, 'value': 0.7}, + {'child_id': 8, 'parent_id': 0.80, 'value': 0.8}, + {'child_id': 9, 'parent_id': 0.85, 'value': 0.9}, + ]) + + data_navigator.transformed_data = { + 'parent': transformed_parent, + 'child': transformed_child + } + extension = pd.DataFrame(**{ + 'data': [ + {'param_1': 0.5, 'param_2': 0.4}, + {'param_1': 0.7, 'param_2': 0.2}, + {'param_1': 0.2, 'param_2': 0.1}, + ], + 'index': list('ABC') + }) + extension.index.name = 'parent_id' + extension_mock.return_value = [extension] + + expected_extended_parent = pd.DataFrame( + [ + {'parent_id': 0.1, 'values': 1, 'param_1': 0.5, 'param_2': 0.4}, + {'parent_id': 0.4, 'values': 2, 'param_1': 0.7, 'param_2': 0.2}, + {'parent_id': 0.8, 'values': 3, 'param_1': 0.2, 'param_2': 0.1}, + ], + columns=['parent_id', 'values', 'param_1', 'param_2'] + ) + + # Run + modeler.CPA('parent') + + # Check + 'parent' in modeler.tables + assert modeler.tables['parent'].equals(expected_extended_parent) + + data_navigator.get_children.assert_called_once_with('parent') + extension_mock.assert_called_once_with('parent_id', {'child'}) + def test_flatten_model(self): """flatten_model returns a pandas.Series with all the params to recreate a model.""" # Setup From fcf2cc1665d5d37230789a3a60d193e15e186caa Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Thu, 7 Feb 2019 13:14:55 +0100 Subject: [PATCH 6/9] Modeler.impute_table only compute mean on numeric columns --- sdv/modeler.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index d08bfacc0..108ae56e2 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -187,13 +187,15 @@ def impute_table(table): """ values = {} - for label in table: - value = table[label].mean() + for column in table.loc[:, table.isnull().any()].columns: + if table[column].dtype in [np.float64, np.int64]: + value = table[column].mean() - if not pd.isnull(value): - values[label] = value + if not pd.isnull(value or np.nan): + values[column] = value else: - values[label] = 0 + values[column] = 0 + table = table.fillna(values) # There is an issue when using KDEUnivariate modeler in tables with childs @@ -236,7 +238,8 @@ def _create_extension(self, foreign, transformed_child_table, table_info): foreign_key, child_name = table_info try: conditional_data = transformed_child_table.loc[foreign.index].copy() - conditional_data = conditional_data.drop(foreign_key, axis=1) + if foreign_key in conditional_data: + conditional_data = conditional_data.drop(foreign_key, axis=1) except KeyError: return None @@ -308,7 +311,15 @@ def CPA(self, table): Conditional Parameter Aggregation. It will take the table's children and generate extensions (parameters from modelling the related children for each foreign key) - and merge them to the original `table` + and merge them to the original `table`. + + After the extensions are created, `extended_table` is modified in order for the extensions + to be merged. As the extensions are returned with an index consisting of values of the + `primary_key` of the parent table, we need to make sure that same values are present in + `extended_table`. The values couldn't be present in two situations: + + - They weren't numeric, and have been transformed. + - They weren't transformed, and therefore are not present on `extended_table` Args: table (string): name of table. @@ -333,8 +344,11 @@ def CPA(self, table): if extensions: original_pk = tables[table].data[pk] transformed_pk = None - if not extended_table[pk].equals(original_pk): + + if pk in extended_table: transformed_pk = extended_table[pk].copy() + + if (pk not in extended_table) or (not extended_table[pk].equals(original_pk)): extended_table[pk] = original_pk # add extensions @@ -343,6 +357,8 @@ def CPA(self, table): if transformed_pk is not None: extended_table[pk] = transformed_pk + else: + extended_table = extended_table.drop(pk, axis=1) self.tables[table] = extended_table From 0a2e781a9cfd977b630146cd9f1d0e11bce60b13 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Mon, 11 Feb 2019 13:26:20 +0100 Subject: [PATCH 7/9] Fix wrong mock import on tests --- tests/sdv/test_modeler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sdv/test_modeler.py b/tests/sdv/test_modeler.py index 2508a9bcb..5df6db0dd 100644 --- a/tests/sdv/test_modeler.py +++ b/tests/sdv/test_modeler.py @@ -316,7 +316,7 @@ def test_flatten_model(self): 'distribs__2__mean': 0.33333333333333331, 'distribs__2__std': 0.47140452079103168 }) - data_navigator = mock.MagicMock() + data_navigator = MagicMock() modeler = Modeler(data_navigator) # Run From cfd80a0cec5ce7d97ee96f39403a07bc75f337d5 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Wed, 13 Feb 2019 11:36:55 +0100 Subject: [PATCH 8/9] Remove boto3 as dependency, delete redundant log message --- examples/utils.py | 26 +++++++++++--------------- sdv/sdv.py | 2 +- setup.py | 3 --- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/examples/utils.py b/examples/utils.py index b980a1785..b9483a38b 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -1,12 +1,9 @@ +import io import logging import os +import urllib import zipfile -import boto3 -from botocore import UNSIGNED -from botocore.client import Config -from botocore.exceptions import ClientError - LOGGER = logging.getLogger(__name__) BUCKET_NAME = 'hdi-demos' @@ -16,9 +13,7 @@ def download_folder(folder_name): """Downloads and extracts demo folder from S3""" - s3 = boto3.resource('s3', region_name='us-east-1', config=Config(signature_version=UNSIGNED)) zip_name = folder_name + SUFFIX - zip_destination = os.path.join('demo', zip_name) key = os.path.join(SDV_NAME, zip_name) # If the directory doesn't exist , we create it @@ -28,19 +23,20 @@ def download_folder(folder_name): else: if os.path.exists(os.path.join('demo', folder_name)): - LOGGER.info('Folder %s found, skipping download', folder_name) return # try to download files from s3 try: - s3.Bucket(BUCKET_NAME).download_file(key, zip_destination) - except ClientError as e: - if e.response['Error']['Code'] == "404": - LOGGER.error("The object does not exist.") - else: - raise + url = 'https://{}.s3.amazonaws.com/{}'.format(BUCKET_NAME, key) + response = urllib.request.urlopen(url) + bytes_io = io.BytesIO(response.read()) + + except urllib.error.HTTPError as error: + if error.code == 404: + LOGGER.error('File %s not found.', key) + raise # unzip folder - zip_ref = zipfile.ZipFile(zip_destination, 'r') + zip_ref = zipfile.ZipFile(bytes_io, 'r') zip_ref.extractall('demo') zip_ref.close() diff --git a/sdv/sdv.py b/sdv/sdv.py index 349de1cbe..5fdf443db 100644 --- a/sdv/sdv.py +++ b/sdv/sdv.py @@ -3,7 +3,7 @@ """Main module.""" import pickle -from sklearn.exceptions import NotFittedError +from copulas import NotFittedError from sdv.data_navigator import CSVDataLoader from sdv.modeler import Modeler diff --git a/setup.py b/setup.py index ee29e1e7b..b008bcb87 100644 --- a/setup.py +++ b/setup.py @@ -12,12 +12,9 @@ history = history_file.read() install_requires = [ - 'boto3>=1.7.47', 'exrex>=0.10.5', 'numpy>=1.13.1', 'pandas>=0.22.0', - 'scipy>=0.19.1', - 'scikit-learn>=0.19.1', 'copulas>=0.2.1', 'rdt>=0.1.2' ] From d46539cb48da495ed9ce9b76233a74ff73b0c2c5 Mon Sep 17 00:00:00 2001 From: Manuel Alvarez Date: Wed, 13 Feb 2019 11:40:12 +0100 Subject: [PATCH 9/9] Prettify error message traceback --- sdv/modeler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdv/modeler.py b/sdv/modeler.py index 21f309bbb..ce2268823 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -394,6 +394,7 @@ def model_database(self): self.models[table] = self.fit_model(clean_table) except (ValueError, np.linalg.linalg.LinAlgError) as error: - raise ValueError(MODELLING_ERROR_MESSAGE).with_traceback(error.__traceback__) + raise ValueError( + MODELLING_ERROR_MESSAGE).with_traceback(error.__traceback__) from None logger.info('Modeling Complete')