Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 81: enforce sampled values in range [0,1] for categorical columns #87

Merged
merged 8 commits into from
Feb 13, 2019
43 changes: 33 additions & 10 deletions sdv/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pandas as pd
from copulas import get_qualified_name
from sklearn.preprocessing import MinMaxScaler

import exrex

Expand Down Expand Up @@ -33,11 +34,25 @@ def reset_indices_tables(sampled_tables):

return sampled_tables

def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows):
@staticmethod
def _rescale_values(column):
    """Scale a column of values into the interval [0, 1].

    The values are min-max scaled with ``MinMaxScaler``. The returned series
    keeps both the name and the index of the input, so assigning it back
    into the DataFrame the column came from aligns row by row regardless of
    the index that DataFrame carries.

    Args:
        column (pandas.Series): Column values to scale.

    Returns:
        pandas.Series: Column values scaled into [0, 1].
    """
    # The scaler is fed a single-feature 2D matrix; flatten the result back
    # into a 1D array afterwards.
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(column.values.reshape(-1, 1)).ravel()

    # Preserve the original index: a fresh default index would make a later
    # ``df[field] = result`` assignment align on labels that may not exist
    # in ``df`` and silently introduce NaN values.
    return pd.Series(scaled, name=column.name, index=column.index)

def transform_synthesized_rows(self, synthesized, table_name, num_rows):
"""Add primary key and reverse transform synthetized data.

Args:
synthesized_rows(pandas.DataFrame): Generated data from model
synthesized(pandas.DataFrame): Generated data from model
table_name(str): Name of the table.
num_rows(int): Number of rows sampled.

Expand All @@ -48,6 +63,15 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows):
meta = self.dn.tables[table_name].meta
orig_meta = self._get_table_meta(self.dn.meta, table_name)
primary_key = meta.get('primary_key')
categorical_fields = [
ManuelAlvarezC marked this conversation as resolved.
Show resolved Hide resolved
field['name'] for field in orig_meta['fields']
if field['type'] == 'categorical'
]

if categorical_fields:
for field in categorical_fields:
if ((synthesized[field] < 0) | (synthesized[field] > 1)).any():
ManuelAlvarezC marked this conversation as resolved.
Show resolved Hide resolved
synthesized[field] = self._rescale_values(synthesized[field])

if primary_key:
node = meta['fields'][primary_key]
Expand All @@ -68,25 +92,24 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows):
' to generate {} samples.'.format(table_name, regex, num_rows)
)

synthesized_rows[primary_key] = pd.Series(values)
synthesized[primary_key] = pd.Series(values)

if (node['type'] == 'number') and (node['subtype'] == 'integer'):
synthesized_rows[primary_key] = pd.to_numeric(synthesized_rows[primary_key])

sample_info = (primary_key, synthesized_rows)
synthesized[primary_key] = pd.to_numeric(synthesized[primary_key])

sample_info = (primary_key, synthesized)
self.sampled = self.update_mapping_list(self.sampled, table_name, sample_info)

# filter out parameters
labels = list(self.dn.tables[table_name].data)

synthesized_rows = self._fill_text_columns(synthesized_rows, labels, table_name)
text_filled = self._fill_text_columns(synthesized, labels, table_name)

# reverse transform data
reversed_data = self.dn.ht.reverse_transform_table(synthesized_rows, orig_meta)
reversed_data = self.dn.ht.reverse_transform_table(text_filled, orig_meta)

synthesized_rows.update(reversed_data)
return synthesized_rows[labels]
synthesized.update(reversed_data)
return synthesized[labels]

def _get_parent_row(self, table_name):
parents = self.dn.get_parents(table_name)
Expand Down
186 changes: 183 additions & 3 deletions tests/sdv/test_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pandas as pd

from sdv.data_navigator import CSVDataLoader
from sdv.data_navigator import CSVDataLoader, Table
from sdv.modeler import Modeler
from sdv.sampler import Sampler

Expand All @@ -23,6 +23,186 @@ def setUpClass(cls):
def setUp(self):
self.sampler = Sampler(self.data_navigator, self.modeler)

def test__rescale_values(self):
    """_rescale_values returns the values scaled so that 0 <= value <= 1."""
    # Setup
    navigator_mock = MagicMock()
    modeler_mock = MagicMock()
    instance = Sampler(navigator_mock, modeler_mock)

    values = pd.Series([0.0, 5.0, 10], name='column')
    expected = pd.Series([0.0, 0.5, 1.0], name='column')

    # Run
    scaled = instance._rescale_values(values)

    # Check
    assert scaled.eq(expected).all()

    # Neither collaborator may have been called during the rescaling.
    assert not navigator_mock.call_args_list
    assert not modeler_mock.call_args_list

@patch('sdv.sampler.Sampler._fill_text_columns', autospec=True)
@patch('sdv.sampler.Sampler.update_mapping_list', autospec=True)
@patch('sdv.sampler.Sampler._get_table_meta', autospec=True)
def test_transform_synthesized_rows_no_pk_no_categorical(
        self, get_table_meta_mock, update_mock, fill_mock):
    """Without primary key or categorical fields, rows are only reverse transformed.

    The sampler must store the sampled rows in its internal state, fill the
    text columns and return the reverse-transformed data restricted to the
    columns of the original table.
    """
    # Setup - Mock configuration
    original_meta = {
        'original': 'meta',
        'fields': []
    }
    get_table_meta_mock.return_value = original_meta

    filled = pd.DataFrame({
        'column_A': ['filled', 'text_values'],
        'column_B': ['nothing', 'numerical']
    })
    fill_mock.return_value = filled

    reversed_rows = pd.DataFrame({
        'column_A': ['some', 'transformed values'],
        'column_B': ['another', 'transformed column']
    })

    metadata = {
        'fields': {
            'column_A': {
                'type': 'number',
                'subtype': 'integer'
            },
            'column_B': {
                'name': 'column',
                'type': 'number'
            }
        },
        'primary_key': None
    }
    empty_data = pd.DataFrame(columns=['column_A', 'column_B'])

    # Setup - Class Instantiation
    data_navigator = MagicMock()
    data_navigator.tables = {
        'table': Table(empty_data, metadata)
    }
    data_navigator.ht.reverse_transform_table.return_value = reversed_rows
    modeler = MagicMock()
    sampler = Sampler(data_navigator, modeler)

    # Setup - Method arguments / expected result
    synthesized_rows = pd.DataFrame({
        'column_A': [1.7, 2.5],
        'column_B': [4.7, 5.1],
        'model_parameters': ['some', 'parameters']
    })

    expected_result = pd.DataFrame({
        'column_A': ['some', 'transformed values'],
        'column_B': ['another', 'transformed column']
    })

    # Run
    result = sampler.transform_synthesized_rows(synthesized_rows, 'table', 2)

    # Check - Result
    assert result.equals(expected_result)

    # Check - Class internal state
    assert sampler.sampled == update_mock.return_value

    # Check - Mock calls
    get_table_meta_mock.assert_called_once_with(sampler, data_navigator.meta, 'table')
    update_mock.assert_called_once_with(sampler, {}, 'table', (None, synthesized_rows))
    fill_mock.assert_called_once_with(
        sampler, synthesized_rows, ['column_A', 'column_B'], 'table')

    data_navigator.ht.reverse_transform_table.assert_called_once_with(filled, original_meta)

@patch('sdv.sampler.Sampler._fill_text_columns', autospec=True)
@patch('sdv.sampler.Sampler.update_mapping_list', autospec=True)
@patch('sdv.sampler.Sampler._rescale_values')
@patch('sdv.sampler.Sampler._get_table_meta', autospec=True)
def test_transform_synthesized_rows_no_pk_but_categorical(
        self, get_table_meta_mock, rescale_mock, update_mock, fill_mock):
    """Out-of-range categorical columns are rescaled before the reverse transform.

    The metadata returned by ``_get_table_meta`` declares ``column_A`` as
    categorical and its sampled values fall outside [0, 1], so
    ``_rescale_values`` must be called exactly once, with that column.
    """
    # NOTE: ``_rescale_values`` is a static method, and autospeccing static
    # methods yields a non-callable mock on older ``mock`` releases, so it is
    # patched with a plain MagicMock instead of ``autospec=True``.

    # Setup - Class Instantiation
    data_navigator = MagicMock()
    modeler = MagicMock()
    sampler = Sampler(data_navigator, modeler)

    # Setup - Mock configuration
    table_metadata = {
        'fields': {
            'column_A': {
                'type': 'categorical',
            },
            'column_B': {
                'name': 'column',
                'type': 'number'
            }
        },
        'primary_key': None
    }
    table_data = pd.DataFrame(columns=['column_A', 'column_B'])
    test_table = Table(table_data, table_metadata)
    data_navigator.tables = {
        'table': test_table
    }

    data_navigator.ht.reverse_transform_table.return_value = pd.DataFrame({
        'column_A': ['some', 'transformed values'],
        'column_B': ['another', 'transformed column']
    })

    # The categorical fields are looked up in this metadata as a list of
    # dicts carrying 'name' and 'type'. An empty list would mean no field
    # is treated as categorical and the rescaling would never be exercised.
    get_table_meta_mock.return_value = {
        'original': 'meta',
        'fields': [
            {'name': 'column_A', 'type': 'categorical'},
            {'name': 'column_B', 'type': 'number'},
        ]
    }

    rescale_mock.side_effect = lambda x: pd.Series([0.1, 0.8], name=x.name)

    fill_mock.return_value = pd.DataFrame({
        'column_A': ['filled', 'text_values'],
        'column_B': ['nothing', 'numerical']
    })

    # Setup - Method arguments / expected result
    synthesized_rows = pd.DataFrame({
        'column_A': [1.7, 2.5],
        'column_B': [4.7, 5.1],
        'model_parameters': ['some', 'parameters']
    })
    table_name = 'table'
    num_rows = 2

    expected_result = pd.DataFrame({
        'column_A': ['some', 'transformed values'],
        'column_B': ['another', 'transformed column']
    })

    # Run
    result = sampler.transform_synthesized_rows(synthesized_rows, table_name, num_rows)

    # Check - Result
    assert result.equals(expected_result)

    # Check - Class internal state
    assert sampler.sampled == update_mock.return_value

    # Check - Only the out-of-range categorical column has been rescaled
    assert rescale_mock.call_count == 1
    rescaled_column = rescale_mock.call_args[0][0]
    assert rescaled_column.name == 'column_A'
    assert rescaled_column.tolist() == [1.7, 2.5]

    # Check - Mock calls
    get_table_meta_mock.assert_called_once_with(sampler, data_navigator.meta, 'table')
    update_mock.assert_called_once_with(sampler, {}, 'table', (None, synthesized_rows))
    fill_mock.assert_called_once_with(
        sampler, synthesized_rows, ['column_A', 'column_B'], 'table')

    data_navigator.ht.reverse_transform_table.assert_called_once_with(
        fill_mock.return_value, get_table_meta_mock.return_value
    )

def test_sample_rows_parent_table(self):
"""sample_rows samples new rows for the given table."""
# Setup
Expand Down Expand Up @@ -92,7 +272,7 @@ def fake_dataframe(name, number):
reset_mock.assert_called_once_with({'TABLE_A': 'concatenated_dataframe'})

def test_unflatten_dict(self):
""" """
"""unflatten_dict restructure flatten dicts."""
# Setup
data_navigator = MagicMock()
modeler = MagicMock()
Expand Down Expand Up @@ -122,7 +302,7 @@ def test_unflatten_dict(self):
modeler.assert_not_called()

def test_unflatten_dict_mixed_array(self):
"""unflatten_dict restruicture arrays"""
"""unflatten_dict restructure arrays."""
# Setup
data_navigator = MagicMock()
modeler = MagicMock()
Expand Down