Issue 81: enforce sampled values in range [0,1] for categorical columns #87

Merged: 8 commits merged on Feb 13, 2019
10 changes: 10 additions & 0 deletions sdv/modeler.py
@@ -6,6 +6,7 @@
from copulas import get_qualified_name
from copulas.multivariate import GaussianMultivariate, TreeTypes
from copulas.univariate import GaussianUnivariate
from rdt.transformers.positive_number import PositiveNumberTransformer

# Configure logger
logger = logging.getLogger(__name__)
@@ -162,6 +163,15 @@ def flatten_model(self, model, name=''):
values.append(row[:index + 1])

model.covariance = np.array(values)
if self.model_kwargs['distribution'] == get_qualified_name(DEFAULT_DISTRIBUTION):
transformer = PositiveNumberTransformer({
'name': 'field',
'type': 'number'
})

for distribution in model.distribs.values():
column = pd.DataFrame({'field': [distribution.std]})
distribution.std = transformer.reverse_transform(column).loc[0, 'field']

return pd.Series(self._flatten_dict(model.to_dict(), name))

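A note on the modeler change above: each distribution's standard deviation is pushed through the transformer's `reverse_transform` before flattening, so the flattened parameter lives on an unconstrained scale. The updated test expectations further down are consistent with a natural-log mapping (for example, ln(0.001) ≈ -6.9078). A minimal sketch of that idea in plain numpy/pandas, assuming the log/exp pair; the helper names are illustrative, not the rdt API:

```python
import numpy as np
import pandas as pd

def flatten_std(std):
    # Map a positive std onto an unconstrained (log) scale before flattening.
    return np.log(std)

def unflatten_std(flat_std):
    # Inverse mapping: exp() guarantees the recovered std is strictly positive.
    return np.exp(flat_std)

stds = pd.Series([0.001, 1.632993161855452])
flattened = stds.apply(flatten_std)         # approx. -6.9078 and 0.4904
recovered = flattened.apply(unflatten_std)  # back to the original positive stds
```

Flattening on the log scale is what lets the sampler later turn any real-valued parameter back into a valid (positive) standard deviation.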
165 changes: 118 additions & 47 deletions sdv/sampler.py
@@ -10,6 +10,18 @@
GAUSSIAN_COPULA = 'copulas.multivariate.gaussian.GaussianMultivariate'


MODEL_ERROR_MESSAGES = {
True: (
'There was an error recreating models from parameters. '
'Sampling could not continue.'
),
False: (
'Modeler hasn\'t been fitted. '
'Please call Modeler.model_database() before sampling'
)
}


class Sampler:
"""Class to sample data from a model."""

@@ -51,6 +63,22 @@ def _square_matrix(triangular_matrix):

return triangular_matrix

def _get_table_meta(self, metadata, table_name):
"""Return metadata get table meta for a given table name.

Args:
metadata (dict): Metadata for dataset.
table_name (str): Name of table to get metadata from.

Returns:
dict: Metadata for given table.
"""
for table in metadata['tables']:
if table['name'] == table_name:
return table

return None
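For context, the metadata layout this helper walks is a dict with a `tables` list, each entry carrying a `name` and its `fields` (used again by `_sample_valid_rows` below). A hypothetical example; the table and field names are made up:

```python
metadata = {
    'tables': [
        {
            'name': 'users',
            'fields': [
                {'name': 'country', 'type': 'categorical'},
                {'name': 'age', 'type': 'number', 'subtype': 'integer'},
            ],
        },
    ],
}

# sampler._get_table_meta(metadata, 'users') returns the first entry;
# an unknown table name falls through and returns None.
```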

def _prepare_sampled_covariance(self, covariance):
"""

@@ -79,11 +107,11 @@ def reset_indices_tables(sampled_tables):

return sampled_tables

def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows):
def transform_synthesized_rows(self, synthesized, table_name, num_rows):
"""Add primary key and reverse transform synthetized data.

Args:
synthesized_rows(pandas.DataFrame): Generated data from model
synthesized(pandas.DataFrame): Generated data from model
table_name(str): Name of the table.
num_rows(int): Number of rows sampled.

@@ -114,25 +142,24 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows):
' to generate {} samples.'.format(table_name, regex, num_rows)
)

synthesized_rows[primary_key] = pd.Series(values)
synthesized[primary_key] = pd.Series(values)

if (node['type'] == 'number') and (node['subtype'] == 'integer'):
synthesized_rows[primary_key] = pd.to_numeric(synthesized_rows[primary_key])

sample_info = (primary_key, synthesized_rows)
synthesized[primary_key] = pd.to_numeric(synthesized[primary_key])

sample_info = (primary_key, synthesized)
self.sampled = self.update_mapping_list(self.sampled, table_name, sample_info)

# filter out parameters
labels = list(self.dn.tables[table_name].data)

synthesized_rows = self._fill_text_columns(synthesized_rows, labels, table_name)
text_filled = self._fill_text_columns(synthesized, labels, table_name)

# reverse transform data
reversed_data = self.dn.ht.reverse_transform_table(synthesized_rows, orig_meta)
reversed_data = self.dn.ht.reverse_transform_table(text_filled, orig_meta)

synthesized_rows.update(reversed_data)
return synthesized_rows[labels]
synthesized.update(reversed_data)
return synthesized[labels]

def _get_parent_row(self, table_name):
parents = self.dn.get_parents(table_name)
@@ -331,23 +358,19 @@ def _unflatten_gaussian_copula(self, model_parameters):
'fitted': True,
'type': distribution_name
}
model_parameters['distribution'] = distribution_name

distribs = model_parameters['distribs']
if any([distribs[key]['std'] <= 0 for key in distribs]):
metadata = {
'name': 'std',
'type': 'number'
}
transformer = PositiveNumberTransformer(metadata)

model_parameters['distribution'] = distribution_name
for key in distribs:
distribs[key].update(distribution_kwargs)
metadata = {
'name': 'std',
'type': 'number'
}
transformer = PositiveNumberTransformer(metadata)

distribution_std = distribs[key]['std']
if distribution_std <= 0:
df = pd.DataFrame({'std': [distribution_std]})
distribs[key]['std'] = transformer.fit_transform(df)['std'].values[0]
for distribution in distribs.values():
distribution.update(distribution_kwargs)
df = pd.DataFrame({'std': [distribution['std']]})
distribution['std'] = transformer.transform(df).loc[0, 'std']

covariance = model_parameters['covariance']
covariance = self._prepare_sampled_covariance(covariance)
@@ -384,6 +407,75 @@ def unflatten_model(self, parent_row, table_name, parent_name):

return self.modeler.model.from_dict(model_parameters)

def __get_missing_valid_rows(self, synthesized, drop_indices, valid_rows, num_rows):
"""

Args:
synthesized (pandas.DataFrame)

Returns:
tuple[int, pandas.DataFrame]: Amount of missing values and actual valid rows
"""
valid_rows = pd.concat([valid_rows, synthesized[~drop_indices].copy()])
valid_rows = valid_rows.reset_index(drop=True)

missing_rows = num_rows - valid_rows.shape[0]

return missing_rows, valid_rows

def _sample_valid_rows(self, model, num_rows, table_name):
"""Sample using `model` and discard invalid values until having `num_rows`.

Args:
model (copula.multivariate.base): Fitted model.
num_rows (int): Number of rows to sample.
table_name (str): name of table to synthesize.

Returns:
pandas.DataFrame: Sampled valid rows, shape (num_rows, num_columns).
"""

if model and model.fitted:
synthesized = model.sample(num_rows)
valid_rows = pd.DataFrame(columns=synthesized.columns)
drop_indices = pd.Series(False, index=synthesized.index)

categorical_columns = []
table_metadata = self._get_table_meta(self.dn.meta, table_name)

for field in table_metadata['fields']:
if field['type'] == 'categorical':
column_name = field['name']
categorical_columns.append(column_name)
column = synthesized[column_name]
filtered_values = ((column < 0) | (column > 1))

if filtered_values.any():
drop_indices |= filtered_values

missing_rows, valid_rows = self.__get_missing_valid_rows(
synthesized, drop_indices, valid_rows, num_rows)

while missing_rows:
synthesized = model.sample(missing_rows)
drop_indices = pd.Series(False, index=synthesized.index)

for column_name in categorical_columns:
column = synthesized[column_name]
filtered_values = ((column < 0) | (column > 1))

if filtered_values.any():
drop_indices |= filtered_values

missing_rows, valid_rows = self.__get_missing_valid_rows(
synthesized, drop_indices, valid_rows, num_rows)

return valid_rows

else:
parents = bool(self.dn.get_parents(table_name))
raise ValueError(MODEL_ERROR_MESSAGES[parents])
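Reviewer-style summary of `_sample_valid_rows`: it is a rejection loop. Sample a batch, drop rows whose categorical columns fall outside [0, 1], keep the rest, and resample only the shortfall until `num_rows` valid rows have accumulated. A condensed, self-contained sketch of that loop; the `sample_fn` callable and `categorical_columns` list are stand-ins for the fitted model and the metadata lookup:

```python
import pandas as pd

def sample_valid(sample_fn, categorical_columns, num_rows):
    # Rejection sampling: keep only rows whose categorical columns lie in [0, 1].
    valid_rows = pd.DataFrame()
    missing = num_rows

    while missing > 0:
        candidate = sample_fn(missing)

        # Mark rows where any categorical column falls outside [0, 1].
        drop = pd.Series(False, index=candidate.index)
        for name in categorical_columns:
            drop |= (candidate[name] < 0) | (candidate[name] > 1)

        valid_rows = pd.concat([valid_rows, candidate[~drop]], ignore_index=True)
        missing = num_rows - len(valid_rows)

    return valid_rows
```

As in the method above, the loop only ever resamples the number of rows still missing, so valid rows from earlier batches are never thrown away.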

def sample_rows(self, table_name, num_rows):
"""Sample specified number of rows for specified table.

@@ -406,14 +498,7 @@ def sample_rows(self, table_name, num_rows):
# get parameters from parent to make model
model = self.unflatten_model(parent_row, table_name, random_parent)

# sample from that model
if model is not None and model.fitted:
synthesized_rows = model.sample(num_rows)
else:
raise ValueError(
'There was an error recreating models from parameters. '
'Sampling could not continue.'
)
synthesized_rows = self._sample_valid_rows(model, num_rows, table_name)

# add foreign key value to row
fk_val = parent_row.loc[0, fk]
@@ -426,13 +511,7 @@

else: # there is no parent
model = self.modeler.models[table_name]
if model.fitted:
synthesized_rows = model.sample(num_rows)
else:
raise ValueError(
'Modeler hasn\'t been fitted. '
'Please call Modeler.model_database() before sampling'
)
synthesized_rows = self._sample_valid_rows(model, num_rows, table_name)

return self.transform_synthesized_rows(synthesized_rows, table_name, num_rows)

@@ -506,14 +585,6 @@ def sample_all(self, num_rows=5):

return self.reset_indices_tables(sampled_data)

def _get_table_meta(self, meta, table_name):
"""Return metadata get table meta for a given table name"""
for table in meta['tables']:
if table['name'] == table_name:
return table

return None

def _fill_text_columns(self, row, labels, table_name):
"""Fill in the column values for every non numeric column that isn't the primary key.

10 changes: 5 additions & 5 deletions tests/sdv/test_modeler.py
@@ -36,9 +36,9 @@ def test__create_extension(self):
'covariance__1__0': 0.0,
'covariance__1__1': 1.4999999999999991,
'distribs__a__mean': 0.0,
'distribs__a__std': 0.001,
'distribs__a__std': -6.907755278982137,
'distribs__b__mean': 3.0,
'distribs__b__std': 1.632993161855452
'distribs__b__std': 0.4904146265058631
})

# Run
@@ -156,11 +156,11 @@ def test_flatten_model(self):
'covariance__2__1': -0.7500000000000003,
'covariance__2__2': 1.5000000000000007,
'distribs__0__mean': 0.33333333333333331,
'distribs__0__std': 0.47140452079103168,
'distribs__0__std': -0.7520386983881371,
'distribs__1__mean': 0.33333333333333331,
'distribs__1__std': 0.47140452079103168,
'distribs__1__std': -0.7520386983881371,
'distribs__2__mean': 0.33333333333333331,
'distribs__2__std': 0.47140452079103168
'distribs__2__std': -0.7520386983881371
})
data_navigator = mock.MagicMock()
modeler = Modeler(data_navigator)
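The updated expectations in these tests are the natural logs of the previous standard deviations, matching the log-scale flattening noted earlier. A quick check in plain numpy (approximate values in the comments):

```python
import numpy as np

# Old fixture stds -> new flattened values.
print(np.log(0.001))                # approx. -6.907755
print(np.log(1.632993161855452))    # approx. 0.490415
print(np.log(0.47140452079103168))  # approx. -0.752039
```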