Skip to content

Commit

Permalink
Merge pull request #87 from HDI-Project/81_enforce_sampled_values_cat…
Browse files Browse the repository at this point in the history
…egorical

Issue 81: enforce sampled values in range [0,1] for  categorical columns
  • Loading branch information
ManuelAlvarezC authored Feb 13, 2019
2 parents fa1adae + f97a4df commit b688a88
Show file tree
Hide file tree
Showing 4 changed files with 309 additions and 58 deletions.
10 changes: 10 additions & 0 deletions sdv/modeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from copulas import get_qualified_name
from copulas.multivariate import GaussianMultivariate, TreeTypes
from copulas.univariate import GaussianUnivariate
from rdt.transformers.positive_number import PositiveNumberTransformer

# Configure logger
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -162,6 +163,15 @@ def flatten_model(self, model, name=''):
values.append(row[:index + 1])

model.covariance = np.array(values)
if self.model_kwargs['distribution'] == get_qualified_name(DEFAULT_DISTRIBUTION):
transformer = PositiveNumberTransformer({
'name': 'field',
'type': 'number'
})

for distribution in model.distribs.values():
column = pd.DataFrame({'field': [distribution.std]})
distribution.std = transformer.reverse_transform(column).loc[0, 'field']

return pd.Series(self._flatten_dict(model.to_dict(), name))

Expand Down
165 changes: 118 additions & 47 deletions sdv/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@
GAUSSIAN_COPULA = 'copulas.multivariate.gaussian.GaussianMultivariate'


# Messages for the ValueError raised when a model cannot be sampled from.
# The key is whether the table being sampled has parent tables.
MODEL_ERROR_MESSAGES = {
    # The model was rebuilt from a parent row's parameters.
    True: (
        'There was an error recreating models from parameters. '
        'Sampling could not continue.'
    ),
    # The model is taken directly from the modeler.
    False: (
        "Modeler hasn't been fitted. "
        'Please call Modeler.model_database() before sampling'
    ),
}


class Sampler:
"""Class to sample data from a model."""

Expand Down Expand Up @@ -51,6 +63,22 @@ def _square_matrix(triangular_matrix):

return triangular_matrix

def _get_table_meta(self, metadata, table_name):
"""Return metadata get table meta for a given table name.
Args:
metadata (dict): Metadata for dataset.
table_name (str): Name of table to get metadata from.
Returns:
dict: Metadata for given table.
"""
for table in metadata['tables']:
if table['name'] == table_name:
return table

return None

def _prepare_sampled_covariance(self, covariance):
"""
Expand Down Expand Up @@ -79,11 +107,11 @@ def reset_indices_tables(sampled_tables):

return sampled_tables

def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows):
def transform_synthesized_rows(self, synthesized, table_name, num_rows):
"""Add primary key and reverse transform synthetized data.
Args:
synthesized_rows(pandas.DataFrame): Generated data from model
synthesized(pandas.DataFrame): Generated data from model
table_name(str): Name of the table.
num_rows(int): Number of rows sampled.
Expand Down Expand Up @@ -114,25 +142,24 @@ def transform_synthesized_rows(self, synthesized_rows, table_name, num_rows):
' to generate {} samples.'.format(table_name, regex, num_rows)
)

synthesized_rows[primary_key] = pd.Series(values)
synthesized[primary_key] = pd.Series(values)

if (node['type'] == 'number') and (node['subtype'] == 'integer'):
synthesized_rows[primary_key] = pd.to_numeric(synthesized_rows[primary_key])

sample_info = (primary_key, synthesized_rows)
synthesized[primary_key] = pd.to_numeric(synthesized[primary_key])

sample_info = (primary_key, synthesized)
self.sampled = self.update_mapping_list(self.sampled, table_name, sample_info)

# filter out parameters
labels = list(self.dn.tables[table_name].data)

synthesized_rows = self._fill_text_columns(synthesized_rows, labels, table_name)
text_filled = self._fill_text_columns(synthesized, labels, table_name)

# reverse transform data
reversed_data = self.dn.ht.reverse_transform_table(synthesized_rows, orig_meta)
reversed_data = self.dn.ht.reverse_transform_table(text_filled, orig_meta)

synthesized_rows.update(reversed_data)
return synthesized_rows[labels]
synthesized.update(reversed_data)
return synthesized[labels]

def _get_parent_row(self, table_name):
parents = self.dn.get_parents(table_name)
Expand Down Expand Up @@ -331,23 +358,19 @@ def _unflatten_gaussian_copula(self, model_parameters):
'fitted': True,
'type': distribution_name
}
model_parameters['distribution'] = distribution_name

distribs = model_parameters['distribs']
if any([distribs[key]['std'] <= 0 for key in distribs]):
metadata = {
'name': 'std',
'type': 'number'
}
transformer = PositiveNumberTransformer(metadata)

model_parameters['distribution'] = distribution_name
for key in distribs:
distribs[key].update(distribution_kwargs)
metadata = {
'name': 'std',
'type': 'number'
}
transformer = PositiveNumberTransformer(metadata)

distribution_std = distribs[key]['std']
if distribution_std <= 0:
df = pd.DataFrame({'std': [distribution_std]})
distribs[key]['std'] = transformer.fit_transform(df)['std'].values[0]
for distribution in distribs.values():
distribution.update(distribution_kwargs)
df = pd.DataFrame({'std': [distribution['std']]})
distribution['std'] = transformer.transform(df).loc[0, 'std']

covariance = model_parameters['covariance']
covariance = self._prepare_sampled_covariance(covariance)
Expand Down Expand Up @@ -384,6 +407,75 @@ def unflatten_model(self, parent_row, table_name, parent_name):

return self.modeler.model.from_dict(model_parameters)

def _get_missing_valid_rows(self, synthesized, drop_indices, valid_rows, num_rows):
"""
Args:
synthesized (pandas.DataFrame)
Returns:
tuple[int, pandas.DataFrame]: Amount of missing values and actual valid rows
"""
valid_rows = pd.concat([valid_rows, synthesized[~drop_indices].copy()])
valid_rows = valid_rows.reset_index(drop=True)

missing_rows = num_rows - valid_rows.shape[0]

return missing_rows, valid_rows

def _sample_valid_rows(self, model, num_rows, table_name):
"""Sample using `model` and discard invalid values until having `num_rows`.
Args:
model (copula.multivariate.base): Fitted model.
num_rows (int): Number of rows to sample.
table_name (str): name of table to synthesize.
Returns:
pandas.DataFrame: Sampled rows, shape (, num_rows)
"""

if model and model.fitted:
synthesized = model.sample(num_rows)
valid_rows = pd.DataFrame(columns=synthesized.columns)
drop_indices = pd.Series(False, index=synthesized.index)

categorical_columns = []
table_metadata = self._get_table_meta(self.dn.meta, table_name)

for field in table_metadata['fields']:
if field['type'] == 'categorical':
column_name = field['name']
categorical_columns.append(column_name)
column = synthesized[column_name]
filtered_values = ((column < 0) | (column > 1))

if filtered_values.any():
drop_indices |= filtered_values

missing_rows, valid_rows = self._get_missing_valid_rows(
synthesized, drop_indices, valid_rows, num_rows)

while missing_rows:
synthesized = model.sample(missing_rows)
drop_indices = pd.Series(False, index=synthesized.index)

for column_name in categorical_columns:
column = synthesized[column_name]
filtered_values = ((column < 0) | (column > 1))

if filtered_values.any():
drop_indices |= filtered_values

missing_rows, valid_rows = self._get_missing_valid_rows(
synthesized, drop_indices, valid_rows, num_rows)

return valid_rows

else:
parents = bool(self.dn.get_parents(table_name))
raise ValueError(MODEL_ERROR_MESSAGES[parents])

def sample_rows(self, table_name, num_rows):
"""Sample specified number of rows for specified table.
Expand All @@ -406,14 +498,7 @@ def sample_rows(self, table_name, num_rows):
# get parameters from parent to make model
model = self.unflatten_model(parent_row, table_name, random_parent)

# sample from that model
if model is not None and model.fitted:
synthesized_rows = model.sample(num_rows)
else:
raise ValueError(
'There was an error recreating models from parameters. '
'Sampling could not continue.'
)
synthesized_rows = self._sample_valid_rows(model, num_rows, table_name)

# add foreign key value to row
fk_val = parent_row.loc[0, fk]
Expand All @@ -426,13 +511,7 @@ def sample_rows(self, table_name, num_rows):

else: # there is no parent
model = self.modeler.models[table_name]
if model.fitted:
synthesized_rows = model.sample(num_rows)
else:
raise ValueError(
'Modeler hasn\'t been fitted. '
'Please call Modeler.model_database() before sampling'
)
synthesized_rows = self._sample_valid_rows(model, num_rows, table_name)

return self.transform_synthesized_rows(synthesized_rows, table_name, num_rows)

Expand Down Expand Up @@ -506,14 +585,6 @@ def sample_all(self, num_rows=5):

return self.reset_indices_tables(sampled_data)

def _get_table_meta(self, meta, table_name):
"""Return metadata get table meta for a given table name"""
for table in meta['tables']:
if table['name'] == table_name:
return table

return None

def _fill_text_columns(self, row, labels, table_name):
"""Fill in the column values for every non numeric column that isn't the primary key.
Expand Down
10 changes: 5 additions & 5 deletions tests/sdv/test_modeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ def test__create_extension(self):
'covariance__1__0': 0.0,
'covariance__1__1': 1.4999999999999991,
'distribs__a__mean': 0.0,
'distribs__a__std': 0.001,
'distribs__a__std': -6.907755278982137,
'distribs__b__mean': 3.0,
'distribs__b__std': 1.632993161855452
'distribs__b__std': 0.4904146265058631
})

# Run
Expand Down Expand Up @@ -156,11 +156,11 @@ def test_flatten_model(self):
'covariance__2__1': -0.7500000000000003,
'covariance__2__2': 1.5000000000000007,
'distribs__0__mean': 0.33333333333333331,
'distribs__0__std': 0.47140452079103168,
'distribs__0__std': -0.7520386983881371,
'distribs__1__mean': 0.33333333333333331,
'distribs__1__std': 0.47140452079103168,
'distribs__1__std': -0.7520386983881371,
'distribs__2__mean': 0.33333333333333331,
'distribs__2__std': 0.47140452079103168
'distribs__2__std': -0.7520386983881371
})
data_navigator = mock.MagicMock()
modeler = Modeler(data_navigator)
Expand Down
Loading

0 comments on commit b688a88

Please sign in to comment.