Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 58: Improve gaussian copula parameter sampling #85

Merged
merged 5 commits into from
Feb 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions sdv/modeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,7 @@ def _flatten_dict(cls, nested, prefix=''):

return result

@classmethod
def flatten_model(cls, model, name=''):
def flatten_model(self, model, name=''):
"""Flatten a model's parameters into an array.

Args:
Expand All @@ -155,8 +154,16 @@ def flatten_model(cls, model, name=''):
Returns:
pd.Series: parameters for model
"""
if self.model == DEFAULT_MODEL:
values = []
triangle = np.tril(model.covariance)

for index, row in enumerate(triangle.tolist()):
values.append(row[:index + 1])

model.covariance = np.array(values)

return pd.Series(cls._flatten_dict(model.to_dict(), name))
return pd.Series(self._flatten_dict(model.to_dict(), name))

def get_foreign_key(self, fields, primary):
"""Get foreign key from primary key.
Expand Down
170 changes: 147 additions & 23 deletions sdv/sampler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import random

import numpy as np
import pandas as pd
from copulas import get_qualified_name
from rdt.transformers.positive_number import PositiveNumberTransformer

import exrex

Expand All @@ -18,6 +20,50 @@ def __init__(self, data_navigator, modeler):
self.sampled = {} # table_name -> [(primary_key, generated_row)]
self.primary_key = {}

@staticmethod
def update_mapping_list(mapping, key, value):
    """Add ``value`` to the list stored under ``key`` in ``mapping``.

    When ``key`` is missing, or the value stored under it is falsy, a new
    single-element list is stored instead.

    Args:
        mapping (dict): Mapping from keys to lists of values.
        key: Key whose list receives ``value``.
        value: Item to append.

    Returns:
        dict: The same ``mapping`` object, updated in place.
    """
    current = mapping.get(key)

    # Guard clause: nothing usable stored yet, so start a fresh list.
    if not current:
        mapping[key] = [value]
        return mapping

    current.append(value)
    return mapping

@staticmethod
def _square_matrix(triangular_matrix):
"""Fill with zeros a triangular matrix to reshape it to a square one.

Args:
triangular_matrix (list[list[float]]): Array of arrays of

Returns:
list: Square matrix.
"""
length = len(triangular_matrix)
zero = [0.0]

for item in triangular_matrix:
item.extend(zero * (length - len(item)))

return triangular_matrix

def _prepare_sampled_covariance(self, covariance):
"""

Args:
covariance (list): covariance after unflattening model parameters.

Result:
list[list]: symmetric Positive semi-definite matrix.
"""
covariance = np.array(self._square_matrix(covariance))
covariance = (covariance + covariance.T - (np.identity(covariance.shape[0]) * covariance))
return covariance

@staticmethod
def reset_indices_tables(sampled_tables):
"""Reset the indices of sampled tables.
Expand Down Expand Up @@ -216,6 +262,102 @@ def _unflatten_dict(self, flat, table_name=''):

return result

def _make_positive_definite(self, matrix):
"""Find the nearest positive-definite matrix to input

Args:
matrix (numpy.ndarray): Matrix to transform

Returns:
numpy.ndarray: Closest symetric positive-definite matrix.
"""
symetric_matrix = (matrix + matrix.T) / 2
_, s, V = np.linalg.svd(symetric_matrix)
symmetric_polar = np.dot(V.T, np.dot(np.diag(s), V))
A2 = (symetric_matrix + symmetric_polar) / 2
A3 = (A2 + A2.T) / 2

if self._check_matrix_symmetric_positive_definite(A3):
return A3

spacing = np.spacing(np.linalg.norm(matrix))
identity = np.eye(matrix.shape[0])
iterations = 1
while not self._check_matrix_symmetric_positive_definite(A3):
min_eigenvals = np.min(np.real(np.linalg.eigvals(A3)))
A3 += identity * (-min_eigenvals * iterations**2 + spacing)
iterations += 1

return A3

def _check_matrix_symmetric_positive_definite(self, matrix):
"""Checks if a matrix is symmetric positive-definite.

Args:
matrix (list or np.ndarray): Matrix to evaluate.

Returns:
bool
"""
try:
if len(matrix.shape) != 2 or matrix.shape[0] != matrix.shape[1]:
# Not 2-dimensional or square, so not simmetric.
return False

np.linalg.cholesky(matrix)
return True

except np.linalg.LinAlgError:
return False

def _unflatten_gaussian_copula(self, model_parameters):
"""Prepare unflattened model params to recreate Gaussian Multivariate instance.

The preparations consist basically in:
- Transform sampled negative standard deviations from distributions into positive numbers
- Ensure the covariance matrix is a valid symmetric positive-semidefinite matrix.
- Add string parameters kept inside the class (as they can't be modelled),
like `distribution_type`.

Args:
model_parameters (dict): Sampled and reestructured model parameters.

Returns:
dict: Model parameters ready to recreate the model.
"""

distribution_name = self.modeler.model_kwargs['distribution']
distribution_kwargs = {
'fitted': True,
'type': distribution_name
}

distribs = model_parameters['distribs']
if any([distribs[key]['std'] <= 0 for key in distribs]):
metadata = {
'name': 'std',
'type': 'number'
}
transformer = PositiveNumberTransformer(metadata)

model_parameters['distribution'] = distribution_name
for key in distribs:
distribs[key].update(distribution_kwargs)

distribution_std = distribs[key]['std']
if distribution_std <= 0:
df = pd.DataFrame({'std': [distribution_std]})
distribs[key]['std'] = transformer.fit_transform(df)['std'].values[0]

covariance = model_parameters['covariance']
covariance = self._prepare_sampled_covariance(covariance)
if not self._check_matrix_symmetric_positive_definite(covariance):
covariance = self._make_positive_definite(covariance)

model_parameters['covariance'] = covariance.tolist()

return model_parameters

def unflatten_model(self, parent_row, table_name, parent_name):
""" Takes the params from a generated parent row and creates a model from it.

Expand All @@ -231,22 +373,16 @@ def unflatten_model(self, parent_row, table_name, parent_name):
flat_parameters = parent_row.loc[:, columns]
flat_parameters = flat_parameters.rename(columns=new_columns).to_dict('records')[0]

model_dict = self._unflatten_dict(flat_parameters, table_name)
model_parameters = self._unflatten_dict(flat_parameters, table_name)
model_name = get_qualified_name(self.modeler.model)

model_dict['fitted'] = True
model_dict['type'] = model_name
model_parameters['fitted'] = True
model_parameters['type'] = model_name

if model_name == GAUSSIAN_COPULA:
distribution_name = self.modeler.model_kwargs['distribution']
model_dict['distribution'] = distribution_name
for key in model_dict['distribs']:
model_dict['distribs'][key].update({
'fitted': True,
'type': distribution_name
})
model_parameters = self._unflatten_gaussian_copula(model_parameters)

return self.modeler.model.from_dict(model_dict)
return self.modeler.model.from_dict(model_parameters)

def sample_rows(self, table_name, num_rows):
"""Sample specified number of rows for specified table.
Expand Down Expand Up @@ -414,15 +550,3 @@ def _fill_text_columns(self, row, labels, table_name):
row.loc[:, field['name']] = exrex.getone(regex)

return row

def update_mapping_list(self, mapping, key, value):
    """Append ``value`` to ``mapping[key]``, creating the list when needed.

    A new single-element list is stored when ``key`` is absent or the value
    stored under it is falsy.

    Args:
        mapping (dict): Mapping from keys to lists of values.
        key: Key whose list receives ``value``.
        value: Item to append.

    Returns:
        dict: The same ``mapping`` object, updated in place.
    """
    existing = mapping.get(key)

    if existing:
        existing.append(value)
        return mapping

    mapping[key] = [value]
    return mapping
8 changes: 3 additions & 5 deletions tests/sdv/test_modeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ def test__create_extension(self):

expected_result = pd.Series({
'covariance__0__0': 0.0,
'covariance__0__1': 0.0,
'covariance__1__0': 0.0,
'covariance__1__1': 1.4999999999999991,
'distribs__a__mean': 0.0,
Expand Down Expand Up @@ -151,11 +150,8 @@ def test_flatten_model(self):

expected_result = pd.Series({
'covariance__0__0': 1.5000000000000004,
'covariance__0__1': -0.7500000000000003,
'covariance__0__2': -0.7500000000000003,
'covariance__1__0': -0.7500000000000003,
'covariance__1__1': 1.5000000000000004,
'covariance__1__2': -0.7500000000000003,
'covariance__2__0': -0.7500000000000003,
'covariance__2__1': -0.7500000000000003,
'covariance__2__2': 1.5000000000000007,
Expand All @@ -166,9 +162,11 @@ def test_flatten_model(self):
'distribs__2__mean': 0.33333333333333331,
'distribs__2__std': 0.47140452079103168
})
data_navigator = mock.MagicMock()
modeler = Modeler(data_navigator)

# Run
result = Modeler.flatten_model(model)
result = modeler.flatten_model(model)

# Check
assert np.isclose(result, expected_result).all()
Expand Down
Loading