From b3c1f3fc0607071853a39e78134e48b151ad26eb Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Thu, 10 Feb 2022 20:25:36 -0500 Subject: [PATCH] fix integration tests --- sdv/tabular/base.py | 2 +- tests/integration/tabular/test_base.py | 463 +++++++++----------- tests/integration/tabular/test_copulagan.py | 105 ++--- tests/integration/tabular/test_copulas.py | 101 ++--- tests/integration/tabular/test_ctgan.py | 105 ++--- tests/integration/tabular/test_tvae.py | 101 ++--- tests/unit/tabular/test_base.py | 8 +- 7 files changed, 426 insertions(+), 459 deletions(-) diff --git a/sdv/tabular/base.py b/sdv/tabular/base.py index 0eea1a4c4..681225c17 100644 --- a/sdv/tabular/base.py +++ b/sdv/tabular/base.py @@ -372,7 +372,7 @@ def _make_condition_dfs(self, conditions): ] def _conditionally_sample_rows(self, dataframe, condition, transformed_condition, - max_tries=None, batch_size_per_try=None, float_rtol=None, + max_tries=None, batch_size_per_try=None, float_rtol=0.01, graceful_reject_sampling=True): num_rows = len(dataframe) sampled_rows = self._sample_batch( diff --git a/tests/integration/tabular/test_base.py b/tests/integration/tabular/test_base.py index 2b71462f5..93b7eafa0 100644 --- a/tests/integration/tabular/test_base.py +++ b/tests/integration/tabular/test_base.py @@ -6,6 +6,7 @@ from sdv.constraints import Unique, UniqueCombinations from sdv.constraints.tabular import GreaterThan +from sdv.sampling import Condition from sdv.tabular.copulagan import CopulaGAN from sdv.tabular.copulas import GaussianCopula from sdv.tabular.ctgan import CTGAN, TVAE @@ -18,25 +19,25 @@ ] -#@pytest.mark.parametrize('model', MODELS) -#def test_conditional_sampling_graceful_reject_sampling_True_dict(model): -# data = pd.DataFrame({ -# 'column1': list(range(100)), -# 'column2': list(range(100)), -# 'column3': list(range(100)) -# }) -# -# model.fit(data) -# conditions = { -# 'column1': 28, -# 'column2': 37, -# 'column3': 93 -# } -# -# with pytest.raises(ValueError): -# model.sample(1, conditions=conditions, graceful_reject_sampling=True) -# -# +@pytest.mark.parametrize('model', MODELS) +def test_conditional_sampling_graceful_reject_sampling_True_dict(model): + data = pd.DataFrame({ + 'column1': list(range(100)), + 'column2': list(range(100)), + 'column3': list(range(100)) + }) + + model.fit(data) + conditions = [Condition({ + 'column1': 28, + 'column2': 37, + 'column3': 93 + })] + + with pytest.raises(ValueError): + model.sample_conditions(conditions=conditions) + + #@pytest.mark.parametrize('model', MODELS) #def test_conditional_sampling_graceful_reject_sampling_True_dataframe(model): # data = pd.DataFrame({ @@ -54,44 +55,6 @@ # # with pytest.raises(ValueError): # model.sample(conditions=conditions, graceful_reject_sampling=True) -# -# -#@pytest.mark.parametrize('model', MODELS) -#def test_conditional_sampling_graceful_reject_sampling_False_dict(model): -# data = pd.DataFrame({ -# 'column1': list(range(100)), -# 'column2': list(range(100)), -# 'column3': list(range(100)) -# }) -# -# model.fit(data) -# conditions = { -# 'column1': 28, -# 'column2': 37, -# 'column3': 93 -# } -# -# with pytest.raises(ValueError): -# model.sample(1, conditions=conditions) -# -# -#@pytest.mark.parametrize('model', MODELS) -#def test_conditional_sampling_graceful_reject_sampling_False_dataframe(model): -# data = pd.DataFrame({ -# 'column1': list(range(100)), -# 'column2': list(range(100)), -# 'column3': list(range(100)) -# }) -# -# model.fit(data) -# conditions = pd.DataFrame({ -# 
'column1': [28], -# 'column2': [37], -# 'column3': [93] -# }) -# -# with pytest.raises(ValueError): -# model.sample(conditions=conditions) def test_fit_with_unique_constraint_on_data_with_only_index_column(): @@ -245,196 +208,196 @@ def test_fit_with_unique_constraint_on_data_subset(): assert samples["test_column"].is_unique -#@patch('sdv.tabular.copulas.copulas.multivariate.GaussianMultivariate', -# spec_set=GaussianMultivariate) -#def test_conditional_sampling_constraint_uses_reject_sampling(gm_mock): -# """Test that the ``sample`` method handles constraints with conditions. -# -# The ``sample`` method is expected to properly apply constraint -# transformations by dropping columns that cannot be conditonally sampled -# on due to them being part of a constraint if ``fit_columns_model`` -# is False. -# -# Setup: -# - The model is being passed a ``UniqueCombination`` constraint and then -# asked to sample with two conditions, one of which the constraint depends on. -# The constraint is expected to skip its transformations since only some of -# the columns are provided by the conditions and the model will use reject -# sampling to meet the constraint instead. -# -# Input: -# - Conditions -# Side Effects: -# - Correct columns to condition on are passed to underlying sample method -# """ -# # Setup -# constraint = UniqueCombinations( -# columns=['city', 'state'], -# handling_strategy='transform', -# fit_columns_model=False -# ) -# data = pd.DataFrame({ -# 'city': ['LA', 'SF', 'CHI', 'LA', 'LA'], -# 'state': ['CA', 'CA', 'IL', 'CA', 'CA'], -# 'age': [27, 28, 26, 21, 30] -# }) -# model = GaussianCopula(constraints=[constraint], categorical_transformer='label_encoding') -# sampled_numeric_data = [pd.DataFrame({ -# 'city#state.value': [0, 1, 2, 0, 0], -# 'age.value': [30, 30, 30, 30, 30] -# }), pd.DataFrame({ -# 'city#state.value': [1], -# 'age.value': [30] -# })] -# gm_mock.return_value.sample.side_effect = sampled_numeric_data -# model.fit(data) -# -# # Run -# conditions = {'age': 30, 'state': 'CA'} -# sampled_data = model.sample(5, conditions=conditions) -# -# # Assert -# expected_transformed_conditions = {'age.value': 30} -# expected_data = pd.DataFrame({ -# 'city': ['LA', 'SF', 'LA', 'LA', 'SF'], -# 'state': ['CA', 'CA', 'CA', 'CA', 'CA'], -# 'age': [30, 30, 30, 30, 30] -# }) -# sample_calls = model._model.sample.mock_calls -# assert len(sample_calls) == 2 -# model._model.sample.assert_any_call(5, conditions=expected_transformed_conditions) -# model._model.sample.assert_any_call(1, conditions=expected_transformed_conditions) -# pd.testing.assert_frame_equal(sampled_data, expected_data) -# -# -#@patch('sdv.tabular.copulas.copulas.multivariate.GaussianMultivariate', -# spec_set=GaussianMultivariate) -#def test_conditional_sampling_constraint_uses_columns_model(gm_mock): -# """Test that the ``sample`` method handles constraints with conditions. -# -# The ``sample`` method is expected to properly apply constraint -# transformations by sampling the missing columns for the constraint -# if ``fit_columns_model`` is True. -# -# Setup: -# - The model is being passed a ``UniqueCombination`` constraint and then -# asked to sample with two conditions, one of which the constraint depends on. -# The constraint will sample the columns it needs that are not present in -# the conditions and will then use constraint transformations to meet the -# requirements. 
-# -# Input: -# - Conditions -# Side Effects: -# - Correct columns to condition on are passed to underlying sample method -# """ -# # Setup -# constraint = UniqueCombinations( -# columns=['city', 'state'], -# handling_strategy='transform', -# fit_columns_model=True, -# ) -# data = pd.DataFrame({ -# 'city': ['LA', 'SF', 'CHI', 'LA', 'LA'], -# 'state': ['CA', 'CA', 'IL', 'CA', 'CA'], -# 'age': [27, 28, 26, 21, 30] -# }) -# model = GaussianCopula(constraints=[constraint], categorical_transformer='label_encoding') -# sampled_numeric_data = [pd.DataFrame({ -# 'city#state.value': [2], -# 'age.value': [30] -# }), pd.DataFrame({ -# 'city#state.value': [1, 1, 0, 0, 0], -# 'age.value': [30, 30, 30, 30, 30] -# }), pd.DataFrame({ -# 'city#state.value': [0, 0, 1, 1, 1], -# 'age.value': [30, 30, 30, 30, 30]}) -# ] -# gm_mock.return_value.sample.side_effect = sampled_numeric_data -# model.fit(data) -# -# # Run -# conditions = {'age': 30, 'state': 'CA'} -# sampled_data = model.sample(5, conditions=conditions) -# -# # Assert -# expected_states = pd.Series(['CA', 'CA', 'CA', 'CA', 'CA'], name='state') -# expected_ages = pd.Series([30, 30, 30, 30, 30], name='age') -# sample_calls = model._model.sample.mock_calls -# assert len(sample_calls) >= 2 and len(sample_calls) <= 3 -# assert all(c[2]['conditions']['age.value'] == 30 for c in sample_calls) -# assert all('city#state.value' in c[2]['conditions'] for c in sample_calls) -# pd.testing.assert_series_equal(sampled_data['age'], expected_ages) -# pd.testing.assert_series_equal(sampled_data['state'], expected_states) -# assert all(c in ('SF', 'LA') for c in sampled_data['city']) -# -# -#@patch('sdv.constraints.base.GaussianMultivariate', -# spec_set=GaussianMultivariate) -#def test_conditional_sampling_constraint_uses_columns_model_reject_sampling(column_model_mock): -# """Test that the ``sample`` method handles constraints with conditions. -# -# The ``sample`` method is expected to properly apply constraint -# transformations by sampling the missing columns for the constraint -# if ``fit_columns_model`` is True. All values sampled by the column -# model should be valid because reject sampling is used on any that aren't. -# -# Setup: -# - The model is being passed a ``GreaterThan`` constraint and then -# asked to sample with one condition. One of the constraint columns is -# the conditioned column. The ``GaussianMultivariate`` class is mocked -# so that the constraint's ``_column_model`` returns some invalid rows -# in order to test that the reject sampling is used. 
-# -# Input: -# - Conditions -# Side Effects: -# - Correct columns to condition on are passed to underlying sample method -# """ -# # Setup -# constraint = GreaterThan( -# low='age_joined', -# high='age', -# handling_strategy='transform', -# fit_columns_model=True, -# drop='high' -# ) -# data = pd.DataFrame({ -# 'age_joined': [22.0, 21.0, 15.0, 18.0, 29.0], -# 'age': [27.0, 28.0, 26.0, 21.0, 30.0], -# 'experience_years': [6.0, 7.0, 11.0, 3.0, 7.0], -# }) -# model = GaussianCopula(constraints=[constraint]) -# sampled_conditions = [ -# pd.DataFrame({ -# 'age_joined.value': [26.0, 18.0, 31.0, 29.0, 32.0], -# 'age.value': [30.0, 30.0, 30.0, 30.0, 30.0] -# }), -# pd.DataFrame({ -# 'age_joined.value': [28.0, 33.0, 31.0], -# 'age.value': [30.0, 30.0, 30.0] -# }), -# pd.DataFrame({ -# 'age_joined.value': [27.0], -# 'age.value': [30.0] -# }) -# ] -# -# column_model_mock.return_value.sample.side_effect = sampled_conditions -# model.fit(data) -# -# # Run -# conditions = {'age': 30.0} -# sampled_data = model.sample(5, conditions=conditions) -# -# # Assert -# assert len(column_model_mock.return_value.sample.mock_calls) == 3 -# -# expected_result = pd.DataFrame({ -# 'age_joined': [26.0, 18.0, 29.0, 28.0, 27.0], -# 'age': [30.0, 30.0, 30.0, 30.0, 30.0] -# }) -# pd.testing.assert_frame_equal( -# sampled_data[['age_joined', 'age']], -# expected_result[['age_joined', 'age']], -# ) +@patch('sdv.tabular.copulas.copulas.multivariate.GaussianMultivariate', + spec_set=GaussianMultivariate) +def test_conditional_sampling_constraint_uses_reject_sampling(gm_mock): + """Test that the ``sample`` method handles constraints with conditions. + + The ``sample`` method is expected to properly apply constraint + transformations by dropping columns that cannot be conditionally sampled + on due to them being part of a constraint if ``fit_columns_model`` + is False. + + Setup: + - The model is being passed a ``UniqueCombination`` constraint and then + asked to sample with two conditions, one of which the constraint depends on. + The constraint is expected to skip its transformations since only some of + the columns are provided by the conditions and the model will use reject + sampling to meet the constraint instead.
+ + Input: + - Conditions + Side Effects: + - Correct columns to condition on are passed to underlying sample method + """ + # Setup + constraint = UniqueCombinations( + columns=['city', 'state'], + handling_strategy='transform', + fit_columns_model=False + ) + data = pd.DataFrame({ + 'city': ['LA', 'SF', 'CHI', 'LA', 'LA'], + 'state': ['CA', 'CA', 'IL', 'CA', 'CA'], + 'age': [27, 28, 26, 21, 30] + }) + model = GaussianCopula(constraints=[constraint], categorical_transformer='label_encoding') + sampled_numeric_data = [pd.DataFrame({ + 'city#state.value': [0, 1, 2, 0, 0], + 'age.value': [30, 30, 30, 30, 30] + }), pd.DataFrame({ + 'city#state.value': [1], + 'age.value': [30] + })] + gm_mock.return_value.sample.side_effect = sampled_numeric_data + model.fit(data) + + # Run + conditions = [Condition({'age': 30, 'state': 'CA'}, num_rows=5)] + sampled_data = model.sample_conditions(conditions=conditions) + + # Assert + expected_transformed_conditions = {'age.value': 30} + expected_data = pd.DataFrame({ + 'city': ['LA', 'SF', 'LA', 'LA', 'SF'], + 'state': ['CA', 'CA', 'CA', 'CA', 'CA'], + 'age': [30, 30, 30, 30, 30] + }) + sample_calls = model._model.sample.mock_calls + assert len(sample_calls) == 2 + model._model.sample.assert_any_call(5, conditions=expected_transformed_conditions) + model._model.sample.assert_any_call(50, conditions=expected_transformed_conditions) + pd.testing.assert_frame_equal(sampled_data, expected_data) + + +@patch('sdv.tabular.copulas.copulas.multivariate.GaussianMultivariate', + spec_set=GaussianMultivariate) +def test_conditional_sampling_constraint_uses_columns_model(gm_mock): + """Test that the ``sample`` method handles constraints with conditions. + + The ``sample`` method is expected to properly apply constraint + transformations by sampling the missing columns for the constraint + if ``fit_columns_model`` is True. + + Setup: + - The model is being passed a ``UniqueCombination`` constraint and then + asked to sample with two conditions, one of which the constraint depends on. + The constraint will sample the columns it needs that are not present in + the conditions and will then use constraint transformations to meet the + requirements. 
+ + Input: + - Conditions + Side Effects: + - Correct columns to condition on are passed to underlying sample method + """ + # Setup + constraint = UniqueCombinations( + columns=['city', 'state'], + handling_strategy='transform', + fit_columns_model=True, + ) + data = pd.DataFrame({ + 'city': ['LA', 'SF', 'CHI', 'LA', 'LA'], + 'state': ['CA', 'CA', 'IL', 'CA', 'CA'], + 'age': [27, 28, 26, 21, 30] + }) + model = GaussianCopula(constraints=[constraint], categorical_transformer='label_encoding') + sampled_numeric_data = [pd.DataFrame({ + 'city#state.value': [2], + 'age.value': [30] + }), pd.DataFrame({ + 'city#state.value': [1, 1, 0, 0, 0], + 'age.value': [30, 30, 30, 30, 30] + }), pd.DataFrame({ + 'city#state.value': [0, 0, 1, 1, 1], + 'age.value': [30, 30, 30, 30, 30]}) + ] + gm_mock.return_value.sample.side_effect = sampled_numeric_data + model.fit(data) + + # Run + conditions = [Condition({'age': 30, 'state': 'CA'}, num_rows=5)] + sampled_data = model.sample_conditions(conditions=conditions) + + # Assert + expected_states = pd.Series(['CA', 'CA', 'CA', 'CA', 'CA'], name='state') + expected_ages = pd.Series([30, 30, 30, 30, 30], name='age') + sample_calls = model._model.sample.mock_calls + assert len(sample_calls) >= 2 and len(sample_calls) <= 3 + assert all(c[2]['conditions']['age.value'] == 30 for c in sample_calls) + assert all('city#state.value' in c[2]['conditions'] for c in sample_calls) + pd.testing.assert_series_equal(sampled_data['age'], expected_ages) + pd.testing.assert_series_equal(sampled_data['state'], expected_states) + assert all(c in ('SF', 'LA') for c in sampled_data['city']) + + +@patch('sdv.constraints.base.GaussianMultivariate', + spec_set=GaussianMultivariate) +def test_conditional_sampling_constraint_uses_columns_model_reject_sampling(column_model_mock): + """Test that the ``sample`` method handles constraints with conditions. + + The ``sample`` method is expected to properly apply constraint + transformations by sampling the missing columns for the constraint + if ``fit_columns_model`` is True. All values sampled by the column + model should be valid because reject sampling is used on any that aren't. + + Setup: + - The model is being passed a ``GreaterThan`` constraint and then + asked to sample with one condition. One of the constraint columns is + the conditioned column. The ``GaussianMultivariate`` class is mocked + so that the constraint's ``_column_model`` returns some invalid rows + in order to test that the reject sampling is used. 
+ + Input: + - Conditions + Side Effects: + - Correct columns to condition on are passed to underlying sample method + """ + # Setup + constraint = GreaterThan( + low='age_joined', + high='age', + handling_strategy='transform', + fit_columns_model=True, + drop='high' + ) + data = pd.DataFrame({ + 'age_joined': [22.0, 21.0, 15.0, 18.0, 29.0], + 'age': [27.0, 28.0, 26.0, 21.0, 30.0], + 'experience_years': [6.0, 7.0, 11.0, 3.0, 7.0], + }) + model = GaussianCopula(constraints=[constraint]) + sampled_conditions = [ + pd.DataFrame({ + 'age_joined.value': [26.0, 18.0, 31.0, 29.0, 32.0], + 'age.value': [30.0, 30.0, 30.0, 30.0, 30.0] + }), + pd.DataFrame({ + 'age_joined.value': [28.0, 33.0, 31.0], + 'age.value': [30.0, 30.0, 30.0] + }), + pd.DataFrame({ + 'age_joined.value': [27.0], + 'age.value': [30.0] + }) + ] + + column_model_mock.return_value.sample.side_effect = sampled_conditions + model.fit(data) + + # Run + conditions = [Condition({'age': 30.0}, num_rows=5)] + sampled_data = model.sample_conditions(conditions=conditions) + + # Assert + assert len(column_model_mock.return_value.sample.mock_calls) == 3 + + expected_result = pd.DataFrame({ + 'age_joined': [26.0, 18.0, 29.0, 28.0, 27.0], + 'age': [30.0, 30.0, 30.0, 30.0, 30.0] + }) + pd.testing.assert_frame_equal( + sampled_data[['age_joined', 'age']], + expected_result[['age_joined', 'age']], + ) diff --git a/tests/integration/tabular/test_copulagan.py b/tests/integration/tabular/test_copulagan.py index 449764bdc..bf7502d0e 100644 --- a/tests/integration/tabular/test_copulagan.py +++ b/tests/integration/tabular/test_copulagan.py @@ -3,6 +3,7 @@ from sdv.constraints import UniqueCombinations from sdv.demo import load_demo, load_tabular_demo +from sdv.sampling import Condition from sdv.tabular.copulagan import CopulaGAN @@ -89,23 +90,23 @@ def test_recreate(): assert (sampled.notnull().sum(axis=1) != 0).all() -#def test_conditional_sampling_dict(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10 -# }) -# -# model = CopulaGAN(epochs=1) -# model.fit(data) -# conditions = { -# "column2": "b" -# } -# sampled = model.sample(30, conditions=conditions) -# -# assert sampled.shape == data.shape -# assert set(sampled["column2"].unique()) == set(["b"]) -# -# +def test_conditional_sampling_dict(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10 + }) + + model = CopulaGAN(epochs=1) + model.fit(data) + conditions = [Condition({ + "column2": "b" + }, num_rows=30)] + sampled = model.sample_conditions(conditions=conditions) + + assert sampled.shape == data.shape + assert set(sampled["column2"].unique()) == set(["b"]) + + #def test_conditional_sampling_dataframe(): # data = pd.DataFrame({ # "column1": [1.0, 0.5, 2.5] * 10, @@ -123,41 +124,41 @@ def test_recreate(): # assert (sampled["column2"] == np.array(["b", "b", "b", "c", "c"])).all() # # -#def test_conditional_sampling_two_conditions(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10, -# "column3": ["d", "e", "f"] * 10 -# }) -# -# model = CopulaGAN(epochs=1) -# model.fit(data) -# conditions = { -# "column2": "b", -# "column3": "f" -# } -# samples = model.sample(5, conditions=conditions) -# assert list(samples.column2) == ['b'] * 5 -# assert list(samples.column3) == ['f'] * 5 -# -# -#def test_conditional_sampling_numerical(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10, -# "column3": ["d", "e", "f"] * 10 -# }) -# -# 
model = CopulaGAN(epochs=1) -# model.fit(data) -# conditions = { -# "column1": 1.0, -# } -# sampled = model.sample(5, conditions=conditions) -# -# assert list(sampled.column1) == [1.0] * 5 -# -# +def test_conditional_sampling_two_conditions(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10, + "column3": ["d", "e", "f"] * 10 + }) + + model = CopulaGAN(epochs=1) + model.fit(data) + conditions = [Condition({ + "column2": "b", + "column3": "f" + }, num_rows=5)] + samples = model.sample_conditions(conditions=conditions) + assert list(samples.column2) == ['b'] * 5 + assert list(samples.column3) == ['f'] * 5 + + +def test_conditional_sampling_numerical(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10, + "column3": ["d", "e", "f"] * 10 + }) + + model = CopulaGAN(epochs=1) + model.fit(data) + conditions = [Condition({ + "column1": 1.0, + }, num_rows=5)] + sampled = model.sample_conditions(conditions=conditions) + + assert list(sampled.column1) == [1.0] * 5 + + def test_unique_combination_constraint(): employees = load_tabular_demo() diff --git a/tests/integration/tabular/test_copulas.py b/tests/integration/tabular/test_copulas.py index 0e816c211..0c6770297 100644 --- a/tests/integration/tabular/test_copulas.py +++ b/tests/integration/tabular/test_copulas.py @@ -3,6 +3,7 @@ import pytest from sdv.demo import load_demo +from sdv.sampling import Condition from sdv.tabular.base import NonParametricError from sdv.tabular.copulas import GaussianCopula @@ -182,23 +183,23 @@ def test_ids_only(): assert ids_only.equals(sampled) -#def test_conditional_sampling_dict(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10 -# }) -# -# model = GaussianCopula() -# model.fit(data) -# conditions = { -# "column2": "b" -# } -# sampled = model.sample(30, conditions=conditions) -# -# assert sampled.shape == data.shape -# assert set(sampled["column2"].unique()) == set(["b"]) -# -# +def test_conditional_sampling_dict(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10 + }) + + model = GaussianCopula() + model.fit(data) + conditions = [Condition({ + "column2": "b" + }, num_rows=30)] + sampled = model.sample_conditions(conditions=conditions) + + assert sampled.shape == data.shape + assert set(sampled["column2"].unique()) == set(["b"]) + + #def test_conditional_sampling_dataframe(): # data = pd.DataFrame({ # "column1": [1.0, 0.5, 2.5] * 10, @@ -216,36 +217,36 @@ def test_ids_only(): # assert (sampled["column2"] == np.array(["b", "b", "b", "c", "c"])).all() # # -#def test_conditional_sampling_two_conditions(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10, -# "column3": ["d", "e", "f"] * 10 -# }) -# -# model = GaussianCopula() -# model.fit(data) -# conditions = { -# "column2": "b", -# "column3": "f" -# } -# samples = model.sample(5, conditions=conditions) -# assert list(samples.column2) == ['b'] * 5 -# assert list(samples.column3) == ['f'] * 5 -# -# -#def test_conditional_sampling_numerical(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10, -# "column3": ["d", "e", "f"] * 10 -# }) -# -# model = GaussianCopula() -# model.fit(data) -# conditions = { -# "column1": 1.0, -# } -# sampled = model.sample(5, conditions=conditions) -# -# assert list(sampled.column1) == [1.0] * 5 +def test_conditional_sampling_two_conditions(): + data = pd.DataFrame({ + 
"column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10, + "column3": ["d", "e", "f"] * 10 + }) + + model = GaussianCopula() + model.fit(data) + conditions = [Condition({ + "column2": "b", + "column3": "f" + }, num_rows=5)] + samples = model.sample_conditions(conditions=conditions) + assert list(samples.column2) == ['b'] * 5 + assert list(samples.column3) == ['f'] * 5 + + +def test_conditional_sampling_numerical(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10, + "column3": ["d", "e", "f"] * 10 + }) + + model = GaussianCopula() + model.fit(data) + conditions = [Condition({ + "column1": 1.0, + }, num_rows=5)] + sampled = model.sample_conditions(conditions=conditions) + + assert list(sampled.column1) == [1.0] * 5 diff --git a/tests/integration/tabular/test_ctgan.py b/tests/integration/tabular/test_ctgan.py index 53a5e7503..259550cfa 100644 --- a/tests/integration/tabular/test_ctgan.py +++ b/tests/integration/tabular/test_ctgan.py @@ -3,6 +3,7 @@ from sdv.constraints import UniqueCombinations from sdv.demo import load_demo, load_tabular_demo +from sdv.sampling import Condition from sdv.tabular.ctgan import CTGAN @@ -86,23 +87,23 @@ def test_recreate(): assert (sampled.notnull().sum(axis=1) != 0).all() -#def test_conditional_sampling_dict(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10 -# }) -# -# model = CTGAN(epochs=1) -# model.fit(data) -# conditions = { -# "column2": "b" -# } -# sampled = model.sample(30, conditions=conditions) -# -# assert sampled.shape == data.shape -# assert set(sampled["column2"].unique()) == set(["b"]) -# -# +def test_conditional_sampling_dict(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10 + }) + + model = CTGAN(epochs=1) + model.fit(data) + conditions = [Condition({ + "column2": "b" + }, num_rows=30)] + sampled = model.sample_conditions(conditions=conditions) + + assert sampled.shape == data.shape + assert set(sampled["column2"].unique()) == set(["b"]) + + #def test_conditional_sampling_dataframe(): # data = pd.DataFrame({ # "column1": [1.0, 0.5, 2.5] * 10, @@ -120,41 +121,41 @@ def test_recreate(): # assert (sampled["column2"] == np.array(["b", "b", "b", "c", "c"])).all() # # -#def test_conditional_sampling_two_conditions(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10, -# "column3": ["d", "e", "f"] * 10 -# }) -# -# model = CTGAN(epochs=1) -# model.fit(data) -# conditions = { -# "column2": "b", -# "column3": "f" -# } -# samples = model.sample(5, conditions=conditions) -# assert list(samples.column2) == ['b'] * 5 -# assert list(samples.column3) == ['f'] * 5 -# -# -#def test_conditional_sampling_numerical(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10, -# "column3": ["d", "e", "f"] * 10 -# }) -# -# model = CTGAN(epochs=1) -# model.fit(data) -# conditions = { -# "column1": 1.0, -# } -# sampled = model.sample(5, conditions=conditions) -# -# assert list(sampled.column1) == [1.0] * 5 -# -# +def test_conditional_sampling_two_conditions(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10, + "column3": ["d", "e", "f"] * 10 + }) + + model = CTGAN(epochs=1) + model.fit(data) + conditions = [Condition({ + "column2": "b", + "column3": "f" + }, num_rows=5)] + samples = model.sample_conditions(conditions=conditions) + assert list(samples.column2) == ['b'] * 5 + assert 
list(samples.column3) == ['f'] * 5 + + +def test_conditional_sampling_numerical(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10, + "column3": ["d", "e", "f"] * 10 + }) + + model = CTGAN(epochs=1) + model.fit(data) + conditions = [Condition({ + "column1": 1.0, + }, num_rows=5)] + sampled = model.sample_conditions(conditions=conditions) + + assert list(sampled.column1) == [1.0] * 5 + + def test_unique_combination_constraint(): employees = load_tabular_demo() diff --git a/tests/integration/tabular/test_tvae.py b/tests/integration/tabular/test_tvae.py index 25f888a37..a62b68a21 100644 --- a/tests/integration/tabular/test_tvae.py +++ b/tests/integration/tabular/test_tvae.py @@ -2,6 +2,7 @@ import pandas as pd from sdv.demo import load_demo +from sdv.sampling import Condition from sdv.tabular.ctgan import TVAE @@ -85,23 +86,23 @@ def test_recreate(): assert (sampled.notnull().sum(axis=1) != 0).all() -#def test_conditional_sampling_dict(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10 -# }) -# -# model = TVAE(epochs=1) -# model.fit(data) -# conditions = { -# "column2": "b" -# } -# sampled = model.sample(30, conditions=conditions) -# -# assert sampled.shape == data.shape -# assert set(sampled["column2"].unique()) == set(["b"]) -# -# +def test_conditional_sampling_dict(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10 + }) + + model = TVAE(epochs=1) + model.fit(data) + conditions = [Condition({ + "column2": "b" + }, num_rows=30)] + sampled = model.sample_conditions(conditions=conditions) + + assert sampled.shape == data.shape + assert set(sampled["column2"].unique()) == set(["b"]) + + #def test_conditional_sampling_dataframe(): # data = pd.DataFrame({ # "column1": [1.0, 0.5, 2.5] * 10, @@ -119,36 +120,36 @@ def test_recreate(): # assert (sampled["column2"] == np.array(["b", "b", "b", "c", "c"])).all() # # -#def test_conditional_sampling_two_conditions(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10, -# "column3": ["d", "e", "f"] * 10 -# }) -# -# model = TVAE(epochs=1) -# model.fit(data) -# conditions = { -# "column2": "b", -# "column3": "f" -# } -# samples = model.sample(5, conditions=conditions) -# assert list(samples.column2) == ['b'] * 5 -# assert list(samples.column3) == ['f'] * 5 -# -# -#def test_conditional_sampling_numerical(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10, -# "column3": ["d", "e", "f"] * 10 -# }) -# -# model = TVAE(epochs=1) -# model.fit(data) -# conditions = { -# "column1": 1.0, -# } -# sampled = model.sample(5, conditions=conditions) -# -# assert list(sampled.column1) == [1.0] * 5 +def test_conditional_sampling_two_conditions(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10, + "column3": ["d", "e", "f"] * 10 + }) + + model = TVAE(epochs=1) + model.fit(data) + conditions = [Condition({ + "column2": "b", + "column3": "f" + }, num_rows=5)] + samples = model.sample_conditions(conditions=conditions) + assert list(samples.column2) == ['b'] * 5 + assert list(samples.column3) == ['f'] * 5 + + +def test_conditional_sampling_numerical(): + data = pd.DataFrame({ + "column1": [1.0, 0.5, 2.5] * 10, + "column2": ["a", "b", "c"] * 10, + "column3": ["d", "e", "f"] * 10 + }) + + model = TVAE(epochs=1) + model.fit(data) + conditions = [Condition({ + "column1": 1.0, + }, num_rows=5)] + sampled = 
model.sample_conditions(conditions=conditions) + + assert list(sampled.column1) == [1.0] * 5 diff --git a/tests/unit/tabular/test_base.py b/tests/unit/tabular/test_base.py index e0fa11249..5b99d566e 100644 --- a/tests/unit/tabular/test_base.py +++ b/tests/unit/tabular/test_base.py @@ -382,7 +382,7 @@ def test__sample_with_conditions_empty_transformed_conditions(): pd.testing.assert_series_equal(args[0]['column1'], conditions_series) assert kwargs['on_missing_column'] == 'drop' model._metadata.transform.assert_called_once() - model._sample_batch.assert_called_with(5, 100, None, conditions, None, None) + model._sample_batch.assert_called_with(5, 100, None, conditions, None, 0.01) pd.testing.assert_frame_equal(output, expected_output) @@ -444,13 +444,13 @@ def test__sample_with_conditions_transform_conditions_correctly(): assert kwargs['on_missing_column'] == 'drop' model._metadata.transform.assert_called_once() model._sample_batch.assert_any_call( - 3, 100, None, {'column1': 25}, {'transformed_column': 50}, None + 3, 100, None, {'column1': 25}, {'transformed_column': 50}, 0.01 ) model._sample_batch.assert_any_call( - 1, 100, None, {'column1': 30}, {'transformed_column': 60}, None + 1, 100, None, {'column1': 30}, {'transformed_column': 60}, 0.01 ) model._sample_batch.assert_any_call( - 1, 100, None, {'column1': 30}, {'transformed_column': 70}, None + 1, 100, None, {'column1': 30}, {'transformed_column': 70}, 0.01 )
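
For readers tracking the API migration in this patch, the sketch below shows the Condition-based conditional sampling call that the integration tests above now exercise. It assumes an SDV build in which sdv.sampling.Condition and the sample_conditions method behave as those tests expect; the data and column values are illustrative only, mirroring the test fixtures.

import pandas as pd

from sdv.sampling import Condition
from sdv.tabular.copulas import GaussianCopula

# Toy data mirroring the fixtures used in the integration tests above.
data = pd.DataFrame({
    'column1': [1.0, 0.5, 2.5] * 10,
    'column2': ['a', 'b', 'c'] * 10,
})

model = GaussianCopula()
model.fit(data)

# Old form removed by this patch: model.sample(30, conditions={'column2': 'b'})
# New form: each Condition pairs the fixed column values with the number of rows to draw.
conditions = [Condition({'column2': 'b'}, num_rows=30)]
sampled = model.sample_conditions(conditions=conditions)

# Every sampled row should honor the condition.
assert set(sampled['column2'].unique()) == {'b'}

As in the graceful-reject-sampling test above, a condition that cannot be satisfied is expected to surface as a ValueError from sample_conditions rather than as a silently truncated result.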