From c0a77230a8170ca5f75211bc8919476a9b408015 Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Tue, 22 Feb 2022 11:21:57 -0800 Subject: [PATCH] Add method to sample remaining columns (3/3) (#708) * Add method to sample remaining columns * update integration tests * add unit tests * update tutorials and docs * Enable batch sampling (#709) * Add batch sampling and progress bar * Make sure to close progress bar * Periodically write to file * add unit tests * cr comments * fix test --- docs/user_guides/single_table/copulagan.rst | 37 ++-- docs/user_guides/single_table/ctgan.rst | 37 ++-- .../single_table/gaussian_copula.rst | 37 ++-- docs/user_guides/single_table/tvae.rst | 36 ++-- sdv/tabular/base.py | 167 +++++++++++++++--- tests/integration/tabular/test_base.py | 35 ++-- tests/integration/tabular/test_copulagan.py | 64 +++---- tests/integration/tabular/test_copulas.py | 64 +++---- tests/integration/tabular/test_ctgan.py | 64 +++---- tests/integration/tabular/test_tvae.py | 64 +++---- tests/unit/tabular/test_base.py | 165 ++++++++++++++++- .../01_GaussianCopula_Model.ipynb | 35 ++-- .../single_table_data/02_CTGAN_Model.ipynb | 35 ++-- .../03_CopulaGAN_Model.ipynb | 35 ++-- .../single_table_data/04_TVAE_Model.ipynb | 35 ++-- 15 files changed, 615 insertions(+), 295 deletions(-) diff --git a/docs/user_guides/single_table/copulagan.rst b/docs/user_guides/single_table/copulagan.rst index 18e15dd4b..0256114fd 100644 --- a/docs/user_guides/single_table/copulagan.rst +++ b/docs/user_guides/single_table/copulagan.rst @@ -688,19 +688,23 @@ Conditional Sampling As the name implies, conditional sampling allows us to sample from a conditional distribution using the ``CopulaGAN`` model, which means we can generate only values that -satisfy certain conditions. These conditional values can be passed to the ``conditions`` -parameter in the ``sample`` method either as a dataframe or a dictionary. +satisfy certain conditions. 
These conditional values can be passed to the ``sample_conditions`` +method as a list of ``sdv.sampling.Condition`` objects or to the ``sample_remaining_columns`` method +as a dataframe. -In case a dictionary is passed, the model will generate as many rows as requested, -all of which will satisfy the specified conditions, such as ``gender = M``. +When specifying a ``sdv.sampling.Condition`` object, we can pass in the desired conditions +as a dictionary, as well as specify the number of desired rows for that condition. .. ipython:: python :okwarning: - conditions = { + from sdv.sampling import Condition + + condition = Condition({ 'gender': 'M' - } - model.sample(5, conditions=conditions) + }, num_rows=5) + + model.sample_conditions(conditions=[condition]) It's also possible to condition on multiple columns, such as @@ -709,14 +713,16 @@ It's also possible to condition on multiple columns, such as .. ipython:: python :okwarning: - conditions = { + condition = Condition({ 'gender': 'M', 'experience_years': 0 - } - model.sample(5, conditions=conditions) + }, num_rows=5) + model.sample_conditions(conditions=[condition]) -The ``conditions`` can also be passed as a dataframe. In that case, the model + +In the ``sample_remaining_columns`` method, ``conditions`` is +passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. For example, if we want to generate three @@ -731,7 +737,7 @@ following: conditions = pd.DataFrame({ 'gender': ['M', 'M', 'M', 'F', 'F', 'F'], }) - model.sample(conditions=conditions) + model.sample_remaining_columns(conditions) ``CopulaGAN`` also supports conditioning on continuous values, as long as the values @@ -741,10 +747,11 @@ dataset are within 0 and 1, ``CopulaGAN`` will not be able to set this value to .. 
ipython:: python :okwarning: - conditions = { + condition = Condition({ 'degree_perc': 70.0 - } - model.sample(5, conditions=conditions) + }, num_rows=5) + + model.sample_conditions(conditions=[condition]) .. note:: diff --git a/docs/user_guides/single_table/ctgan.rst b/docs/user_guides/single_table/ctgan.rst index a9a6c33f6..bab15c081 100644 --- a/docs/user_guides/single_table/ctgan.rst +++ b/docs/user_guides/single_table/ctgan.rst @@ -499,19 +499,23 @@ Conditional Sampling As the name implies, conditional sampling allows us to sample from a conditional distribution using the ``CTGAN`` model, which means we can generate only values that -satisfy certain conditions. These conditional values can be passed to the ``conditions`` -parameter in the ``sample`` method either as a dataframe or a dictionary. +satisfy certain conditions. These conditional values can be passed to the ``sample_conditions`` +method as a list of ``sdv.sampling.Condition`` objects or to the ``sample_remaining_columns`` +method as a dataframe. -In case a dictionary is passed, the model will generate as many rows as requested, -all of which will satisfy the specified conditions, such as ``gender = M``. +When specifying a ``sdv.sampling.Condition`` object, we can pass in the desired conditions +as a dictionary, as well as specify the number of desired rows for that condition. .. ipython:: python :okwarning: - conditions = { + from sdv.sampling import Condition + + condition = Condition({ 'gender': 'M' - } - model.sample(5, conditions=conditions) + }, num_rows=5) + + model.sample_conditions(conditions=[condition]) It's also possible to condition on multiple columns, such as @@ -520,14 +524,16 @@ It's also possible to condition on multiple columns, such as .. 
ipython:: python :okwarning: - conditions = { + condition = Condition({ 'gender': 'M', 'experience_years': 0 - } - model.sample(5, conditions=conditions) + }, num_rows=5) + model.sample_conditions(conditions=[condition]) -The ``conditions`` can also be passed as a dataframe. In that case, the model + +In the ``sample_remaining_columns`` method, ``conditions`` is +passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. For example, if we want to generate three @@ -542,7 +548,7 @@ following: conditions = pd.DataFrame({ 'gender': ['M', 'M', 'M', 'F', 'F', 'F'], }) - model.sample(conditions=conditions) + model.sample_remaining_columns(conditions) ``CTGAN`` also supports conditioning on continuous values, as long as the values @@ -552,10 +558,11 @@ dataset are within 0 and 1, ``CTGAN`` will not be able to set this value to 1000 .. ipython:: python :okwarning: - conditions = { + condition = Condition({ 'degree_perc': 70.0 - } - model.sample(5, conditions=conditions) + }, num_rows=5) + + model.sample_conditions(conditions=[condition]) .. note:: diff --git a/docs/user_guides/single_table/gaussian_copula.rst b/docs/user_guides/single_table/gaussian_copula.rst index 7491fa555..f9125ef72 100644 --- a/docs/user_guides/single_table/gaussian_copula.rst +++ b/docs/user_guides/single_table/gaussian_copula.rst @@ -648,19 +648,23 @@ Conditional Sampling As the name implies, conditional sampling allows us to sample from a conditional distribution using the ``GaussianCopula`` model, which means we can generate only values that -satisfy certain conditions. These conditional values can be passed to the ``conditions`` -parameter in the ``sample`` method either as a dataframe or a dictionary. +satisfy certain conditions. 
These conditional values can be passed to the ``sample_conditions`` +method as a list of ``sdv.sampling.Condition`` objects or to the ``sample_remaining_columns`` +method as a dataframe. -In case a dictionary is passed, the model will generate as many rows as requested, -all of which will satisfy the specified conditions, such as ``gender = M``. +When specifying a ``sdv.sampling.Condition`` object, we can pass in the desired conditions +as a dictionary, as well as specify the number of desired rows for that condition. .. ipython:: python :okwarning: - conditions = { + from sdv.sampling import Condition + + condition = Condition({ 'gender': 'M' - } - model.sample(5, conditions=conditions) + }, num_rows=5) + + model.sample_conditions(conditions=[condition]) It's also possible to condition on multiple columns, such as @@ -669,14 +673,16 @@ It's also possible to condition on multiple columns, such as .. ipython:: python :okwarning: - conditions = { + condition = Condition({ 'gender': 'M', 'experience_years': 0 - } - model.sample(5, conditions=conditions) + }, num_rows=5) + model.sample_conditions(conditions=[condition]) -The ``conditions`` can also be passed as a dataframe. In that case, the model + +In the ``sample_remaining_columns`` method, ``conditions`` is +passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. For example, if we want to generate three @@ -691,7 +697,7 @@ following: conditions = pd.DataFrame({ 'gender': ['M', 'M', 'M', 'F', 'F', 'F'], }) - model.sample(conditions=conditions) + model.sample_remaining_columns(conditions) ``GaussianCopula`` also supports conditioning on continuous values, as long as the values @@ -701,10 +707,11 @@ dataset are within 0 and 1, ``GaussianCopula`` will not be able to set this valu .. 
ipython:: python :okwarning: - conditions = { + condition = Condition({ 'degree_perc': 70.0 - } - model.sample(5, conditions=conditions) + }, num_rows=5) + + model.sample_conditions(conditions=[condition]) .. note:: diff --git a/docs/user_guides/single_table/tvae.rst b/docs/user_guides/single_table/tvae.rst index a0bfea217..7d94e66bd 100644 --- a/docs/user_guides/single_table/tvae.rst +++ b/docs/user_guides/single_table/tvae.rst @@ -484,19 +484,22 @@ Conditional Sampling As the name implies, conditional sampling allows us to sample from a conditional distribution using the ``TVAE`` model, which means we can generate only values that -satisfy certain conditions. These conditional values can be passed to the ``conditions`` -parameter in the ``sample`` method either as a dataframe or a dictionary. +satisfy certain conditions. These conditional values can be passed to the ``sample_conditions`` +method as a list of ``sdv.sampling.Condition`` objects or to the ``sample_remaining_columns`` +method as a dataframe. -In case a dictionary is passed, the model will generate as many rows as requested, -all of which will satisfy the specified conditions, such as ``gender = M``. +When specifying a ``sdv.sampling.Condition`` object, we can pass in the desired conditions as a dictionary, as well as specify the number of desired rows for that condition. .. ipython:: python :okwarning: - conditions = { + from sdv.sampling import Condition + + condition = Condition({ 'gender': 'M' - } - model.sample(5, conditions=conditions) + }, num_rows=5) + + model.sample_conditions(conditions=[condition]) It's also possible to condition on multiple columns, such as @@ -505,14 +508,16 @@ It's also possible to condition on multiple columns, such as .. 
ipython:: python :okwarning: - conditions = { + condition = Condition({ 'gender': 'M', 'experience_years': 0 - } - model.sample(5, conditions=conditions) + }, num_rows=5) + model.sample_conditions(conditions=[condition]) -The ``conditions`` can also be passed as a dataframe. In that case, the model + +In the ``sample_remaining_columns`` method, ``conditions`` is +passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. For example, if we want to generate three @@ -527,7 +532,7 @@ following: conditions = pd.DataFrame({ 'gender': ['M', 'M', 'M', 'F', 'F', 'F'], }) - model.sample(conditions=conditions) + model.sample_remaining_columns(conditions) ``TVAE`` also supports conditioning on continuous values, as long as the values @@ -537,10 +542,11 @@ dataset are within 0 and 1, ``TVAE`` will not be able to set this value to 1000. .. ipython:: python :okwarning: - conditions = { + condition = Condition({ 'degree_perc': 70.0 - } - model.sample(5, conditions=conditions) + }, num_rows=5) + + model.sample_conditions(conditions=[condition]) .. 
note:: diff --git a/sdv/tabular/base.py b/sdv/tabular/base.py index 01728e147..fe8b2cb03 100644 --- a/sdv/tabular/base.py +++ b/sdv/tabular/base.py @@ -1,6 +1,9 @@ """Base Class for tabular models.""" +import functools import logging +import math +import os import pickle import uuid from collections import defaultdict @@ -8,6 +11,7 @@ import numpy as np import pandas as pd +from tqdm import tqdm from sdv.errors import ConstraintsNotMetError from sdv.metadata import Table @@ -256,7 +260,8 @@ def _sample_rows(self, num_rows, conditions=None, transformed_conditions=None, return sampled, num_rows def _sample_batch(self, num_rows=None, max_tries=100, batch_size_per_try=None, - conditions=None, transformed_conditions=None, float_rtol=0.01): + conditions=None, transformed_conditions=None, float_rtol=0.01, + progress_bar=None, output_file_path=None): """Sample a batch of rows with the given conditions. This will enter a reject-sampling loop in which rows will be sampled until @@ -292,6 +297,12 @@ def _sample_batch(self, num_rows=None, max_tries=100, batch_size_per_try=None, The dictionary of conditioning values transformed to the model format. float_rtol (float): Maximum tolerance when considering a float match. + progress_bar (tqdm.tqdm or None): + The progress bar to update when sampling. If None, a new tqdm progress + bar will be created. + output_file_path (str or None): + The file to periodically write sampled rows to. If None, does not write + rows anywhere. 
Returns: pandas.DataFrame: @@ -300,21 +311,42 @@ def _sample_batch(self, num_rows=None, max_tries=100, batch_size_per_try=None, if not batch_size_per_try: batch_size_per_try = num_rows * 10 - sampled, num_valid = self._sample_rows( - num_rows, conditions, transformed_conditions, float_rtol) + if not progress_bar: + progress_bar = tqdm(total=num_rows) + + counter = 0 + num_valid = 0 + prev_num_valid = None + remaining = num_rows + sampled = pd.DataFrame() - counter = 1 while num_valid < num_rows: if counter >= max_tries: break - remaining = num_rows - num_valid - - LOGGER.info(f'{remaining} valid rows remaining. Resampling {batch_size_per_try} rows') + prev_num_valid = num_valid sampled, num_valid = self._sample_rows( batch_size_per_try, conditions, transformed_conditions, float_rtol, sampled, ) + num_increase = min(num_valid - prev_num_valid, remaining) + if num_increase > 0: + if output_file_path: + append_kwargs = {'mode': 'a', 'header': False} if os.path.exists( + output_file_path) else {} + sampled.head(min(len(sampled), num_rows)).tail(num_increase).to_csv( + output_file_path, + index=False, + **append_kwargs, + ) + + progress_bar.update(num_increase) + + remaining = num_rows - num_valid + if remaining > 0: + LOGGER.info( + f'{remaining} valid rows remaining. 
Resampling {batch_size_per_try} rows') + counter += 1 return sampled.head(min(len(sampled), num_rows)) @@ -345,7 +377,8 @@ def _make_condition_dfs(self, conditions): def _conditionally_sample_rows(self, dataframe, condition, transformed_condition, max_tries=None, batch_size_per_try=None, float_rtol=0.01, - graceful_reject_sampling=True): + graceful_reject_sampling=True, progress_bar=None, + output_file_path=None): num_rows = len(dataframe) sampled_rows = self._sample_batch( num_rows, @@ -354,6 +387,8 @@ def _conditionally_sample_rows(self, dataframe, condition, transformed_condition condition, transformed_condition, float_rtol, + progress_bar, + output_file_path, ) num_sampled_rows = len(sampled_rows) @@ -379,15 +414,21 @@ def _conditionally_sample_rows(self, dataframe, condition, transformed_condition return sampled_rows - def sample(self, num_rows, randomize_samples=True, conditions=None): + def sample(self, num_rows, randomize_samples=True, batch_size=None, output_file_path=None, + conditions=None): """Sample rows from this table. Args: num_rows (int): Number of rows to sample. This parameter is required. randomize_samples (bool): - Whether or not to use a a fixed seed when sampling. Defaults + Whether or not to use a fixed seed when sampling. Defaults to True. + batch_size (int or None): + The batch size to sample. Defaults to `num_rows`, if None. + output_file_path (str or None): + The file to periodically write sampled rows to. If None, does not + write rows anywhere. conditions: Deprecated argument. Use the `sample_conditions` method with `sdv.sampling.Condition` objects instead. @@ -405,9 +446,30 @@ def sample(self, num_rows, randomize_samples=True, conditions=None): if num_rows is None: raise ValueError('You must specify the number of rows to sample (e.g. 
num_rows=100).') - return self._sample_batch(num_rows) + if num_rows == 0: + return pd.DataFrame() - def _sample_with_conditions(self, conditions, max_tries, batch_size_per_try): + output_file_path = self._validate_file_path(output_file_path) + + batch_size = min(batch_size, num_rows) if batch_size else num_rows + + sampled = [] + with tqdm(total=num_rows) as progress_bar: + progress_bar.set_description( + f'Sampling {num_rows} rows of data in batches of size {batch_size}') + for step in range(math.ceil(num_rows / batch_size)): + sampled_rows = self._sample_batch( + batch_size, + batch_size_per_try=batch_size, + progress_bar=progress_bar, + output_file_path=output_file_path, + ) + sampled.append(sampled_rows) + + return pd.concat(sampled, ignore_index=True) + + def _sample_with_conditions(self, conditions, max_tries, batch_size_per_try, + progress_bar=None, output_file_path=None): """Sample rows with conditions. Args: @@ -418,6 +480,11 @@ def _sample_with_conditions(self, conditions, max_tries, batch_size_per_try): batch_size_per_try (int): The batch size to use per attempt at sampling. Defaults to 10 times the number of rows. + progress_bar (tqdm.tqdm or None): + The progress bar to update. + output_file_path (str or None): + The file to periodically write sampled rows to. Defaults to + a temporary file, if None. Returns: pandas.DataFrame: @@ -429,13 +496,11 @@ def _sample_with_conditions(self, conditions, max_tries, batch_size_per_try): ValueError: If any of the following happens: * any of the conditions' columns are not valid. - * `graceful_reject_sampling` is `False` and not enough valid rows could be - sampled within `max_tries` trials. * no rows could be generated. """ for column in conditions.columns: if column not in self._metadata.get_fields(): - raise ValueError(f'Error: Unexpected column name `{column}`. ' + raise ValueError(f'Unexpected column name `{column}`. 
' f'Use a column name that was present in the original data.') try: @@ -468,6 +533,8 @@ def _sample_with_conditions(self, conditions, max_tries, batch_size_per_try): None, max_tries, batch_size_per_try, + progress_bar=progress_bar, + output_file_path=output_file_path, ) all_sampled_rows.append(sampled_rows) else: @@ -484,6 +551,8 @@ def _sample_with_conditions(self, conditions, max_tries, batch_size_per_try): transformed_condition, max_tries, batch_size_per_try, + progress_bar=progress_bar, + output_file_path=output_file_path, ) all_sampled_rows.append(sampled_rows) @@ -496,7 +565,7 @@ def _sample_with_conditions(self, conditions, max_tries, batch_size_per_try): return all_sampled_rows def sample_conditions(self, conditions, max_tries=100, batch_size_per_try=None, - randomize_samples=True): + randomize_samples=True, output_file_path=None): """Sample rows from this table with the given conditions. Args: @@ -510,8 +579,11 @@ def sample_conditions(self, conditions, max_tries=100, batch_size_per_try=None, The batch size to use per attempt at sampling. Defaults to 10 times the number of rows. randomize_samples (bool): - Whether or not to use a a fixed seed when sampling. Defaults + Whether or not to use a fixed seed when sampling. Defaults to True. + output_file_path (str or None): + The file to periodically write sampled rows to. Defaults to + a temporary file, if None. Returns: pandas.DataFrame: @@ -525,13 +597,64 @@ def sample_conditions(self, conditions, max_tries=100, batch_size_per_try=None, * any of the conditions' columns are not valid. * no rows could be generated. 
""" + output_file_path = self._validate_file_path(output_file_path) + + num_rows = functools.reduce( + lambda num_rows, condition: condition.get_num_rows() + num_rows, conditions, 0) conditions = self._make_condition_dfs(conditions) - sampled = pd.DataFrame() - for condition_dataframe in conditions: - sampled_for_condition = self._sample_with_conditions( - condition_dataframe, max_tries, batch_size_per_try) - sampled = pd.concat([sampled, sampled_for_condition], ignore_index=True) + with tqdm(total=num_rows) as progress_bar: + sampled = pd.DataFrame() + for condition_dataframe in conditions: + sampled_for_condition = self._sample_with_conditions( + condition_dataframe, + max_tries, + batch_size_per_try, + progress_bar, + output_file_path, + ) + sampled = pd.concat([sampled, sampled_for_condition], ignore_index=True) + + return sampled + + def sample_remaining_columns(self, known_columns, max_tries=100, batch_size_per_try=None, + randomize_samples=True, output_file_path=None): + """Sample rows from this table. + + Args: + known_columns (pandas.DataFrame): + A pandas.DataFrame with the columns that are already known. The output + is a DataFrame such that each row in the output is sampled + conditionally on the corresponding row in the input. + max_tries (int): + Number of times to try sampling discarded rows. Defaults to 100. + batch_size_per_try (int): + The batch size to use per attempt at sampling. Defaults to 10 times + the number of rows. + randomize_samples (bool): + Whether or not to use a fixed seed when sampling. Defaults + to True. + output_file_path (str or None): + The file to periodically write sampled rows to. Defaults to + a temporary file, if None. + + Returns: + pandas.DataFrame: + Sampled data. + + Raises: + ConstraintsNotMetError: + If the conditions are not valid for the given constraints. + ValueError: + If any of the following happens: + * any of the conditions' columns are not valid. + * no rows could be generated. 
+ """ + output_file_path = self._validate_file_path(output_file_path) + + with tqdm(total=len(known_columns)) as progress_bar: + sampled = self._sample_with_conditions( + known_columns, max_tries, batch_size_per_try, progress_bar, output_file_path) return sampled diff --git a/tests/integration/tabular/test_base.py b/tests/integration/tabular/test_base.py index 93b7eafa0..b90d4f4ca 100644 --- a/tests/integration/tabular/test_base.py +++ b/tests/integration/tabular/test_base.py @@ -38,23 +38,23 @@ def test_conditional_sampling_graceful_reject_sampling_True_dict(model): model.sample_conditions(conditions=conditions) -#@pytest.mark.parametrize('model', MODELS) -#def test_conditional_sampling_graceful_reject_sampling_True_dataframe(model): -# data = pd.DataFrame({ -# 'column1': list(range(100)), -# 'column2': list(range(100)), -# 'column3': list(range(100)) -# }) -# -# model.fit(data) -# conditions = pd.DataFrame({ -# 'column1': [28], -# 'column2': [37], -# 'column3': [93] -# }) -# -# with pytest.raises(ValueError): -# model.sample(conditions=conditions, graceful_reject_sampling=True) +@pytest.mark.parametrize('model', MODELS) +def test_conditional_sampling_graceful_reject_sampling_True_dataframe(model): + data = pd.DataFrame({ + 'column1': list(range(100)), + 'column2': list(range(100)), + 'column3': list(range(100)) + }) + + model.fit(data) + conditions = pd.DataFrame({ + 'column1': [28], + 'column2': [37], + 'column3': [93] + }) + + with pytest.raises(ValueError): + model.sample_remaining_columns(conditions) def test_fit_with_unique_constraint_on_data_with_only_index_column(): @@ -265,7 +265,6 @@ def test_conditional_sampling_constraint_uses_reject_sampling(gm_mock): }) sample_calls = model._model.sample.mock_calls assert len(sample_calls) == 2 - model._model.sample.assert_any_call(5, conditions=expected_transformed_conditions) model._model.sample.assert_any_call(50, conditions=expected_transformed_conditions) pd.testing.assert_frame_equal(sampled_data, 
expected_data) diff --git a/tests/integration/tabular/test_copulagan.py b/tests/integration/tabular/test_copulagan.py index bf7502d0e..9f10c2754 100644 --- a/tests/integration/tabular/test_copulagan.py +++ b/tests/integration/tabular/test_copulagan.py @@ -92,50 +92,50 @@ def test_recreate(): def test_conditional_sampling_dict(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10 }) model = CopulaGAN(epochs=1) model.fit(data) conditions = [Condition({ - "column2": "b" + 'column2': 'b' }, num_rows=30)] sampled = model.sample_conditions(conditions=conditions) assert sampled.shape == data.shape - assert set(sampled["column2"].unique()) == set(["b"]) - - -#def test_conditional_sampling_dataframe(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10 -# }) -# -# model = CopulaGAN(epochs=1) -# model.fit(data) -# conditions = pd.DataFrame({ -# "column2": ["b", "b", "b", "c", "c"] -# }) -# sampled = model.sample(conditions=conditions) -# -# assert sampled.shape[0] == len(conditions["column2"]) -# assert (sampled["column2"] == np.array(["b", "b", "b", "c", "c"])).all() -# -# + assert set(sampled['column2'].unique()) == set(['b']) + + +def test_conditional_sampling_dataframe(): + data = pd.DataFrame({ + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10 + }) + + model = CopulaGAN(epochs=1) + model.fit(data) + conditions = pd.DataFrame({ + 'column2': ['b', 'b', 'b', 'c', 'c'] + }) + sampled = model.sample_remaining_columns(conditions) + + assert sampled.shape[0] == len(conditions['column2']) + assert (sampled['column2'] == np.array(['b', 'b', 'b', 'c', 'c'])).all() + + def test_conditional_sampling_two_conditions(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10, - "column3": ["d", "e", "f"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10, + 
'column3': ['d', 'e', 'f'] * 10 }) model = CopulaGAN(epochs=1) model.fit(data) conditions = [Condition({ - "column2": "b", - "column3": "f" + 'column2': 'b', + 'column3': 'f' }, num_rows=5)] samples = model.sample_conditions(conditions=conditions) assert list(samples.column2) == ['b'] * 5 @@ -144,15 +144,15 @@ def test_conditional_sampling_two_conditions(): def test_conditional_sampling_numerical(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10, - "column3": ["d", "e", "f"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10, + 'column3': ['d', 'e', 'f'] * 10 }) model = CopulaGAN(epochs=1) model.fit(data) conditions = [Condition({ - "column1": 1.0, + 'column1': 1.0, }, num_rows=5)] sampled = model.sample_conditions(conditions=conditions) diff --git a/tests/integration/tabular/test_copulas.py b/tests/integration/tabular/test_copulas.py index 0c6770297..6269f7bd9 100644 --- a/tests/integration/tabular/test_copulas.py +++ b/tests/integration/tabular/test_copulas.py @@ -185,50 +185,50 @@ def test_ids_only(): def test_conditional_sampling_dict(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10 }) model = GaussianCopula() model.fit(data) conditions = [Condition({ - "column2": "b" + 'column2': 'b' }, num_rows=30)] sampled = model.sample_conditions(conditions=conditions) assert sampled.shape == data.shape - assert set(sampled["column2"].unique()) == set(["b"]) - - -#def test_conditional_sampling_dataframe(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10 -# }) -# -# model = GaussianCopula() -# model.fit(data) -# conditions = pd.DataFrame({ -# "column2": ["b", "b", "b", "c", "c"] -# }) -# sampled = model.sample(conditions=conditions) -# -# assert sampled.shape[0] == len(conditions["column2"]) -# assert (sampled["column2"] == np.array(["b", "b", "b", 
"c", "c"])).all() -# -# + assert set(sampled['column2'].unique()) == set(['b']) + + +def test_conditional_sampling_dataframe(): + data = pd.DataFrame({ + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10 + }) + + model = GaussianCopula() + model.fit(data) + conditions = pd.DataFrame({ + 'column2': ['b', 'b', 'b', 'c', 'c'] + }) + sampled = model.sample_remaining_columns(conditions) + + assert sampled.shape[0] == len(conditions['column2']) + assert (sampled['column2'] == np.array(['b', 'b', 'b', 'c', 'c'])).all() + + def test_conditional_sampling_two_conditions(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10, - "column3": ["d", "e", "f"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10, + 'column3': ['d', 'e', 'f'] * 10 }) model = GaussianCopula() model.fit(data) conditions = [Condition({ - "column2": "b", - "column3": "f" + 'column2': 'b', + 'column3': 'f' }, num_rows=5)] samples = model.sample_conditions(conditions=conditions) assert list(samples.column2) == ['b'] * 5 @@ -237,15 +237,15 @@ def test_conditional_sampling_two_conditions(): def test_conditional_sampling_numerical(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10, - "column3": ["d", "e", "f"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10, + 'column3': ['d', 'e', 'f'] * 10 }) model = GaussianCopula() model.fit(data) conditions = [Condition({ - "column1": 1.0, + 'column1': 1.0, }, num_rows=5)] sampled = model.sample_conditions(conditions=conditions) diff --git a/tests/integration/tabular/test_ctgan.py b/tests/integration/tabular/test_ctgan.py index 259550cfa..62d31bb32 100644 --- a/tests/integration/tabular/test_ctgan.py +++ b/tests/integration/tabular/test_ctgan.py @@ -89,50 +89,50 @@ def test_recreate(): def test_conditional_sampling_dict(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10 + 'column1': 
[1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10 }) model = CTGAN(epochs=1) model.fit(data) conditions = [Condition({ - "column2": "b" + 'column2': 'b' }, num_rows=30)] sampled = model.sample_conditions(conditions=conditions) assert sampled.shape == data.shape - assert set(sampled["column2"].unique()) == set(["b"]) - - -#def test_conditional_sampling_dataframe(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10 -# }) -# -# model = CTGAN(epochs=1) -# model.fit(data) -# conditions = pd.DataFrame({ -# "column2": ["b", "b", "b", "c", "c"] -# }) -# sampled = model.sample(conditions=conditions) -# -# assert sampled.shape[0] == len(conditions["column2"]) -# assert (sampled["column2"] == np.array(["b", "b", "b", "c", "c"])).all() -# -# + assert set(sampled['column2'].unique()) == set(['b']) + + +def test_conditional_sampling_dataframe(): + data = pd.DataFrame({ + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10 + }) + + model = CTGAN(epochs=1) + model.fit(data) + conditions = pd.DataFrame({ + 'column2': ['b', 'b', 'b', 'c', 'c'] + }) + sampled = model.sample_remaining_columns(conditions) + + assert sampled.shape[0] == len(conditions['column2']) + assert (sampled['column2'] == np.array(['b', 'b', 'b', 'c', 'c'])).all() + + def test_conditional_sampling_two_conditions(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10, - "column3": ["d", "e", "f"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10, + 'column3': ['d', 'e', 'f'] * 10 }) model = CTGAN(epochs=1) model.fit(data) conditions = [Condition({ - "column2": "b", - "column3": "f" + 'column2': 'b', + 'column3': 'f' }, num_rows=5)] samples = model.sample_conditions(conditions=conditions) assert list(samples.column2) == ['b'] * 5 @@ -141,15 +141,15 @@ def test_conditional_sampling_two_conditions(): def test_conditional_sampling_numerical(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 
2.5] * 10, - "column2": ["a", "b", "c"] * 10, - "column3": ["d", "e", "f"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10, + 'column3': ['d', 'e', 'f'] * 10 }) model = CTGAN(epochs=1) model.fit(data) conditions = [Condition({ - "column1": 1.0, + 'column1': 1.0, }, num_rows=5)] sampled = model.sample_conditions(conditions=conditions) diff --git a/tests/integration/tabular/test_tvae.py b/tests/integration/tabular/test_tvae.py index a62b68a21..b15bd465c 100644 --- a/tests/integration/tabular/test_tvae.py +++ b/tests/integration/tabular/test_tvae.py @@ -88,50 +88,50 @@ def test_recreate(): def test_conditional_sampling_dict(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10 }) model = TVAE(epochs=1) model.fit(data) conditions = [Condition({ - "column2": "b" + 'column2': 'b' }, num_rows=30)] sampled = model.sample_conditions(conditions=conditions) assert sampled.shape == data.shape - assert set(sampled["column2"].unique()) == set(["b"]) - - -#def test_conditional_sampling_dataframe(): -# data = pd.DataFrame({ -# "column1": [1.0, 0.5, 2.5] * 10, -# "column2": ["a", "b", "c"] * 10 -# }) -# -# model = TVAE(epochs=1) -# model.fit(data) -# conditions = pd.DataFrame({ -# "column2": ["b", "b", "b", "c", "c"] -# }) -# sampled = model.sample(conditions=conditions) -# -# assert sampled.shape[0] == len(conditions["column2"]) -# assert (sampled["column2"] == np.array(["b", "b", "b", "c", "c"])).all() -# -# + assert set(sampled['column2'].unique()) == set(['b']) + + +def test_conditional_sampling_dataframe(): + data = pd.DataFrame({ + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10 + }) + + model = TVAE(epochs=1) + model.fit(data) + conditions = pd.DataFrame({ + 'column2': ['b', 'b', 'b', 'c', 'c'] + }) + sampled = model.sample_remaining_columns(conditions) + + assert sampled.shape[0] == len(conditions['column2']) + assert 
(sampled['column2'] == np.array(['b', 'b', 'b', 'c', 'c'])).all() + + def test_conditional_sampling_two_conditions(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10, - "column3": ["d", "e", "f"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10, + 'column3': ['d', 'e', 'f'] * 10 }) model = TVAE(epochs=1) model.fit(data) conditions = [Condition({ - "column2": "b", - "column3": "f" + 'column2': 'b', + 'column3': 'f' }, num_rows=5)] samples = model.sample_conditions(conditions=conditions) assert list(samples.column2) == ['b'] * 5 @@ -140,15 +140,15 @@ def test_conditional_sampling_two_conditions(): def test_conditional_sampling_numerical(): data = pd.DataFrame({ - "column1": [1.0, 0.5, 2.5] * 10, - "column2": ["a", "b", "c"] * 10, - "column3": ["d", "e", "f"] * 10 + 'column1': [1.0, 0.5, 2.5] * 10, + 'column2': ['a', 'b', 'c'] * 10, + 'column3': ['d', 'e', 'f'] * 10 }) model = TVAE(epochs=1) model.fit(data) conditions = [Condition({ - "column1": 1.0, + 'column1': 1.0, }, num_rows=5)] sampled = model.sample_conditions(conditions=conditions) diff --git a/tests/unit/tabular/test_base.py b/tests/unit/tabular/test_base.py index 5b99d566e..b654797e8 100644 --- a/tests/unit/tabular/test_base.py +++ b/tests/unit/tabular/test_base.py @@ -1,4 +1,4 @@ -from unittest.mock import Mock, call, patch +from unittest.mock import ANY, Mock, call, patch import pandas as pd import pytest @@ -69,6 +69,8 @@ def test__sample_with_conditions_no_transformed_columns(self): None, 100, None, + progress_bar=None, + output_file_path=None, ) pd.testing.assert_frame_equal(out, expected) @@ -171,6 +173,73 @@ def test_sample_num_rows_none(self): match=r'You must specify the number of rows to sample \(e.g. num_rows=100\)'): model.sample(num_rows) + def test_sample_batch_size(self): + """Test the `BaseTabularModel.sample` method with a valid `batch_size` argument. + + Expect that the expected calls to `_sample_batch` are made. 
+ + Input: + - num_rows = 10 + - batch_size = 5 + Output: + - The requested number of sampled rows. + Side Effect: + - Call `_sample_batch` method twice with the expected number of rows. + """ + # Setup + gaussian_copula = Mock(spec_set=GaussianCopula) + sampled_data = pd.DataFrame({ + 'column1': [28, 28, 21, 1, 2], + 'column2': [37, 37, 1, 4, 5], + 'column3': [93, 93, 6, 4, 12], + }) + gaussian_copula._sample_batch.side_effect = [sampled_data, sampled_data] + + # Run + output = BaseTabularModel.sample(gaussian_copula, 10, batch_size=5) + + # Assert + assert gaussian_copula._sample_batch.has_calls([ + call(5, batch_size_per_try=5, progress_bar=ANY, output_file_path=None), + call(5, batch_size_per_try=5, progress_bar=ANY, output_file_path=None), + ]) + assert len(output) == 10 + + def test__sample_batch_with_batch_size_per_try(self): + """Test the `BaseTabularModel._sample_batch` method with `batch_size_per_try`. + + Expect that the expected calls to `_sample_rows` are made. + + Input: + - num_rows = 10 + - batch_size_per_try = 5 + Output: + - The requested number of sampled rows. + Side Effect: + - Call `_sample_rows` method twice with the expected number of rows. + """ + # Setup + gaussian_copula = Mock(spec_set=GaussianCopula) + sampled_data = pd.DataFrame({ + 'column1': [28, 28, 21, 1, 2], + 'column2': [37, 37, 1, 4, 5], + 'column3': [93, 93, 6, 4, 12], + }) + gaussian_copula._sample_rows.side_effect = [ + (sampled_data, 5), + (sampled_data.append(sampled_data, ignore_index=False), 10), + ] + + # Run + output = BaseTabularModel._sample_batch(gaussian_copula, num_rows=10, batch_size_per_try=5) + + # Assert + assert gaussian_copula._sample_rows.has_calls([ + call(5, None, None, 0.01, DataFrameMatcher(pd.DataFrame())), + call(5, None, None, 0.01, DataFrameMatcher(sampled_data)), + ]) + assert len(output) == 10 + def test_sample_conditions_with_multiple_conditions(self): """Test the `BaseTabularModel.sample_conditions` method with multiple condtions. 
@@ -184,6 +253,7 @@ def test_sample_conditions_with_multiple_conditions(self): """ # Setup gaussian_copula = Mock(spec_set=GaussianCopula) + gaussian_copula._validate_file_path.return_value = None condition_values1 = {'cola': 'a'} condition1 = Condition(condition_values1, num_rows=2) @@ -212,11 +282,92 @@ def test_sample_conditions_with_multiple_conditions(self): # Asserts gaussian_copula._sample_with_conditions.assert_has_calls([ - call(DataFrameMatcher(pd.DataFrame([condition_values1] * 2)), 100, None), - call(DataFrameMatcher(pd.DataFrame([condition_values2] * 3)), 100, None), + call(DataFrameMatcher(pd.DataFrame([condition_values1] * 2)), 100, + None, ANY, None), + call(DataFrameMatcher(pd.DataFrame([condition_values2] * 3)), 100, + None, ANY, None), ]) pd.testing.assert_frame_equal(out, expected) + def test_sample_remaining_columns(self): + """Test the `BaseTabularModel.sample_remaining_columns` method. + + When a valid DataFrame is given, expect `_sample_with_conditions` to be called + with the input DataFrame. + + Input: + - DataFrame with condition column values populated. + Output: + - The expected sampled rows. + Side Effects: + - `_sample_with_conditions` is called once. + """ + # Setup + gaussian_copula = Mock(spec_set=GaussianCopula) + gaussian_copula._validate_file_path.return_value = None + + conditions = pd.DataFrame([{'cola': 'a'}] * 5) + + sampled = pd.DataFrame({ + 'cola': ['a', 'a', 'a', 'a', 'a'], + 'colb': [1, 2, 1, 1, 1], + }) + gaussian_copula._sample_with_conditions.return_value = sampled + + # Run + out = GaussianCopula.sample_remaining_columns(gaussian_copula, conditions) + + # Asserts + gaussian_copula._sample_with_conditions.assert_called_once_with( + DataFrameMatcher(conditions), 100, None, ANY, None) + pd.testing.assert_frame_equal(out, sampled) + + def test__sample_with_conditions_invalid_column(self): + """Test the `BaseTabularModel._sample_with_conditions` method with an invalid column. 
+ + When a condition has an invalid column, expect a ValueError. + + Setup: + - Conditions DataFrame contains `colb` which is not present in the metadata. + Input: + - Conditions DataFrame with an invalid column. + Side Effects: + - A ValueError is thrown. + """ + # Setup + gaussian_copula = Mock(spec_set=GaussianCopula) + metadata_mock = Mock() + metadata_mock.get_fields.return_value = {'cola': {}} + gaussian_copula._metadata = metadata_mock + + conditions = pd.DataFrame([{'colb': 'a'}] * 5) + + # Run and Assert + with pytest.raises(ValueError, match=( + 'Unexpected column name `colb`. ' + 'Use a column name that was present in the original data.')): + GaussianCopula._sample_with_conditions(gaussian_copula, conditions, 100, None) + + @patch('sdv.tabular.base.os.path') + def test__validate_file_path(self, path_mock): + """Test the `BaseTabularModel._validate_file_path` method. + + Expect that an error is thrown if the file path already exists. + + Input: + - A file path that already exists. + Side Effects: + - An AssertionError. 
+ """ + # Setup + path_mock.exists.return_value = True + path_mock.abspath.return_value = 'path/to/file' + gaussian_copula = Mock(spec_set=GaussianCopula) + + # Run and Assert + with pytest.raises(AssertionError, match='path/to/file already exists'): + BaseTabularModel._validate_file_path(gaussian_copula, 'file_path') + @patch('sdv.tabular.base.Table', spec_set=Table) def test__init__passes_correct_parameters(metadata_mock): @@ -382,7 +533,7 @@ def test__sample_with_conditions_empty_transformed_conditions(): pd.testing.assert_series_equal(args[0]['column1'], conditions_series) assert kwargs['on_missing_column'] == 'drop' model._metadata.transform.assert_called_once() - model._sample_batch.assert_called_with(5, 100, None, conditions, None, 0.01) + model._sample_batch.assert_called_with(5, 100, None, conditions, None, 0.01, None, None) pd.testing.assert_frame_equal(output, expected_output) @@ -444,13 +595,13 @@ def test__sample_with_conditions_transform_conditions_correctly(): assert kwargs['on_missing_column'] == 'drop' model._metadata.transform.assert_called_once() model._sample_batch.assert_any_call( - 3, 100, None, {'column1': 25}, {'transformed_column': 50}, 0.01 + 3, 100, None, {'column1': 25}, {'transformed_column': 50}, 0.01, None, None, ) model._sample_batch.assert_any_call( - 1, 100, None, {'column1': 30}, {'transformed_column': 60}, 0.01 + 1, 100, None, {'column1': 30}, {'transformed_column': 60}, 0.01, None, None, ) model._sample_batch.assert_any_call( - 1, 100, None, {'column1': 30}, {'transformed_column': 70}, 0.01 + 1, 100, None, {'column1': 30}, {'transformed_column': 70}, 0.01, None, None, ) diff --git a/tutorials/single_table_data/01_GaussianCopula_Model.ipynb b/tutorials/single_table_data/01_GaussianCopula_Model.ipynb index 788a1a5f8..286dabdb3 100644 --- a/tutorials/single_table_data/01_GaussianCopula_Model.ipynb +++ b/tutorials/single_table_data/01_GaussianCopula_Model.ipynb @@ -2653,9 +2653,9 @@ "source": [ "### Conditional Sampling\n", "\n", - 
"As the name implies, conditional sampling allows us to sample from a conditional distribution using `GaussianCopula`, which means we can generate only values that satisfy certain conditions. These conditional values can be passed to the `conditions` parameter in the `sample` method either as a dataframe or a dictionary.\n", + "As the name implies, conditional sampling allows us to sample from a conditional distribution using `GaussianCopula`, which means we can generate only values that satisfy certain conditions. These conditional values can be passed to the `sample_conditions` method as a list of `sdv.sampling.Condition` objects or to the `sample_remaining_columns` method as a dataframe. \n", "\n", - "In case a dictionary is passed, the model will generate as many rows as requested, all of which will satisfy the specified conditions, such as `gender = M`." + "When specifying a `sdv.sampling.Condition` object, we can pass in the desired conditions as a dictionary, as well as specify the number of desired rows for that condition." ] }, { @@ -2837,10 +2837,13 @@ } ], "source": [ - "conditions = {\n", + "from sdv.sampling import Condition\n", + "\n", + "condition = Condition({\n", " 'gender': 'M'\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { @@ -3029,18 +3032,19 @@ } ], "source": [ - "conditions = {\n", + "condition = Condition({\n", " 'gender': 'M',\n", " 'experience_years': 0\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "`conditions` can also be passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. 
For example, if we want to generate three samples where `gender = M` and three samples with `gender = F`, all of them with `work_experience = True`, we can do the following: " + "In the `sample_remaining_columns` method, `conditions` is passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. For example, if we want to generate three samples where `gender = M` and three samples with `gender = F`, all of them with `work_experience = True`, we can do the following: " ] }, { @@ -3251,7 +3255,7 @@ " 'gender': ['M', 'M', 'M', 'F', 'F', 'F'],\n", " 'work_experience': [True, True, True, True, True, True]\n", "})\n", - "model.sample(conditions=conditions)" + "model.sample_remaining_columns(conditions)" ] }, { @@ -3440,10 +3444,11 @@ } ], "source": [ - "conditions = {\n", + "condition = Condition({\n", " 'degree_perc': 70.0\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { @@ -3489,7 +3494,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -3503,7 +3508,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/tutorials/single_table_data/02_CTGAN_Model.ipynb b/tutorials/single_table_data/02_CTGAN_Model.ipynb index aef3d23e8..3a52819cf 100644 --- a/tutorials/single_table_data/02_CTGAN_Model.ipynb +++ b/tutorials/single_table_data/02_CTGAN_Model.ipynb @@ -2074,9 +2074,9 @@ "source": [ "### Conditional Sampling\n", "\n", - "As the name implies, conditional sampling allows us to sample from a conditional distribution using the `CTGAN` model, which means we can generate only values that satisfy certain 
conditions. These conditional values can be passed to the `conditions` parameter in the `sample` method either as a dataframe or a dictionary.\n", + "As the name implies, conditional sampling allows us to sample from a conditional distribution using the `CTGAN` model, which means we can generate only values that satisfy certain conditions. These conditional values can be passed to the `sample_conditions` method as a list of `sdv.sampling.Condition` objects or to the `sample_remaining_columns` method as a dataframe. \n", "\n", - "In case a dictionary is passed, the model will generate as many rows as requested, all of which will satisfy the specified conditions, such as `gender = M`." + "When specifying a `sdv.sampling.Condition` object, we can pass in the desired conditions as a dictionary, as well as specify the number of desired rows for that condition." ] }, { @@ -2258,10 +2258,13 @@ } ], "source": [ - "conditions = {\n", + "from sdv.sampling import Condition\n", + "\n", + "condition = Condition({\n", " 'gender': 'M'\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { @@ -2450,18 +2453,19 @@ } ], "source": [ - "conditions = {\n", + "condition = Condition({\n", " 'gender': 'M',\n", " 'experience_years': 0\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "`conditions` can also be passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. 
For example, if we want to generate three samples where `gender = M` and three samples with `gender = F`, all of them with `work_experience = True`, we can do the following: " + "In the `sample_remaining_columns` method, `conditions` is passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. For example, if we want to generate three samples where `gender = M` and three samples with `gender = F`, all of them with `work_experience = True`, we can do the following: " ] }, { @@ -2672,7 +2676,7 @@ " 'gender': ['M', 'M', 'M', 'F', 'F', 'F'],\n", " 'work_experience': [True, True, True, True, True, True]\n", "})\n", - "model.sample(conditions=conditions)" + "model.sample_remaining_columns(conditions)" ] }, { @@ -2861,10 +2865,11 @@ } ], "source": [ - "conditions = {\n", + "condition = Condition({\n", " 'degree_perc': 70.0\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { @@ -2910,7 +2915,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -2924,7 +2929,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/tutorials/single_table_data/03_CopulaGAN_Model.ipynb b/tutorials/single_table_data/03_CopulaGAN_Model.ipynb index 294a9d15a..0192d6f0c 100644 --- a/tutorials/single_table_data/03_CopulaGAN_Model.ipynb +++ b/tutorials/single_table_data/03_CopulaGAN_Model.ipynb @@ -2769,9 +2769,9 @@ "source": [ "### Conditional Sampling\n", "\n", - "As the name implies, conditional sampling allows us to sample from a conditional distribution using the `CopulaGAN` model, which means we can generate only values that 
satisfy certain conditions. These conditional values can be passed to the `conditions` parameter in the `sample` method either as a dataframe or a dictionary.\n", + "As the name implies, conditional sampling allows us to sample from a conditional distribution using the `CopulaGAN` model, which means we can generate only values that satisfy certain conditions. These conditional values can be passed to the `sample_conditions` method as a list of `sdv.sampling.Condition` objects or to the `sample_remaining_columns` method as a dataframe. \n", "\n", - "In case a dictionary is passed, the model will generate as many rows as requested, all of which will satisfy the specified conditions, such as `gender = M`." + "When specifying a `sdv.sampling.Condition` object, we can pass in the desired conditions as a dictionary, as well as specify the number of desired rows for that condition." ] }, { @@ -2953,10 +2953,13 @@ } ], "source": [ - "conditions = {\n", + "from sdv.sampling import Condition\n", + "\n", + "condition = Condition({\n", " 'gender': 'M'\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { @@ -3145,18 +3148,19 @@ } ], "source": [ - "conditions = {\n", + "condition = Condition({\n", " 'gender': 'M',\n", " 'experience_years': 0\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "`conditions` can also be passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. 
For example, if we want to generate three samples where `gender = M` and three samples with `gender = F` we can do the following: " + "In the `sample_remaining_columns` method, `conditions` is passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. For example, if we want to generate three samples where `gender = M` and three samples with `gender = F` we can do the following: " ] }, { @@ -3366,7 +3370,7 @@ "conditions = pd.DataFrame({\n", " 'gender': ['M', 'M', 'M', 'F', 'F', 'F']\n", "})\n", - "model.sample(conditions=conditions)" + "model.sample_remaining_columns(conditions)" ] }, { @@ -3555,10 +3559,11 @@ } ], "source": [ - "conditions = {\n", + "condition = Condition({\n", " 'degree_perc': 70.0\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { @@ -3604,7 +3609,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -3618,7 +3623,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/tutorials/single_table_data/04_TVAE_Model.ipynb b/tutorials/single_table_data/04_TVAE_Model.ipynb index 660d8b18d..3856c5a75 100644 --- a/tutorials/single_table_data/04_TVAE_Model.ipynb +++ b/tutorials/single_table_data/04_TVAE_Model.ipynb @@ -1981,9 +1981,9 @@ "source": [ "### Conditional Sampling\n", "\n", - "As the name implies, conditional sampling allows us to sample from a conditional distribution using the `TVAE` model, which means we can generate only values that satisfy certain conditions. 
These conditional values can be passed to the `conditions` parameter in the `sample` method either as a dataframe or a dictionary.\n", + "As the name implies, conditional sampling allows us to sample from a conditional distribution using the `TVAE` model, which means we can generate only values that satisfy certain conditions. These conditional values can be passed to the `sample_conditions` method as a list of `sdv.sampling.Condition` objects or to the `sample_remaining_columns` method as a dataframe. \n", "\n", - "In case a dictionary is passed, the model will generate as many rows as requested, all of which will satisfy the specified conditions, such as `gender = M`." + "When specifying a `sdv.sampling.Condition` object, we can pass in the desired conditions as a dictionary, as well as specify the number of desired rows for that condition." ] }, { @@ -2178,10 +2178,13 @@ } ], "source": [ - "conditions = {\n", + "from sdv.sampling import Condition\n", + "\n", + "condition = Condition({\n", " 'gender': 'M'\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { @@ -2383,18 +2386,19 @@ } ], "source": [ - "conditions = {\n", + "condition = Condition({\n", " 'gender': 'M',\n", " 'experience_years': 0\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "`conditions` can also be passed as a dataframe. In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. For example, if we want to generate three samples where `gender = M` and three samples with `gender = F`, we can do the following: " + "In the `sample_remaining_columns` method, `conditions` is passed as a dataframe. 
In that case, the model will generate one sample for each row of the dataframe, sorted in the same order. Since the model already knows how many samples to generate, passing it as a parameter is unnecessary. For example, if we want to generate three samples where `gender = M` and three samples with `gender = F`, we can do the following: " ] }, { @@ -2619,7 +2623,7 @@ "conditions = pd.DataFrame({\n", " 'gender': ['M', 'M', 'M', 'F', 'F', 'F'],\n", "})\n", - "model.sample(conditions=conditions)" + "model.sample_remaining_columns(conditions)" ] }, { @@ -2821,10 +2825,11 @@ } ], "source": [ - "conditions = {\n", + "condition = Condition({\n", " 'degree_perc': 70.0\n", - "}\n", - "model.sample(5, conditions=conditions)" + "}, num_rows=5)\n", + "\n", + "model.sample_conditions(conditions=[condition])" ] }, { @@ -2870,7 +2875,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -2884,7 +2889,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.12" } }, "nbformat": 4,