diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 5339d5f9d..c3c2e8ac7 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.6, 3.7, 3.8, 3.9] os: [ubuntu-latest, macos-10.15, windows-latest] steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/minimum.yml b/.github/workflows/minimum.yml index bd90c4422..cb2f3af55 100644 --- a/.github/workflows/minimum.yml +++ b/.github/workflows/minimum.yml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.6, 3.7, 3.8, 3.9] os: [ubuntu-latest, macos-10.15, windows-latest] steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/readme.yml b/.github/workflows/readme.yml index bd4d6d868..2fe4b64c5 100644 --- a/.github/workflows/readme.yml +++ b/.github/workflows/readme.yml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.6, 3.7, 3.8, 3.9] os: [ubuntu-latest, macos-10.15] # skip windows bc rundoc fails steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/tutorials.yml b/.github/workflows/tutorials.yml index 5b67c7eea..d6957d77d 100644 --- a/.github/workflows/tutorials.yml +++ b/.github/workflows/tutorials.yml @@ -9,8 +9,8 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest, macos-latest, windows-latest] + python-version: [3.6, 3.7, 3.8, 3.9] + os: [ubuntu-latest, macos-10.15, windows-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml index ef1c619c1..185587314 100644 --- a/.github/workflows/unit.yml +++ b/.github/workflows/unit.yml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.6, 3.7, 3.8, 3.9] os: [ubuntu-latest, macos-10.15, windows-latest] steps: - uses: actions/checkout@v1 diff --git a/.gitignore b/.gitignore index 624f18db7..1bf92b0b5 100644 --- a/.gitignore +++ b/.gitignore @@ -109,7 +109,6 @@ ENV/ sdv/data/ docs/**/*.pkl docs/**/*metadata.json -docs/images docs/savefig tutorials/**/*.pkl tutorials/**/*metadata.json diff --git a/HISTORY.md b/HISTORY.md index add372eeb..43a6eed6b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,25 @@ # Release Notes +## 0.13.1 - 2021-12-22 + +This release adds support for passing tabular constraints to the HMA1 model, and adds more explicit error handling for +metric evaluation. It also includes a fix for using categorical columns in the PAR model and documentation updates +for metadata and HMA1. 
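For quick orientation, here is a minimal sketch of the headline feature, passing tabular constraints through to the HMA1 model, condensed from the relational constraints guide added in this same changeset. It uses SDV's built-in `load_demo` tables, so every table and column name below comes from that demo:

```python
from sdv import Metadata, load_demo
from sdv.constraints import UniqueCombinations
from sdv.relational import HMA1

tables = load_demo()

metadata = Metadata()
metadata.add_table(
    name='users',
    data=tables['users'],
    primary_key='user_id',
)

# New in 0.13.1: single-table constraints can be attached to a table
# when it is added to the relational Metadata.
constraint = UniqueCombinations(columns=['device', 'os'])
metadata.add_table(
    name='sessions',
    data=tables['sessions'],
    primary_key='session_id',
    parent='users',
    foreign_key='user_id',
    constraints=[constraint],
)

model = HMA1(metadata)
model.fit(tables)
new_data = model.sample()  # sampled 'sessions' rows only reuse (device, os) pairs seen in the data
```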
+
+### Bugs Fixed
+
+* Categorical column after sequence_index column - Issue [#314](https://github.com/sdv-dev/SDV/issues/314) by @fealho
+
+### New Features
+
+* Support passing tabular constraints to the HMA1 model - Issue [#296](https://github.com/sdv-dev/SDV/issues/296) by @katxiao
+* Metric evaluation error handling - Issue [#638](https://github.com/sdv-dev/SDV/issues/638) by @katxiao
+
+### Documentation Changes
+
+* Make true/false values lowercase in Metadata Schema specification - Issue [#664](https://github.com/sdv-dev/SDV/issues/664) by @katxiao
+* Update docstrings for HMA1 methods - Issue [#642](https://github.com/sdv-dev/SDV/issues/642) by @katxiao
+
 ## 0.13.0 - 2021-11-22
 
 This release makes multiple improvements to different `Constraint` classes. The `Unique` constraint can now
diff --git a/README.md b/README.md
index 534f1715b..c87d64bb9 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,7 @@
-[DAI-Lab logo]
-An Open Source Project from the Data to AI Lab, at MIT
+[The Synthetic Data Vault Project logo]
+This repository is part of The Synthetic Data Vault Project, a project from DataCebo.
 
 [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
@@ -13,17 +12,16 @@
 [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sdv-dev/SDV/master?filepath=tutorials)
 [![Slack](https://img.shields.io/badge/Slack%20Workspace-Join%20now!-36C5F0?logo=slack)](https://join.slack.com/t/sdv-space/shared_invite/zt-gdsfcb5w-0QQpFMVoyB2Yd6SRiMplcw)
 
+[SDV logo]
+
-* Website: https://sdv.dev
-* Documentation: https://sdv.dev/SDV
-  * [User Guides](https://sdv.dev/SDV/user_guides/index.html)
-  * [Developer Guides](https://sdv.dev/SDV/developer_guides/index.html)
-* Github: https://github.com/sdv-dev/SDV
-* License: [MIT](https://github.com/sdv-dev/SDV/blob/master/LICENSE)
-* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
+
-## Overview
+# Overview
 
 The **Synthetic Data Vault (SDV)** is a **Synthetic Data Generation** ecosystem
 of libraries that allows users to easily learn [single-table](
@@ -41,7 +39,27 @@ Underneath the hood it uses several probabilistic graphical modeling and deep learning based
 techniques. To enable a variety of data storage structures, we employ unique
 hierarchical generative modeling and recursive sampling techniques.
 
-### Current functionality and features:
+| Important Links                     |                                                                        |
+| ----------------------------------- | ---------------------------------------------------------------------- |
+| :computer: **[Website]**            | Check out the SDV Website for more information about the project.      |
+| :orange_book: **[SDV Blog]**        | Regular publishing of useful content about Synthetic Data Generation.  |
+| :book: **[Documentation]**          | Quickstarts, User and Development Guides, and API Reference.           |
+| :octocat: **[Repository]**          | The link to the GitHub Repository of this library.                     |
+| :scroll: **[License]**              | The entire ecosystem is published under the MIT License.               |
+| :keyboard: **[Development Status]** | This software is in its Pre-Alpha stage.                               |
+| ![](slack.png) **[Community]**      | Join our Slack Workspace for announcements and discussions.            |
+| ![](mybinder.png) **[Tutorials]**   | Run the SDV Tutorials in a Binder environment.                         |
+
+[Website]: https://sdv.dev
+[SDV Blog]: https://sdv.dev/blog
+[Documentation]: https://sdv.dev/SDV
+[Repository]: https://github.com/sdv-dev/SDV
+[License]: https://github.com/sdv-dev/SDV/blob/master/LICENSE
+[Development Status]: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha
+[Community]: https://join.slack.com/t/sdv-space/shared_invite/zt-gdsfcb5w-0QQpFMVoyB2Yd6SRiMplcw
+[Tutorials]: https://mybinder.org/v2/gh/sdv-dev/SDV/master?filepath=tutorials
+
+## Current functionality and features:
 
 * Synthetic data generators for [single tables](
   https://sdv.dev/SDV/user_guides/single_table/index.html) with the following
@@ -89,7 +107,7 @@ pip install sdv
 **Using `conda`:**
 
 ```bash
-conda install -c sdv-dev -c pytorch -c conda-forge sdv
+conda install -c pytorch -c conda-forge sdv
 ```
 
 For more installation options please visit the [SDV installation Guide](
@@ -254,3 +272,26 @@ Neha Patki, Roy Wedge, Kalyan Veeramachaneni. [The Synthetic Data Vault](https:/
     month={Oct}
 }
 }
 ```
+
+---
+
+[DataCebo logo]
+
+The [DataCebo team](https://datacebo.com) is the proud developer of [The Synthetic Data Vault Project](
+https://sdv.dev), the largest open source ecosystem for synthetic data generation & evaluation.
+The ecosystem is home to multiple libraries that support synthetic data, including:
+
+* 🔄 Data discovery & transformation. Reverse the transforms to reproduce realistic data.
+* 🧠 Multiple machine learning models -- ranging from Copulas to Deep Learning -- to create tabular,
+  multi-table and time series data.
+* 📊 Measuring quality and privacy of synthetic data, and comparing different synthetic data
+  generation models.
+
+[Get started using the SDV package](https://sdv.dev/SDV/getting_started/install.html) -- a fully
+integrated solution and your one-stop shop for synthetic data. Or, use the standalone libraries
+for specific needs.
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 96ea4395c..4aae36b37 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = 'sdv' %}
-{% set version = '0.13.0' %}
+{% set version = '0.13.1.dev1' %}
 
 package:
   name: "{{ name|lower }}"
@@ -28,7 +28,7 @@ requirements:
     - ctgan >=0.5.0,<0.6
     - deepecho >=0.3.0.post1,<0.4
    - rdt >=0.6.1,<0.7
-    - sdmetrics >=0.4.0,<0.5
+    - sdmetrics >=0.4.1,<0.5
   run:
     - graphviz
    - python >=3.6,<3.10
@@ -41,7 +41,7 @@ requirements:
    - ctgan >=0.5.0,<0.6
    - deepecho >=0.3.0.post1,<0.4
    - rdt >=0.6.1,<0.7
-    - sdmetrics >=0.4.0,<0.5
+    - sdmetrics >=0.4.1,<0.5
 
 about:
   home: "https://sdv.dev"
diff --git a/docs/developer_guides/sdv/metadata.rst b/docs/developer_guides/sdv/metadata.rst
index ad509a155..5ef322573 100644
--- a/docs/developer_guides/sdv/metadata.rst
+++ b/docs/developer_guides/sdv/metadata.rst
@@ -130,7 +130,7 @@ the following keys.
         "fields": {
             "social_security_number": {
                 "type": "categorical",
-                "pii": True,
+                "pii": true,
                 "pii_category": "ssn"
             },
             ...
@@ -180,7 +180,7 @@ A list of all possible localizations can be found on the `Faker documentation site
         "fields": {
             "address": {
                 "type": "categorical",
-                "pii": True,
+                "pii": true,
                 "pii_category": "address",
                 "pii_locales": ["sv_SE", "en_US"]
             },
@@ -215,7 +215,7 @@ If a field is specified as a ``primary_key`` of the table, then the field must be
     ...
 }
 
-If the subtype of the primary key is integer, an optional regular expression can be passed to
+If the subtype of the primary key is string, an optional regular expression can be passed to
 generate keys that match it:
.. code-block:: python

diff --git a/docs/images/CTGAN-DataCebo.png b/docs/images/CTGAN-DataCebo.png
new file mode 100644
index 000000000..b913cfe9b
Binary files /dev/null and b/docs/images/CTGAN-DataCebo.png differ
diff --git a/docs/images/Copulas-DataCebo.png b/docs/images/Copulas-DataCebo.png
new file mode 100644
index 000000000..4e198747b
Binary files /dev/null and b/docs/images/Copulas-DataCebo.png differ
diff --git a/docs/images/DataCebo-Blue.png b/docs/images/DataCebo-Blue.png
new file mode 100644
index 000000000..993e00a69
Binary files /dev/null and b/docs/images/DataCebo-Blue.png differ
diff --git a/docs/images/DataCebo.png b/docs/images/DataCebo.png
new file mode 100644
index 000000000..22df6e6e5
Binary files /dev/null and b/docs/images/DataCebo.png differ
diff --git a/docs/images/DeepEcho-DataCebo.png b/docs/images/DeepEcho-DataCebo.png
new file mode 100644
index 000000000..eb0b26aaf
Binary files /dev/null and b/docs/images/DeepEcho-DataCebo.png differ
diff --git a/docs/images/RDT-DataCebo.png b/docs/images/RDT-DataCebo.png
new file mode 100644
index 000000000..e5839f79e
Binary files /dev/null and b/docs/images/RDT-DataCebo.png differ
diff --git a/docs/images/SDGym-DataCebo.png b/docs/images/SDGym-DataCebo.png
new file mode 100644
index 000000000..e9af4a9ee
Binary files /dev/null and b/docs/images/SDGym-DataCebo.png differ
diff --git a/docs/images/SDMetrics-DataCebo.png b/docs/images/SDMetrics-DataCebo.png
new file mode 100644
index 000000000..6ed21cc24
Binary files /dev/null and b/docs/images/SDMetrics-DataCebo.png differ
diff --git a/docs/images/SDV-DataCebo.png b/docs/images/SDV-DataCebo.png
new file mode 100644
index 000000000..5f2d07b2c
Binary files /dev/null and b/docs/images/SDV-DataCebo.png differ
diff --git a/docs/user_guides/relational/constraints.rst b/docs/user_guides/relational/constraints.rst
new file mode 100644
index 000000000..7fd1a797f
--- /dev/null
+++ b/docs/user_guides/relational/constraints.rst
@@ -0,0 +1,82 @@
+.. _relational_constraints:
+
+Constraints
+===========
+
+SDV supports adding constraints within a single table. See :ref:`single_table_constraints`
+for more information about the available single table constraints.
+
+In order to use single-table constraints within a relational model, you can pass
+in a list of applicable constraints when adding a table to your relational ``Metadata``.
+(See :ref:`relational_metadata` for more information on constructing a ``Metadata`` object.)
+
+In this example, we wish to add a ``UniqueCombinations`` constraint to our ``sessions`` table,
+which is a child table of ``users``. First, we will create a ``Metadata`` object and add the
+``users`` table.
+
+.. ipython:: python
+    :okwarning:
+
+    from sdv import load_demo, Metadata
+
+    tables = load_demo()
+
+    metadata = Metadata()
+
+    metadata.add_table(
+        name='users',
+        data=tables['users'],
+        primary_key='user_id'
+    )
+
+The metadata now contains the ``users`` table.
+
+.. ipython:: python
+    :okwarning:
+
+    metadata
+
+Now, we want to add a child table ``sessions`` which contains a single table constraint.
+In the ``sessions`` table, we wish to only have combinations of ``(device, os)`` that
+appear in the original data.
+
+.. ipython:: python
+    :okwarning:
+
+    from sdv.constraints import UniqueCombinations
+
+    constraint = UniqueCombinations(columns=['device', 'os'])
+
+    metadata.add_table(
+        name='sessions',
+        data=tables['sessions'],
+        primary_key='session_id',
+        parent='users',
+        foreign_key='user_id',
+        constraints=[constraint],
+    )
+
+If we get the table metadata for ``sessions``, we can see that the constraint has been added.
+
+.. ipython:: python
+    :okwarning:
+
+    metadata.get_table_meta('sessions')
+
+We can now use this metadata to fit a relational model and synthesize data.
+
+.. ipython:: python
+    :okwarning:
+
+    from sdv.relational import HMA1
+
+    model = HMA1(metadata)
+    model.fit(tables)
+    new_data = model.sample()
+
+In the sampled data, we should see that our constraint is being satisfied.
+
+.. ipython:: python
+    :okwarning:
+
+    new_data
diff --git a/docs/user_guides/relational/index.rst b/docs/user_guides/relational/index.rst
index b34183efa..b262039ca 100644
--- a/docs/user_guides/relational/index.rst
+++ b/docs/user_guides/relational/index.rst
@@ -10,3 +10,4 @@ Relational Data
 
     data_description
     models
+    constraints
diff --git a/docs/user_guides/single_table/custom_constraints.rst b/docs/user_guides/single_table/custom_constraints.rst
index de422512f..821dbfb83 100644
--- a/docs/user_guides/single_table/custom_constraints.rst
+++ b/docs/user_guides/single_table/custom_constraints.rst
@@ -23,7 +23,7 @@ Let's look at a demo dataset:
     employees = load_tabular_demo()
     employees
 
-The dataset defined in :ref:`_single_table_constraints` contains basic details about employees.
+The dataset defined in :ref:`handling_constraints` contains basic details about employees.
 
 We will use this dataset to demonstrate how you can create your own constraint.
 
diff --git a/sdv/__init__.py b/sdv/__init__.py
index 05dcc9321..d02191756 100644
--- a/sdv/__init__.py
+++ b/sdv/__init__.py
@@ -6,7 +6,7 @@
 
 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.13.0'
+__version__ = '0.13.1.dev1'
 
 from sdv import constraints, evaluation, metadata, relational, tabular
 from sdv.demo import get_available_demos, load_demo
diff --git a/sdv/evaluation.py b/sdv/evaluation.py
index 44a18bb6e..1304fb5b9 100644
--- a/sdv/evaluation.py
+++ b/sdv/evaluation.py
@@ -133,7 +133,6 @@ def evaluate(synthetic_data, real_data=None, metadata=None, root_path=None,
             synthetic_data = synthetic_data[table]
 
     scores = sdmetrics.compute_metrics(metrics, real_data, synthetic_data, metadata=metadata)
-    scores.dropna(inplace=True)
 
     if aggregate:
         return scores.normalized_score.mean()
diff --git a/sdv/metadata/dataset.py b/sdv/metadata/dataset.py
index 2e078bef3..bd77b95a5 100644
--- a/sdv/metadata/dataset.py
+++ b/sdv/metadata/dataset.py
@@ -10,6 +10,7 @@
 import pandas as pd
 from rdt import HyperTransformer, transformers
 
+from sdv.constraints import Constraint
 from sdv.metadata import visualization
 from sdv.metadata.errors import MetadataError
 
@@ -871,7 +872,7 @@ def _get_field_details(self, data, fields):
         return fields_metadata
 
     def add_table(self, name, data=None, fields=None, fields_metadata=None,
-                  primary_key=None, parent=None, foreign_key=None):
+                  primary_key=None, parent=None, foreign_key=None, constraints=None):
         """Add a new table to this metadata.
 
         ``fields`` list can be a mixture of field names, which will be built automatically
@@ -902,7 +903,10 @@ def add_table(self, name, data=None, fields=None, fields_metadata=None,
             parent (str):
                 Name of the table that the foreign key refers to. Defaults to ``None``.
             foreign_key (str):
-                Foreing key field name to ``parent`` table primary key. Defaults to ``None``.
+                Foreign key field name to ``parent`` table primary key. Defaults to ``None``.
+            constraints (list[Constraint, dict]):
+                List of Constraint objects or dicts representing the constraints for the
+                given table.
 
         Raises:
             ValueError:
@@ -938,6 +942,16 @@ def add_table(self, name, data=None, fields=None, fields_metadata=None,
 
         self._metadata['tables'][name] = table_metadata
 
+        if constraints:
+            meta_constraints = []
+            for constraint in constraints:
+                if isinstance(constraint, Constraint):
+                    meta_constraints.append(constraint.to_dict())
+                else:
+                    meta_constraints.append(constraint)
+
+            table_metadata['constraints'] = meta_constraints
+
         try:
             if primary_key:
                 self.set_primary_key(name, primary_key)
diff --git a/sdv/relational/hma.py b/sdv/relational/hma.py
index 6111ef8ea..d308e2834 100644
--- a/sdv/relational/hma.py
+++ b/sdv/relational/hma.py
@@ -12,7 +12,7 @@
 
 class HMA1(BaseRelationalModel):
-    """Hierarchical Modeling Alrogirhtm One.
+    """Hierarchical Modeling Algorithm One.
 
     Args:
         metadata (dict, str or Metadata):
@@ -57,17 +57,14 @@ def __init__(self, metadata, root_path=None, model=None, model_kwargs=None):
     def _get_extension(self, child_name, child_table, foreign_key):
         """Generate the extension columns for this child table.
 
-        Each element of the list is generated for one single children.
-        That dataframe should have as ``index.name`` the ``foreign_key`` name, and as index
-        it's values.
-
+        The resulting dataframe will have an index that contains all the foreign key values.
         The values for a given index are generated by flattening a model fitted with
-        the related data to that index in the children table.
+        the child rows with that foreign key value.
 
         Args:
             child_name (str):
                 Name of the child table.
-            child_table (set[str]):
+            child_table (pandas.DataFrame):
                 Data for the child table.
             foreign_key (str):
                 Name of the foreign key field.
@@ -115,7 +112,18 @@ def _get_extension(self, child_name, child_table, foreign_key):
         return pd.DataFrame(extension_rows, index=index)
 
     def _load_table(self, tables, table_name):
-        if tables:
+        """Load the specified table.
+
+        Args:
+            tables (dict or None):
+                A dictionary mapping table name to table.
+            table_name (str):
+                The name of the desired table.
+
+        Returns:
+            pandas.DataFrame
+        """
+        if tables and table_name in tables:
             table = tables[table_name].copy()
         else:
             table = self.metadata.load_table(table_name)
@@ -124,6 +132,23 @@ def _load_table(self, tables, table_name):
         return table
 
     def _extend_table(self, table, tables, table_name):
+        """Generate the extension columns for this table.
+
+        For each of the table's foreign keys, generate the related extension columns,
+        and extend the provided table.
+
+        Args:
+            table (pandas.DataFrame):
+                The table to extend.
+            tables (dict):
+                A dictionary mapping table_name to table data (pandas.DataFrame).
+            table_name (str):
+                The name of the table.
+
+        Returns:
+            pandas.DataFrame:
+                The extended table.
+        """
         LOGGER.info('Computing extensions for table %s', table_name)
         for child_name in self.metadata.get_children(table_name):
             if child_name not in self._models:
@@ -142,6 +167,27 @@ def _extend_table(self, table, tables, table_name):
         return table
 
     def _prepare_for_modeling(self, table_data, table_name, primary_key):
+        """Prepare the given table for modeling.
+
+        In preparation for modeling a given table, do the following:
+        - drop the primary key if it exists
+        - drop any other columns of type 'id'
+        - add unknown fields to metadata as numerical fields,
+          and fill missing values in those fields
+
+        Args:
+            table_data (pandas.DataFrame):
+                The data of the desired table.
+            table_name (str):
+                The name of the table.
+            primary_key (str):
+                The name of the primary key column.
+
+        Returns:
+            (dict, dict):
+                A tuple containing the table metadata to use for modeling, and
+                the values of the id columns.
+        """
         table_meta = self.metadata.get_table_meta(table_name)
         table_meta['name'] = table_name
@@ -325,6 +371,20 @@ def _sample_rows(self, model, table_name, num_rows=None):
         return sampled
 
     def _sample_child_rows(self, table_name, parent_name, parent_row, sampled_data):
+        """Sample child rows that reference the given parent row.
+
+        The sampled rows will be stored in ``sampled_data`` under the ``table_name`` key.
+
+        Args:
+            table_name (str):
+                The name of the table to sample.
+            parent_name (str):
+                The name of the parent table.
+            parent_row (pandas.Series):
+                The parent row the child rows should reference.
+            sampled_data (dict):
+                A map of table name to sampled table data (pandas.DataFrame).
+        """
         foreign_key = self.metadata.get_foreign_keys(parent_name, table_name)[0]
 
         parameters = self._extract_parameters(parent_row, table_name, foreign_key)
@@ -345,6 +405,18 @@ def _sample_child_rows(self, table_name, parent_name, parent_row, sampled_data):
                 [previous, table_rows]).reset_index(drop=True)
 
     def _sample_children(self, table_name, sampled_data, table_rows):
+        """Recursively sample the child tables of the given table.
+
+        Sampled child data will be stored in ``sampled_data``.
+
+        Args:
+            table_name (str):
+                The name of the table whose children will be sampled.
+            sampled_data (dict):
+                A map of table name to the sampled table data (pandas.DataFrame).
+            table_rows (pandas.DataFrame):
+                The sampled rows of the given table.
+        """
         for child_name in self.metadata.get_children(table_name):
             if child_name not in sampled_data:
                 LOGGER.info('Sampling rows from child table %s', child_name)
@@ -356,12 +428,26 @@
     @staticmethod
     def _find_parent_id(likelihoods, num_rows):
+        """Find the parent id for one row based on the likelihoods of parent id values.
+
+        If the likelihoods are invalid, fall back to ``num_rows``.
+
+        Args:
+            likelihoods (pandas.Series):
+                The likelihood of parent id values.
+            num_rows (pandas.Series):
+                The number of times each parent id value appears in the data.
+
+        Returns:
+            int:
+                The parent id for this row, chosen based on likelihoods.
+        """
         mean = likelihoods.mean()
         if (likelihoods == 0).all():
             # All rows got 0 likelihood, fallback to num_rows
             likelihoods = num_rows
         elif pd.isnull(mean) or mean == 0:
-            # Some rows got singlar matrix error and the rest were 0
+            # Some rows got singular matrix error and the rest were 0
             # Fallback to num_rows on the singular matrix rows and
             # keep 0s on the rest.
             likelihoods = likelihoods.fillna(num_rows)
@@ -382,6 +468,22 @@ def _find_parent_id(likelihoods, num_rows):
         return np.random.choice(likelihoods.index, p=weights)
 
     def _get_likelihoods(self, table_rows, parent_rows, table_name, foreign_key):
+        """Calculate the likelihood of each parent id value appearing in the data.
+
+        Args:
+            table_rows (pandas.DataFrame):
+                The rows in the child table.
+            parent_rows (pandas.DataFrame):
+                The rows in the parent table.
+            table_name (str):
+                The name of the child table.
+            foreign_key (str):
+                The foreign key column in the child table.
+
+        Returns:
+            pandas.DataFrame:
+                A DataFrame of the likelihood of each parent id.
+        """
         likelihoods = dict()
         for parent_id, row in parent_rows.iterrows():
             parameters = self._extract_parameters(row, table_name, foreign_key)
@@ -396,6 +498,26 @@ def _get_likelihoods(self, table_rows, parent_rows, table_name, foreign_key):
         return pd.DataFrame(likelihoods, index=table_rows.index)
 
     def _find_parent_ids(self, table_name, parent_name, foreign_key, sampled_data):
+        """Find parent ids for the given table and foreign key.
+
+        The parent ids are chosen randomly based on the likelihood of the available
+        parent ids in the parent table. If the parent table is not sampled, this method
+        will first sample rows for the parent table.
+
+        Args:
+            table_name (str):
+                The name of the table to find parent ids for.
+            parent_name (str):
+                The name of the parent table.
+            foreign_key (str):
+                The name of the foreign key column in the child table.
+            sampled_data (dict):
+                Map of table name to sampled data (pandas.DataFrame).
+
+        Returns:
+            pandas.Series:
+                The parent ids for the given table data.
+        """
         table_rows = sampled_data[table_name]
         if parent_name in sampled_data:
             parent_rows = sampled_data[parent_name]
diff --git a/setup.cfg b/setup.cfg
index 4cca4b6ab..e16ccf3e6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.13.0
+current_version = 0.13.1.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 62d529dc3..cf4ab7b56 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
     'ctgan>=0.5.0,<0.6',
     'deepecho>=0.3.0.post1,<0.4',
     'rdt>=0.6.1,<0.7',
-    'sdmetrics>=0.4.0,<0.5',
+    'sdmetrics>=0.4.1,<0.5',
 ]
 
 pomegranate_requires = [
@@ -91,6 +91,7 @@
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
     ],
     description='Synthetic Data Generation for tabular, relational and time series data.',
     extras_require={
@@ -111,6 +112,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sdv-dev/SDV',
-    version='0.13.0',
+    version='0.13.1.dev1',
     zip_safe=False,
 )
diff --git a/tests/integration/timeseries/test_par.py b/tests/integration/timeseries/test_par.py
index 8a89eb0b5..af22602c4 100644
--- a/tests/integration/timeseries/test_par.py
+++ b/tests/integration/timeseries/test_par.py
@@ -1,3 +1,5 @@
+import datetime
+
 import pandas as pd
 from deepecho import load_demo
 
@@ -47,3 +49,42 @@ def test_par():
     assert sampled.shape == data.shape
     assert (sampled.dtypes == data.dtypes).all()
     assert (sampled.notnull().sum(axis=1) != 0).all()
+
+
+def test_column_after_date_simple():
+    """Test that adding a column after the `sequence_index` column works."""
+    date = datetime.datetime.strptime('2020-01-01', '%Y-%m-%d')
+    data = pd.DataFrame({
+        'col': ['a', 'a'],
+        'date': [date, date],
+        'col2': ['hello', 'world'],
+    })
+
+    model = PAR(entity_columns=['col'], sequence_index='date', epochs=1)
+    model.fit(data)
+    sampled = model.sample()
+
+    assert sampled.shape == data.shape
+    assert (sampled.dtypes == data.dtypes).all()
+    assert (sampled.notnull().sum(axis=1) != 0).all()
+
+
+def test_column_after_date_complex():
+    """Test that adding multiple columns after the `sequence_index` column works."""
+    date = datetime.datetime.strptime('2020-01-01', '%Y-%m-%d')
+    data = pd.DataFrame({
+        'column1': [1.0, 2.0, 1.5, 1.3],
+        'date': [date, date, date, date],
+        'column2': ['b', 'a', 'a', 'c'],
+        'entity': ['person1', 'person1', 'person2', 'person2'],
+        'context': ['a', 'a', 'b', 'b']
+    })
+
+    model = PAR(entity_columns=['entity'], context_columns=['context'], sequence_index='date',
+                epochs=1)
+    model.fit(data)
+    sampled = model.sample()
+
+    assert sampled.shape == data.shape
+    assert (sampled.dtypes == data.dtypes).all()
+    assert (sampled.notnull().sum(axis=1) != 0).all()
diff --git a/tests/unit/metadata/test_dataset.py b/tests/unit/metadata/test_dataset.py
index 5da5625f9..a1f8a88a9 100644
--- a/tests/unit/metadata/test_dataset.py
+++ b/tests/unit/metadata/test_dataset.py
@@ -879,6 +879,72 @@ def test_add_table_with_data_str(self, mock_read_csv):
         metadata.set_primary_key.call_count == 0
         metadata.add_relationship.call_count == 0
 
+    def test_add_table_with_constraints(self):
+        """Test the ``Metadata.add_table`` method with constraints.
+
+        Expect that when constraints are provided, the metadata for the
+        specified table is created with the given constraints.
+
+        Input:
+        - Metadata object
+        - Table name of the desired table to add
+        - Metadata for the table's fields
+        - Constraints for the given table
+        Side Effects:
+        - An entry is added to the metadata for the provided table, which contains
+          the given fields and constraints.
+        """
+        # Setup
+        metadata = Mock(spec_set=Metadata)
+        metadata.get_tables.return_value = ['a_table', 'b_table']
+        metadata._metadata = {'tables': dict()}
+
+        # Run
+        fields_metadata = {
+            'a_field': {'type': 'numerical', 'subtype': 'integer'},
+            'b_field': {'type': 'numerical', 'subtype': 'integer'}
+        }
+        constraints = [
+            {
+                'constraint': 'sdv.constraints.tabular.GreaterThan',
+                'columns': [
+                    'a_field',
+                    'b_field',
+                ],
+                'handling_strategy': 'transform',
+            }
+        ]
+
+        Metadata.add_table(
+            metadata,
+            'x_table',
+            fields_metadata=fields_metadata,
+            constraints=constraints,
+        )
+
+        # Asserts
+        expected_table_meta = {
+            'fields': {
+                'a_field': {'type': 'numerical', 'subtype': 'integer'},
+                'b_field': {'type': 'numerical', 'subtype': 'integer'},
+            },
+            'constraints': [
+                {
+                    'constraint': 'sdv.constraints.tabular.GreaterThan',
+                    'columns': [
+                        'a_field',
+                        'b_field',
+                    ],
+                    'handling_strategy': 'transform',
+                },
+            ]
+        }
+
+        assert metadata._metadata['tables']['x_table'] == expected_table_meta
+
+        assert metadata.set_primary_key.call_count == 0
+        assert metadata.add_relationship.call_count == 0
+
     def test_add_relationship_table_no_exist(self):
         """Add relationship table no exist"""
         # Setup