diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 5339d5f9d..c3c2e8ac7 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- python-version: [3.6, 3.7, 3.8]
+ python-version: [3.6, 3.7, 3.8, 3.9]
os: [ubuntu-latest, macos-10.15, windows-latest]
steps:
- uses: actions/checkout@v1
diff --git a/.github/workflows/minimum.yml b/.github/workflows/minimum.yml
index bd90c4422..cb2f3af55 100644
--- a/.github/workflows/minimum.yml
+++ b/.github/workflows/minimum.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- python-version: [3.6, 3.7, 3.8]
+ python-version: [3.6, 3.7, 3.8, 3.9]
os: [ubuntu-latest, macos-10.15, windows-latest]
steps:
- uses: actions/checkout@v1
diff --git a/.github/workflows/readme.yml b/.github/workflows/readme.yml
index bd4d6d868..2fe4b64c5 100644
--- a/.github/workflows/readme.yml
+++ b/.github/workflows/readme.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- python-version: [3.6, 3.7, 3.8]
+ python-version: [3.6, 3.7, 3.8, 3.9]
os: [ubuntu-latest, macos-10.15] # skip windows bc rundoc fails
steps:
- uses: actions/checkout@v1
diff --git a/.github/workflows/tutorials.yml b/.github/workflows/tutorials.yml
index 5b67c7eea..d6957d77d 100644
--- a/.github/workflows/tutorials.yml
+++ b/.github/workflows/tutorials.yml
@@ -9,8 +9,8 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- python-version: [3.6, 3.7, 3.8]
- os: [ubuntu-latest, macos-latest, windows-latest]
+ python-version: [3.6, 3.7, 3.8, 3.9]
+ os: [ubuntu-latest, macos-10.15, windows-latest]
steps:
- uses: actions/checkout@v1
- name: Set up Python ${{ matrix.python-version }}
diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml
index ef1c619c1..185587314 100644
--- a/.github/workflows/unit.yml
+++ b/.github/workflows/unit.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- python-version: [3.6, 3.7, 3.8]
+ python-version: [3.6, 3.7, 3.8, 3.9]
os: [ubuntu-latest, macos-10.15, windows-latest]
steps:
- uses: actions/checkout@v1
diff --git a/.gitignore b/.gitignore
index 624f18db7..1bf92b0b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,7 +109,6 @@ ENV/
sdv/data/
docs/**/*.pkl
docs/**/*metadata.json
-docs/images
docs/savefig
tutorials/**/*.pkl
tutorials/**/*metadata.json
diff --git a/HISTORY.md b/HISTORY.md
index add372eeb..43a6eed6b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,25 @@
# Release Notes
+## 0.13.1 - 2021-12-22
+
+This release adds support for passing tabular constraints to the HMA1 model, and adds more explicit error handling for
+metric evaluation. It also includes a fix for using categorical columns in the PAR model and documentation updates
+for metadata and HMA1.
+
+### Bugs Fixed
+
+* Categorical column after sequence_index column - Issue [#314](https://github.com/sdv-dev/SDV/issues/314) by @fealho
+
+### New Features
+
+* Support passing tabular constraints to the HMA1 model - Issue [#296](https://github.com/sdv-dev/SDV/issues/296) by @katxiao
+* More explicit error handling for metric evaluation - Issue [#638](https://github.com/sdv-dev/SDV/issues/638) by @katxiao
+
+### Documentation Changes
+
+* Make true/false values lowercase in Metadata Schema specification - Issue [#664](https://github.com/sdv-dev/SDV/issues/664) by @katxiao
+* Update docstrings for hma1 methods - Issue [#642](https://github.com/sdv-dev/SDV/issues/642) by @katxiao
+
## 0.13.0 - 2021-11-22
This release makes multiple improvements to different `Constraint` classes. The `Unique` constraint can now
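For reviewers, a condensed sketch of the headline feature (tabular constraints on HMA1), lifted from the new relational constraints user guide added later in this diff; the table and column names come from `sdv.load_demo`:

```python
from sdv import Metadata, load_demo
from sdv.constraints import UniqueCombinations
from sdv.relational import HMA1

tables = load_demo()

# Build relational metadata, attaching a single-table constraint
# to the child table when it is added.
metadata = Metadata()
metadata.add_table(name='users', data=tables['users'], primary_key='user_id')
metadata.add_table(
    name='sessions',
    data=tables['sessions'],
    primary_key='session_id',
    parent='users',
    foreign_key='user_id',
    constraints=[UniqueCombinations(columns=['device', 'os'])],
)

# The HMA1 model then enforces the constraint while sampling.
model = HMA1(metadata)
model.fit(tables)
new_data = model.sample()
```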
diff --git a/README.md b/README.md
index 534f1715b..c87d64bb9 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,7 @@
-
-
-
-
- An Open Source Project from the Data to AI Lab, at MIT
+
+
+
+ This repository is part of The Synthetic Data Vault Project, a project from DataCebo.
[![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
@@ -13,17 +12,16 @@
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sdv-dev/SDV/master?filepath=tutorials)
[![Slack](https://img.shields.io/badge/Slack%20Workspace-Join%20now!-36C5F0?logo=slack)](https://join.slack.com/t/sdv-space/shared_invite/zt-gdsfcb5w-0QQpFMVoyB2Yd6SRiMplcw)
-
+
+
+
+
+
+
-* Website: https://sdv.dev
-* Documentation: https://sdv.dev/SDV
- * [User Guides](https://sdv.dev/SDV/user_guides/index.html)
- * [Developer Guides](https://sdv.dev/SDV/developer_guides/index.html)
-* Github: https://github.com/sdv-dev/SDV
-* License: [MIT](https://github.com/sdv-dev/SDV/blob/master/LICENSE)
-* Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
+
-## Overview
+# Overview
The **Synthetic Data Vault (SDV)** is a **Synthetic Data Generation** ecosystem of libraries
that allows users to easily learn [single-table](
@@ -41,7 +39,27 @@ Underneath the hood it uses several probabilistic graphical modeling and deep le
techniques. To enable a variety of data storage structures, we employ unique
hierarchical generative modeling and recursive sampling techniques.
-### Current functionality and features:
+| Important Links | |
+| -------------------------- | -------------------------------------------------------------- |
+| :computer: **[Website]** | Check out the SDV Website for more information about the project. |
+| :orange_book: **[SDV Blog]** | Regular publishing of useful content about Synthetic Data Generation. |
+| :book: **[Documentation]** | Quickstarts, User and Development Guides, and API Reference. |
+| :octocat: **[Repository]** | The link to the Github Repository of this library. |
+| :scroll: **[License]** | The entire ecosystem is published under the MIT License. |
+| :keyboard: **[Development Status]** | This software is in its Pre-Alpha stage. |
+| ![](slack.png) **[Community]** | Join our Slack Workspace for announcements and discussions. |
+| ![](mybinder.png) **[Tutorials]** | Run the SDV Tutorials in a Binder environment. |
+
+[Website]: https://sdv.dev
+[SDV Blog]: https://sdv.dev/blog
+[Documentation]: https://sdv.dev/SDV
+[Repository]: https://github.com/sdv-dev/SDV
+[License]: https://github.com/sdv-dev/SDV/blob/master/LICENSE
+[Development Status]: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha
+[Community]: https://join.slack.com/t/sdv-space/shared_invite/zt-gdsfcb5w-0QQpFMVoyB2Yd6SRiMplcw
+[Tutorials]: https://mybinder.org/v2/gh/sdv-dev/SDV/master?filepath=tutorials
+
+## Current functionality and features:
* Synthetic data generators for [single tables](
https://sdv.dev/SDV/user_guides/single_table/index.html) with the following
@@ -89,7 +107,7 @@ pip install sdv
**Using `conda`:**
```bash
-conda install -c sdv-dev -c pytorch -c conda-forge sdv
+conda install -c pytorch -c conda-forge sdv
```
For more installation options please visit the [SDV installation Guide](
@@ -254,3 +272,26 @@ Neha Patki, Roy Wedge, Kalyan Veeramachaneni. [The Synthetic Data Vault](https:/
month={Oct}
}
```
+
+---
+
+
+
+
+
+
+
+
+The [DataCebo team](https://datacebo.com) is the proud developer of [The Synthetic Data Vault Project](
+https://sdv.dev), the largest open source ecosystem for synthetic data generation & evaluation.
+The ecosystem is home to multiple libraries that support synthetic data, including:
+
+* 🔄 Data discovery & transformation. Reverse the transforms to reproduce realistic data.
+* 🧠 Multiple machine learning models -- ranging from Copulas to Deep Learning -- to create tabular,
+  multi-table and time series data.
+* 📊 Measuring quality and privacy of synthetic data, and comparing different synthetic data
+ generation models.
+
+[Get started using the SDV package](https://sdv.dev/SDV/getting_started/install.html) -- a fully
+integrated solution and your one-stop shop for synthetic data. Or, use the standalone libraries
+for specific needs.
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 96ea4395c..4aae36b37 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,5 +1,5 @@
{% set name = 'sdv' %}
-{% set version = '0.13.0' %}
+{% set version = '0.13.1.dev1' %}
package:
name: "{{ name|lower }}"
@@ -28,7 +28,7 @@ requirements:
- ctgan >=0.5.0,<0.6
- deepecho >=0.3.0.post1,<0.4
- rdt >=0.6.1,<0.7
- - sdmetrics >=0.4.0,<0.5
+ - sdmetrics >=0.4.1,<0.5
run:
- graphviz
- python >=3.6,<3.10
@@ -41,7 +41,7 @@ requirements:
- ctgan >=0.5.0,<0.6
- deepecho >=0.3.0.post1,<0.4
- rdt >=0.6.1,<0.7
- - sdmetrics >=0.4.0,<0.5
+ - sdmetrics >=0.4.1,<0.5
about:
home: "https://sdv.dev"
diff --git a/docs/developer_guides/sdv/metadata.rst b/docs/developer_guides/sdv/metadata.rst
index ad509a155..5ef322573 100644
--- a/docs/developer_guides/sdv/metadata.rst
+++ b/docs/developer_guides/sdv/metadata.rst
@@ -130,7 +130,7 @@ the following keys.
"fields": {
"social_security_number": {
"type": "categorical",
- "pii": True,
+ "pii": true,
"pii_category": "ssn"
},
...
@@ -180,7 +180,7 @@ A list of all possible localizations can be found on the `Faker documentation si
"fields": {
"address": {
"type": "categorical",
- "pii": True,
+ "pii": true,
"pii_category": "address"
"pii_locales": ["sv_SE", "en_US"]
},
@@ -215,7 +215,7 @@ If a field is specified as a ``primary_key`` of the table, then the field must b
...
}
-If the subtype of the primary key is integer, an optional regular expression can be passed to
+If the subtype of the primary key is string, an optional regular expression can be passed to
generate keys that match it:
.. code-block:: python
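For context on the corrected sentence above, a sketch of a string primary key that uses a regular expression (field name hypothetical; the keys follow the Metadata Schema described in this guide):

```python
# Hypothetical field definition for a string primary key; generated keys
# match the given regular expression. 'regex' only applies to string
# subtypes, which is what the docs fix above clarifies.
fields_metadata = {
    'user_id': {
        'type': 'id',
        'subtype': 'string',
        'regex': '[a-z]{10}',
    },
}
```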
diff --git a/docs/images/CTGAN-DataCebo.png b/docs/images/CTGAN-DataCebo.png
new file mode 100644
index 000000000..b913cfe9b
Binary files /dev/null and b/docs/images/CTGAN-DataCebo.png differ
diff --git a/docs/images/Copulas-DataCebo.png b/docs/images/Copulas-DataCebo.png
new file mode 100644
index 000000000..4e198747b
Binary files /dev/null and b/docs/images/Copulas-DataCebo.png differ
diff --git a/docs/images/DataCebo-Blue.png b/docs/images/DataCebo-Blue.png
new file mode 100644
index 000000000..993e00a69
Binary files /dev/null and b/docs/images/DataCebo-Blue.png differ
diff --git a/docs/images/DataCebo.png b/docs/images/DataCebo.png
new file mode 100644
index 000000000..22df6e6e5
Binary files /dev/null and b/docs/images/DataCebo.png differ
diff --git a/docs/images/DeepEcho-DataCebo.png b/docs/images/DeepEcho-DataCebo.png
new file mode 100644
index 000000000..eb0b26aaf
Binary files /dev/null and b/docs/images/DeepEcho-DataCebo.png differ
diff --git a/docs/images/RDT-DataCebo.png b/docs/images/RDT-DataCebo.png
new file mode 100644
index 000000000..e5839f79e
Binary files /dev/null and b/docs/images/RDT-DataCebo.png differ
diff --git a/docs/images/SDGym-DataCebo.png b/docs/images/SDGym-DataCebo.png
new file mode 100644
index 000000000..e9af4a9ee
Binary files /dev/null and b/docs/images/SDGym-DataCebo.png differ
diff --git a/docs/images/SDMetrics-DataCebo.png b/docs/images/SDMetrics-DataCebo.png
new file mode 100644
index 000000000..6ed21cc24
Binary files /dev/null and b/docs/images/SDMetrics-DataCebo.png differ
diff --git a/docs/images/SDV-DataCebo.png b/docs/images/SDV-DataCebo.png
new file mode 100644
index 000000000..5f2d07b2c
Binary files /dev/null and b/docs/images/SDV-DataCebo.png differ
diff --git a/docs/user_guides/relational/constraints.rst b/docs/user_guides/relational/constraints.rst
new file mode 100644
index 000000000..7fd1a797f
--- /dev/null
+++ b/docs/user_guides/relational/constraints.rst
@@ -0,0 +1,82 @@
+.. _relational_constraints:
+
+Constraints
+===========
+
+SDV supports adding constraints within a single table. See :ref:`single_table_constraints`
+for more information about the available single table constraints.
+
+In order to use single-table constraints within a relational model, you can pass
+in a list of applicable constraints when adding a table to your relational ``Metadata``.
+(See :ref:`relational_metadata` for more information on constructing a ``Metadata`` object.)
+
+In this example, we wish to add a ``UniqueCombinations`` constraint to our ``sessions`` table,
+which is a child table of ``users``. First, we will create a ``Metadata`` object and add the
+``users`` table.
+
+.. ipython:: python
+ :okwarning:
+
+ from sdv import load_demo, Metadata
+
+ tables = load_demo()
+
+ metadata = Metadata()
+
+ metadata.add_table(
+ name='users',
+ data=tables['users'],
+ primary_key='user_id'
+ )
+
+The metadata now contains the ``users`` table.
+
+.. ipython:: python
+ :okwarning:
+
+ metadata
+
+Now, we want to add a child table ``sessions`` which contains a single table constraint.
+In the ``sessions`` table, we wish to only have combinations of ``(device, os)`` that
+appear in the original data.
+
+.. ipython:: python
+ :okwarning:
+
+ from sdv.constraints import UniqueCombinations
+
+ constraint = UniqueCombinations(columns=['device', 'os'])
+
+ metadata.add_table(
+ name='sessions',
+ data=tables['sessions'],
+ primary_key='session_id',
+ parent='users',
+ foreign_key='user_id',
+ constraints=[constraint],
+ )
+
+If we get the table metadata for ``sessions``, we can see that the constraint has been added.
+
+.. ipython:: python
+ :okwarning:
+
+ metadata.get_table_meta('sessions')
+
+We can now use this metadata to fit a relational model and synthesize data.
+
+.. ipython:: python
+ :okwarning:
+
+ from sdv.relational import HMA1
+
+ model = HMA1(metadata)
+ model.fit(tables)
+ new_data = model.sample()
+
+In the sampled data, we should see that our constraint is being satisfied.
+
+.. ipython:: python
+ :okwarning:
+
+ new_data
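A quick way to verify that claim (a sketch reusing the `tables` and `new_data` objects from the guide above):

```python
# Every sampled (device, os) pair should already appear in the real data.
real_pairs = set(map(tuple, tables['sessions'][['device', 'os']].values))
sampled_pairs = set(map(tuple, new_data['sessions'][['device', 'os']].values))
assert sampled_pairs <= real_pairs
```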
diff --git a/docs/user_guides/relational/index.rst b/docs/user_guides/relational/index.rst
index b34183efa..b262039ca 100644
--- a/docs/user_guides/relational/index.rst
+++ b/docs/user_guides/relational/index.rst
@@ -10,3 +10,4 @@ Relational Data
data_description
models
+ constraints
diff --git a/docs/user_guides/single_table/custom_constraints.rst b/docs/user_guides/single_table/custom_constraints.rst
index de422512f..821dbfb83 100644
--- a/docs/user_guides/single_table/custom_constraints.rst
+++ b/docs/user_guides/single_table/custom_constraints.rst
@@ -23,7 +23,7 @@ Let's look at a demo dataset:
employees = load_tabular_demo()
employees
-The dataset defined in :ref:`_single_table_constraints` contains basic details about employees.
+The dataset defined in :ref:`handling_constraints` contains basic details about employees.
We will use this dataset to demonstrate how you can create your own constraint.
diff --git a/sdv/__init__.py b/sdv/__init__.py
index 05dcc9321..d02191756 100644
--- a/sdv/__init__.py
+++ b/sdv/__init__.py
@@ -6,7 +6,7 @@
__author__ = """MIT Data To AI Lab"""
__email__ = 'dailabmit@gmail.com'
-__version__ = '0.13.0'
+__version__ = '0.13.1.dev1'
from sdv import constraints, evaluation, metadata, relational, tabular
from sdv.demo import get_available_demos, load_demo
diff --git a/sdv/evaluation.py b/sdv/evaluation.py
index 44a18bb6e..1304fb5b9 100644
--- a/sdv/evaluation.py
+++ b/sdv/evaluation.py
@@ -133,7 +133,6 @@ def evaluate(synthetic_data, real_data=None, metadata=None, root_path=None,
synthetic_data = synthetic_data[table]
scores = sdmetrics.compute_metrics(metrics, real_data, synthetic_data, metadata=metadata)
- scores.dropna(inplace=True)
if aggregate:
return scores.normalized_score.mean()
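The effect of dropping the `dropna` call, illustrated with synthetic scores (not real `sdmetrics` output): metrics that error out now stay visible as `NaN` rows in the detailed results, while the aggregate is unchanged because pandas skips `NaN` in `.mean()` by default.

```python
import numpy as np
import pandas as pd

scores = pd.DataFrame({
    'metric': ['CSTest', 'KSTest', 'LogisticDetection'],
    'normalized_score': [0.9, np.nan, 0.8],  # NaN: the metric raised an error
})

print(scores)                          # the errored metric is no longer hidden
print(scores.normalized_score.mean())  # 0.85 -- NaN is skipped by default
```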
diff --git a/sdv/metadata/dataset.py b/sdv/metadata/dataset.py
index 2e078bef3..bd77b95a5 100644
--- a/sdv/metadata/dataset.py
+++ b/sdv/metadata/dataset.py
@@ -10,6 +10,7 @@
import pandas as pd
from rdt import HyperTransformer, transformers
+from sdv.constraints import Constraint
from sdv.metadata import visualization
from sdv.metadata.errors import MetadataError
@@ -871,7 +872,7 @@ def _get_field_details(self, data, fields):
return fields_metadata
def add_table(self, name, data=None, fields=None, fields_metadata=None,
- primary_key=None, parent=None, foreign_key=None):
+ primary_key=None, parent=None, foreign_key=None, constraints=None):
"""Add a new table to this metadata.
``fields`` list can be a mixture of field names, which will be build automatically
@@ -902,7 +903,10 @@ def add_table(self, name, data=None, fields=None, fields_metadata=None,
parent (str):
Table name to refere a foreign key field. Defaults to ``None``.
foreign_key (str):
- Foreing key field name to ``parent`` table primary key. Defaults to ``None``.
+ Foreign key field name to ``parent`` table primary key. Defaults to ``None``.
+ constraints (list[Constraint, dict]):
+ List of Constraint objects or dicts representing the constraints for the
+ given table.
Raises:
ValueError:
@@ -938,6 +942,16 @@ def add_table(self, name, data=None, fields=None, fields_metadata=None,
self._metadata['tables'][name] = table_metadata
+ if constraints:
+ meta_constraints = []
+ for constraint in constraints:
+ if isinstance(constraint, Constraint):
+ meta_constraints.append(constraint.to_dict())
+ else:
+ meta_constraints.append(constraint)
+
+ table_metadata['constraints'] = meta_constraints
+
try:
if primary_key:
self.set_primary_key(name, primary_key)
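A sketch of the two forms the new `constraints` argument accepts (column names hypothetical): `Constraint` objects are serialized with `to_dict()`, while plain dicts are stored as-is, mirroring the dict layout used in the new unit test below.

```python
from sdv.constraints import UniqueCombinations

# Either form may be passed in the ``constraints`` list; objects are
# converted via ``to_dict()`` before being stored in the table metadata.
as_object = UniqueCombinations(columns=['device', 'os'])
as_dict = as_object.to_dict()
# Roughly: {'constraint': 'sdv.constraints.tabular.UniqueCombinations',
#           'columns': ['device', 'os'], 'handling_strategy': 'transform'}
```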
diff --git a/sdv/relational/hma.py b/sdv/relational/hma.py
index 6111ef8ea..d308e2834 100644
--- a/sdv/relational/hma.py
+++ b/sdv/relational/hma.py
@@ -12,7 +12,7 @@
class HMA1(BaseRelationalModel):
- """Hierarchical Modeling Alrogirhtm One.
+ """Hierarchical Modeling Algorithm One.
Args:
metadata (dict, str or Metadata):
@@ -57,17 +57,14 @@ def __init__(self, metadata, root_path=None, model=None, model_kwargs=None):
def _get_extension(self, child_name, child_table, foreign_key):
"""Generate the extension columns for this child table.
- Each element of the list is generated for one single children.
- That dataframe should have as ``index.name`` the ``foreign_key`` name, and as index
- it's values.
-
+ The resulting dataframe will have an index that contains all the foreign key values.
The values for a given index are generated by flattening a model fitted with
- the related data to that index in the children table.
+ the child rows that share that foreign key value.
Args:
child_name (str):
Name of the child table.
- child_table (set[str]):
+ child_table (pandas.DataFrame):
Data for the child table.
foreign_key (str):
Name of the foreign key field.
@@ -115,7 +112,18 @@ def _get_extension(self, child_name, child_table, foreign_key):
return pd.DataFrame(extension_rows, index=index)
def _load_table(self, tables, table_name):
- if tables:
+ """Load the specified table.
+
+ Args:
+ tables (dict or None):
+ A dictionary mapping table name to table.
+ table_name (str):
+ The name of the desired table.
+
+ Returns:
+ pandas.DataFrame
+ """
+ if tables and table_name in tables:
table = tables[table_name].copy()
else:
table = self.metadata.load_table(table_name)
@@ -124,6 +132,23 @@ def _load_table(self, tables, table_name):
return table
def _extend_table(self, table, tables, table_name):
+ """Generate the extension columns for this table.
+
+ For each of the table's foreign keys, generate the related extension columns,
+ and extend the provided table.
+
+ Args:
+ table (pandas.DataFrame):
+ The table to extend.
+ tables (dict):
+ A dictionary mapping table_name to table data (pandas.DataFrame).
+ table_name (str):
+ The name of the table.
+
+ Returns:
+ pandas.DataFrame:
+ The extended table.
+ """
LOGGER.info('Computing extensions for table %s', table_name)
for child_name in self.metadata.get_children(table_name):
if child_name not in self._models:
@@ -142,6 +167,27 @@ def _extend_table(self, table, tables, table_name):
return table
def _prepare_for_modeling(self, table_data, table_name, primary_key):
+ """Prepare the given table for modeling.
+
+ In preparation for modeling a given table, do the following:
+ - drop the primary key if it exists
+ - drop any other columns of type 'id'
+ - add unknown fields to metadata as numerical fields,
+ and fill missing values in those fields
+
+ Args:
+ table_data (pandas.DataFrame):
+ The data of the desired table.
+ table_name (str):
+ The name of the table.
+ primary_key (str):
+ The name of the primary key column.
+
+ Returns:
+ (dict, dict):
+ A tuple containing the table metadata to use for modeling, and
+ the values of the id columns.
+ """
table_meta = self.metadata.get_table_meta(table_name)
table_meta['name'] = table_name
@@ -325,6 +371,20 @@ def _sample_rows(self, model, table_name, num_rows=None):
return sampled
def _sample_child_rows(self, table_name, parent_name, parent_row, sampled_data):
+ """Sample child rows that reference the given parent row.
+
+ The sampled rows will be stored in ``sampled_data`` under the ``table_name`` key.
+
+ Args:
+ table_name (str):
+ The name of the table to sample.
+ parent_name (str):
+ The name of the parent table.
+ parent_row (pandas.Series):
+ The parent row the child rows should reference.
+ sampled_data (dict):
+ A map of table name to sampled table data (pandas.DataFrame).
+ """
foreign_key = self.metadata.get_foreign_keys(parent_name, table_name)[0]
parameters = self._extract_parameters(parent_row, table_name, foreign_key)
@@ -345,6 +405,18 @@ def _sample_child_rows(self, table_name, parent_name, parent_row, sampled_data):
[previous, table_rows]).reset_index(drop=True)
def _sample_children(self, table_name, sampled_data, table_rows):
+ """Recursively sample the child tables of the given table.
+
+ Sampled child data will be stored in ``sampled_data``.
+
+ Args:
+ table_name (str):
+ The name of the table whose children will be sampled.
+ sampled_data (dict):
+ A map of table name to the sampled table data (pandas.DataFrame).
+ table_rows (pandas.DataFrame):
+ The sampled rows of the given table.
+ """
for child_name in self.metadata.get_children(table_name):
if child_name not in sampled_data:
LOGGER.info('Sampling rows from child table %s', child_name)
@@ -356,12 +428,26 @@ def _sample_children(self, table_name, sampled_data, table_rows):
@staticmethod
def _find_parent_id(likelihoods, num_rows):
+ """Find the parent id for one row based on the likelihoods of parent id values.
+
+ If likelihoods are invalid, fall back to ``num_rows``.
+
+ Args:
+ likelihoods (pandas.Series):
+ The likelihood of parent id values.
+ num_rows (pandas.Series):
+ The number of times each parent id value appears in the data.
+
+ Returns:
+ int:
+ The parent id for this row, chosen based on likelihoods.
+ """
mean = likelihoods.mean()
if (likelihoods == 0).all():
# All rows got 0 likelihood, fallback to num_rows
likelihoods = num_rows
elif pd.isnull(mean) or mean == 0:
- # Some rows got singlar matrix error and the rest were 0
+ # Some rows got singular matrix error and the rest were 0
# Fallback to num_rows on the singular matrix rows and
# keep 0s on the rest.
likelihoods = likelihoods.fillna(num_rows)
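A toy illustration of the fallback branch above (inputs assumed; the final weight computation is elided from this hunk, so it is sketched here as a plain proportional split):

```python
import numpy as np
import pandas as pd

# Two parent ids errored out (NaN) and one scored 0: fill the NaN rows
# with their num_rows counts and keep the 0, as the branch above does.
likelihoods = pd.Series([np.nan, 0.0, np.nan], index=[10, 11, 12])
num_rows = pd.Series([3, 2, 5], index=[10, 11, 12])

filled = likelihoods.fillna(num_rows)   # -> [3.0, 0.0, 5.0]
weights = filled / filled.sum()         # sketch of the normalization
parent_id = np.random.choice(filled.index, p=weights)
```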
@@ -382,6 +468,22 @@ def _find_parent_id(likelihoods, num_rows):
return np.random.choice(likelihoods.index, p=weights)
def _get_likelihoods(self, table_rows, parent_rows, table_name, foreign_key):
+ """Calculate the likelihood of each parent id value appearing in the data.
+
+ Args:
+ table_rows (pandas.DataFrame):
+ The rows in the child table.
+ parent_rows (pandas.DataFrame):
+ The rows in the parent table.
+ table_name (str):
+ The name of the child table.
+ foreign_key (str):
+ The foreign key column in the child table.
+
+ Returns:
+ pandas.DataFrame:
+ A DataFrame of the likelihood of each parent id.
+ """
likelihoods = dict()
for parent_id, row in parent_rows.iterrows():
parameters = self._extract_parameters(row, table_name, foreign_key)
@@ -396,6 +498,26 @@ def _get_likelihoods(self, table_rows, parent_rows, table_name, foreign_key):
return pd.DataFrame(likelihoods, index=table_rows.index)
def _find_parent_ids(self, table_name, parent_name, foreign_key, sampled_data):
+ """Find parent ids for the given table and foreign key.
+
+ The parent ids are chosen randomly based on the likelihood of the available
+ parent ids in the parent table. If the parent table is not sampled, this method
+ will first sample rows for the parent table.
+
+ Args:
+ table_name (str):
+ The name of the table to find parent ids for.
+ parent_name (str):
+ The name of the parent table.
+ foreign_key (str):
+ The name of the foreign key column in the child table.
+ sampled_data (dict):
+ Map of table name to sampled data (pandas.DataFrame).
+
+ Returns:
+ pandas.Series:
+ The parent ids for the given table data.
+ """
table_rows = sampled_data[table_name]
if parent_name in sampled_data:
parent_rows = sampled_data[parent_name]
diff --git a/setup.cfg b/setup.cfg
index 4cca4b6ab..e16ccf3e6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.13.0
+current_version = 0.13.1.dev1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 62d529dc3..cf4ab7b56 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
'ctgan>=0.5.0,<0.6',
'deepecho>=0.3.0.post1,<0.4',
'rdt>=0.6.1,<0.7',
- 'sdmetrics>=0.4.0,<0.5',
+ 'sdmetrics>=0.4.1,<0.5',
]
pomegranate_requires = [
@@ -91,6 +91,7 @@
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
],
description='Synthetic Data Generation for tabular, relational and time series data.',
extras_require={
@@ -111,6 +112,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/sdv-dev/SDV',
- version='0.13.0',
+ version='0.13.1.dev1',
zip_safe=False,
)
diff --git a/tests/integration/timeseries/test_par.py b/tests/integration/timeseries/test_par.py
index 8a89eb0b5..af22602c4 100644
--- a/tests/integration/timeseries/test_par.py
+++ b/tests/integration/timeseries/test_par.py
@@ -1,3 +1,5 @@
+import datetime
+
import pandas as pd
from deepecho import load_demo
@@ -47,3 +49,42 @@ def test_par():
assert sampled.shape == data.shape
assert (sampled.dtypes == data.dtypes).all()
assert (sampled.notnull().sum(axis=1) != 0).all()
+
+
+def test_column_after_date_simple():
+ """Test that adding a column after the `sequence_index` column works."""
+ date = datetime.datetime.strptime('2020-01-01', '%Y-%m-%d')
+ data = pd.DataFrame({
+ 'col': ['a', 'a'],
+ 'date': [date, date],
+ 'col2': ['hello', 'world'],
+ })
+
+ model = PAR(entity_columns=['col'], sequence_index='date', epochs=1)
+ model.fit(data)
+ sampled = model.sample()
+
+ assert sampled.shape == data.shape
+ assert (sampled.dtypes == data.dtypes).all()
+ assert (sampled.notnull().sum(axis=1) != 0).all()
+
+
+def test_column_after_date_complex():
+ """Test that adding multiple columns after the `sequence_index` column works."""
+ date = datetime.datetime.strptime('2020-01-01', '%Y-%m-%d')
+ data = pd.DataFrame({
+ 'column1': [1.0, 2.0, 1.5, 1.3],
+ 'date': [date, date, date, date],
+ 'column2': ['b', 'a', 'a', 'c'],
+ 'entity': ['person1', 'person1', 'person2', 'person2'],
+ 'context': ['a', 'a', 'b', 'b']
+ })
+
+ model = PAR(entity_columns=['entity'], context_columns=['context'], sequence_index='date',
+ epochs=1)
+ model.fit(data)
+ sampled = model.sample()
+
+ assert sampled.shape == data.shape
+ assert (sampled.dtypes == data.dtypes).all()
+ assert (sampled.notnull().sum(axis=1) != 0).all()
diff --git a/tests/unit/metadata/test_dataset.py b/tests/unit/metadata/test_dataset.py
index 5da5625f9..a1f8a88a9 100644
--- a/tests/unit/metadata/test_dataset.py
+++ b/tests/unit/metadata/test_dataset.py
@@ -879,6 +879,72 @@ def test_add_table_with_data_str(self, mock_read_csv):
metadata.set_primary_key.call_count == 0
metadata.add_relationship.call_count == 0
+ def test_add_table_with_constraints(self):
+ """Test the ``Metadata.add_table`` method with constraints.
+
+ Expect that when constraints are provided, the metadata for the
+ specified table is created with the given constraints.
+
+ Input:
+ - Metadata object
+ - Table name of the desired table to add
+ - Metadata for the table's fields
+ - Constraints for the given table
+ Side Effects:
+ - An entry is added to the metadata for the provided table, which contains
+ the given fields and constraints.
+ """
+ # Setup
+ metadata = Mock(spec_set=Metadata)
+ metadata.get_tables.return_value = ['a_table', 'b_table']
+ metadata._metadata = {'tables': dict()}
+
+ # Run
+ fields_metadata = {
+ 'a_field': {'type': 'numerical', 'subtype': 'integer'},
+ 'b_field': {'type': 'numerical', 'subtype': 'integer'}
+ }
+ constraints = [
+ {
+ 'constraint': 'sdv.constraints.tabular.GreaterThan',
+ 'columns': [
+ 'a_field',
+ 'b_field',
+ ],
+ 'handling_strategy': 'transform',
+ }
+ ]
+
+ Metadata.add_table(
+ metadata,
+ 'x_table',
+ fields_metadata=fields_metadata,
+ constraints=constraints,
+ )
+
+ # Asserts
+ expected_table_meta = {
+ 'fields': {
+ 'a_field': {'type': 'numerical', 'subtype': 'integer'},
+ 'b_field': {'type': 'numerical', 'subtype': 'integer'},
+ },
+ 'constraints': [
+ {
+ 'constraint': 'sdv.constraints.tabular.GreaterThan',
+ 'columns': [
+ 'a_field',
+ 'b_field',
+ ],
+ 'handling_strategy': 'transform',
+ },
+ ]
+ }
+
+ assert metadata._metadata['tables']['x_table'] == expected_table_meta
+
+ assert metadata.set_primary_key.call_count == 0
+ assert metadata.add_relationship.call_count == 0
+
def test_add_relationship_table_no_exist(self):
"""Add relationship table no exist"""
# Setup