From 6606e6d2d410bd698aa81271d467c1e726b1960c Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Thu, 14 Nov 2019 15:26:21 +0100 Subject: [PATCH 01/16] Added methods to generate metadata --- sdv/metadata.py | 305 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) diff --git a/sdv/metadata.py b/sdv/metadata.py index 83ebfa46a..9d0fc0f44 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -445,3 +445,308 @@ def reverse_transform(self, table_name, data): reversed_data[name] = reversed_data[name].dropna().astype(dtype) return reversed_data + + def _analyze(data, columns=None): + fields = dict() + for column in fields or data.columns: + dtype = data[column].dtype + fields[column] = {'name': column} + + if dtype.kind == 'i': + fields[column]['type'] = 'numerical' + fields[column]['type'] = 'integer' + + elif dtype.kind == 'f': + fields[column]['type'] = 'numerical' + fields[column]['type'] = 'float' + + elif dtype.kind == 'O': + fields[column]['type'] = 'categorical' + + elif dtype.kind == 'b': + fields[column]['type'] = 'boolean' + + elif dtype.kind == 'M': + fields[column]['type'] = 'datetime' + + else: + raise ValueError('Unsupported dtype: {} in column {}'.format(dtype, column)) + + return fields + + def _validate_field(self, field): + dtype = field['type'] + if dtype == 'categorical': + pass + + elif dtype == 'id': + pass + + elif dtype == 'numerical': + subtype = field.get('subtype') + if subtype and subtype != 'integer' and subtype != 'float': + raise ValueError() + + elif dtype == 'boolean': + pass + + elif dtype == 'datetime': + pass + + else: + raise ValueError('Type {} is not supported.'.format(dtype)) + + def add_table(self, name, primary_key=None, fields=None, data=None, parent=None, + foreign_key=None): + """Add a new table to the metadata. + + First, assert that the ``name`` table exists. + Create the table with the table name and an empty fields. + If ``primary_key`` is defined add it to the table. 
+ + When ``fields`` is a ``dict``, use it to set the ``fields`` key from the table + (fields are validated). + When ``data`` is provided and ``fields`` is not or it's a ``list`` type, + analyze the data for all the columns or just the ``fields`` columns. + If the ``data`` is a ``str`` it should point to a csv file with the data to analazy. + It may be the relative path, if it's then concat the ``root_path`` with the ``data`` path. + + Finally, if ``parent`` and ``foreign_key`` are provided, create their relationship. + + Args: + name (str): + primary_key (str): + fields (dict or list): + data (str or pandas.DataFrame): + parent (str): + foreign_key (str): + """ + if table_name in self.get_table_names(): + raise ValueError('Table "{}" already exists.'.format(name)) + + table = {'name': name, 'fields': dict()} + + if primary_key: + table['primary_key'] = primary_key + + if isinstance(fields, dict): + for field_key, field_value in fields.items(): + # fields[field_key]['name'] = field_key + self._validate_field(field_value) + self.add_field(field[field_key]) + # table['fields'] = fields + + elif data and (not fields or isinstance(fields, list)): + if isinstance(data, str): + data = data if os.path.exists(data) else os.path.join(root_path, data) + data = pd.read_csv(data) + + table['fields'] = Metadata._analyze(data, columns=fields) + + self._metadata['tables'][name] = table + + # Add relationship + if not parent or not foreign_key: + return + + self.add_relationship(name, parent, foreign_key) + + # parent_meta = self.get_table_meta(parent) + # foreign_field = { + # 'name': foreign_key, + # 'type': 'id', + # 'ref': { + # 'field': foreign_key, + # 'table': parent + # } + # } + # self.get_fields(name)[foreign_key] = foreign_field + + def remove_table(self, table): + """Remove a table, their childrens and their relationships. + + First, assert that the ``table`` exists. Then, get their childrens and remove them too, + including their relationships. 
Finally, remove the given ``table``. + + Args: + table (str): + Table to be removed. + """ + self._assert_table_exists(table) + + childrens = self.get_children(table) + for children in list(childrens): + self.remove_table(children) + + parents = self.get_parents(table) + for parent in parents: + self.get_children(parent).discard(table) + + del self._metadata['tables'][table] + childrens.clear() + parents.clear() + + def add_relationship(self, table, parent, foreign_key): + """Add a new relationship between a 2 tables. + + By a given ``table`` and ``parent`` add a new relationship using ``foreign_key`` + to reference the parent field. + + First, assert that the ``table`` and ``parent`` exists. + Then, assert if already exists a relationship between both tables. + If not, add their relationships, in ``table`` add a new parent ``parent`` + and in ``parent`` add a new children ``table``. + + Args: + table (str): + Table name to add a new relationship with a parent table. + parent (str): + Table name to add a new relationship with a children table. + foreign_key (str): + Field name from the parent table to create the reference in the children table. 
+ """ + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + + if parent not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(parent)) + + if parent in self.get_parents(table): + raise ValueError('Table {} is the parent table of {}.'.format(parent, table)) + + if parent in self.get_children(table): + raise ValueError('Table {} is the children table of {}.'.format(parent, table)) + + primary_key = self.get_primary_key(parent) + if not primary_key: + raise ValueError('Parent table {} have not primary key.'.format(primary_key)) + + ref = {'field': primary_key, 'table': parent} + field = {'name': foreign_key, 'type': 'id', 'ref': ref} + self.add_field(table, foreign_key, field) + self._get_relationships() + + def remove_relationship(self, table, parent): + """Remove a relationship between a table and her parent. + + By a given ``table`` and ``parent`` remove their relationship. + Also, remove the ``'ref'`` key-value from the ``foreign_key`` field. + + First, assert that the ``table`` and ``parent`` exists. + Then, discard the ``table`` from the childrens of ``parent`` and the ``parent`` from + the parents of ``table``. + Finally, remove the ``'ref'`` key-value from the ``foreign_key`` field. + + Args: + table (str): + Table name to remove their relationship with a parent table. + parent (str): + Table name to remove their relationship with a children table. 
+ """ + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + + if parent not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(parent)) + + parents = self.get_parents(table) + if parent in parents: + parents.discard(parent) + + childrens = self.get_children(parent) + if table in childrens: + childrens.discard(table) + + foreign_key = self.get_foreign_key(parent, table) + fields = self.get_fields(table) + del fields[foreign_key]['ref'] + + def add_field(self, table, field, field_details): + """Add a new field into a given table. + + First, assert that the ``table`` exists and the ``field`` does not. + Then, validate the ``field_details`` format. Finally, add the field. + + Args: + table (str): + Table name to add the new field, it must exist. + field (str): + Field name to be added, it must not exist. + field_details (dict): + Dictionary with the details for the new table field. + """ + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + + if field in self.get_fields(table).keys(): + raise ValueError( + 'Table {}, field {} already exists. Use "update_field()" to modify it.' \ + .format(table, field) + ) + + field_details['name'] = field + self._validate_field(field_details) + self.get_fields(table)[field] = field_details + + def update_field(self, table, field, field_details): + """Update a field from a gibven table. + + First, assert that the ``table`` and ``field`` exists. + Then, validate the ``field_details`` format. Finally, update the field. + + Args: + table (str): + Table name to update the field, it must exist. + field (str): + Field name to be updated, it must exist. + field_details (dict): + Dictionary with the details to be updated. 
+ """ + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + + if field not in self.get_fields(table).keys(): + raise ValueError('Table {}, field {} doesn\'t exists.'.format(table, field)) + + self._validate_field(field_details) + self.get_fields(table)[field].update(field_details) + + def remove_field(self, table, field): + """Remove a field from a given table. + + First, assert that the ``table`` and``field`` exists. + Finally, remove the field. + + If the field to be removed is the reference on other table, remove their relationship. + + Args: + table (str): + Table name to remove the field, it must exist. + field (str): + Field name to be removed, it must exist. + """ + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + + if field not in self.get_fields(table).keys(): + raise ValueError('Table {}, field {} doesn\'t exists.'.format(table, field)) + + primary_key = self.get_primary_key(table) + if field == primary_key: + # TODO: remove relationship + print("TODO: remove relationship from primary_key") + return + + for parent in list(self.get_parents(table)): + if self.get_foreign_key(parent, table) == field: + # TODO: remove relationship + print("TODO: remove relationship from foreign_key") + return + + del self.get_fields(table)[field] + + def to_dict(self): + return self._metadata + + def to_json(self): + pass From 2ef7ac131194fe8b74abbc53d5b31d11284e70bc Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Mon, 18 Nov 2019 12:57:23 +0100 Subject: [PATCH 02/16] updated metadata access methods --- sdv/metadata.py | 123 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 28 deletions(-) diff --git a/sdv/metadata.py b/sdv/metadata.py index 9d0fc0f44..2a08dca09 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -447,6 +447,27 @@ def reverse_transform(self, table_name, data): return reversed_data def _analyze(data, 
columns=None): + """Get a dictionary with the metadata analyzed from a dictionary. + + Analyze a ``pandas.DataFrame`` to build a ``dict`` with the name of the column, and + their data type and subtype. If ``columns`` are provided, only those columns will be + analyzed. + + Args: + data (pandas.DataFrame): + Table to be analyzed. + columns(list): + List of columns used to specify which fields analyze from the data. + + Returns: + dict: + Generated metadata from a ``pandas.DataFrame``. + + Raises: + ValueError: + A ``ValueError`` is raised when a column from the data analyzed is an unsupported + data type. + """ fields = dict() for column in fields or data.columns: dtype = data[column].dtype @@ -454,11 +475,11 @@ def _analyze(data, columns=None): if dtype.kind == 'i': fields[column]['type'] = 'numerical' - fields[column]['type'] = 'integer' + fields[column]['subtype'] = 'integer' elif dtype.kind == 'f': fields[column]['type'] = 'numerical' - fields[column]['type'] = 'float' + fields[column]['subtype'] = 'float' elif dtype.kind == 'O': fields[column]['type'] = 'categorical' @@ -551,17 +572,6 @@ def add_table(self, name, primary_key=None, fields=None, data=None, parent=None, self.add_relationship(name, parent, foreign_key) - # parent_meta = self.get_table_meta(parent) - # foreign_field = { - # 'name': foreign_key, - # 'type': 'id', - # 'ref': { - # 'field': foreign_key, - # 'table': parent - # } - # } - # self.get_fields(name)[foreign_key] = foreign_field - def remove_table(self, table): """Remove a table, their childrens and their relationships. @@ -572,7 +582,8 @@ def remove_table(self, table): table (str): Table to be removed. 
""" - self._assert_table_exists(table) + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) childrens = self.get_children(table) for children in list(childrens): @@ -667,6 +678,9 @@ def add_field(self, table, field, field_details): First, assert that the ``table`` exists and the ``field`` does not. Then, validate the ``field_details`` format. Finally, add the field. + The error message displayed, when the ``ValueError`` is raised because the ``fields`` + already exists in the table, recommends you to use the ``update_fields`` instead. + Args: table (str): Table name to add the new field, it must exist. @@ -674,6 +688,11 @@ def add_field(self, table, field, field_details): Field name to be added, it must not exist. field_details (dict): Dictionary with the details for the new table field. + + Raises: + ValueError: + A ``ValueError`` is raised when the ``table`` doesn't exists or the ``field`` + exists in the table. """ if table not in self.get_table_names(): raise ValueError('Table "{}" doesn\'t exists.'.format(table)) @@ -686,8 +705,18 @@ def add_field(self, table, field, field_details): field_details['name'] = field self._validate_field(field_details) + + type_id = field_details.get('type') + field_ref = field_details.get('ref') self.get_fields(table)[field] = field_details + if type_id and field_ref: + # add foreign key + pass + + if type_id and not field_ref: + self.get_table_meta(table)['primary_key'] = field + def update_field(self, table, field, field_details): """Update a field from a gibven table. @@ -701,6 +730,11 @@ def update_field(self, table, field, field_details): Field name to be updated, it must exist. field_details (dict): Dictionary with the details to be updated. + + Raises: + ValueError: + A ``ValueError`` is raised when the ``table`` doesn't exists or the ``field`` + doesn't exists in the table. 
""" if table not in self.get_table_names(): raise ValueError('Table "{}" doesn\'t exists.'.format(table)) @@ -708,22 +742,57 @@ def update_field(self, table, field, field_details): if field not in self.get_fields(table).keys(): raise ValueError('Table {}, field {} doesn\'t exists.'.format(table, field)) - self._validate_field(field_details) - self.get_fields(table)[field].update(field_details) + # Rename childrens field reference + new_name = field_details.get('name', field) + renamed = field != new_name + if renamed and new_name in self.get_fields(table).keys(): + raise ValueError( + 'Table {}, field {} already exists. Can\'t be renamed.'.format(table, new_name) + ) + + primary_key = self.get_primary_key(table) + if field == primary_key and renamed: + childrens = self.get_children(table) + for children in list(childrens): + foreign_key = self.get_foreign_key(table, children) + self.get_fields(children)[foreign_key]['ref']['field'] = new_name + + # Update table "primary_key" + self.get_table_meta(table)['primary_key'] = new_name + + # Protect edit the relationships directly + if field_details.get('ref'): + del field_details['ref'] + + fields = self.get_fields(table) + + # Create the renamed field, remove the old one and + # replace the field name to update it with the field_details + if renamed: + fields[field_details['name']] = fields[field] + del fields[field] + field = field_details['name'] + + fields[field].update(field_details) + self._validate_field(fields[field]) def remove_field(self, table, field): """Remove a field from a given table. First, assert that the ``table`` and``field`` exists. - Finally, remove the field. - - If the field to be removed is the reference on other table, remove their relationship. + If the field to be removed is the primary key, get their childrens and remove their + reference field. Finally, remove the field and recomptue the tables relationships. Args: table (str): Table name to remove the field, it must exist. 
field (str): Field name to be removed, it must exist. + + Raises: + ValueError: + A ``ValueError`` is raised when the ``table`` doesn't exists or the ``field`` + doesn't exists in the table. """ if table not in self.get_table_names(): raise ValueError('Table "{}" doesn\'t exists.'.format(table)) @@ -733,20 +802,18 @@ def remove_field(self, table, field): primary_key = self.get_primary_key(table) if field == primary_key: - # TODO: remove relationship - print("TODO: remove relationship from primary_key") - return + childrens = self.get_children(table) + for children in list(childrens): + foreign_key = self.get_foreign_key(table, children) + self.remove_field(children, foreign_key) - for parent in list(self.get_parents(table)): - if self.get_foreign_key(parent, table) == field: - # TODO: remove relationship - print("TODO: remove relationship from foreign_key") - return + del self.get_table_meta(table)['primary_key'] del self.get_fields(table)[field] + self._get_relationships() def to_dict(self): - return self._metadata + return copy.deepcopy(self._metadata) def to_json(self): pass From 0eecbe1dcfce3388eb82af0948e2e5740af2730f Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Tue, 19 Nov 2019 14:29:04 +0100 Subject: [PATCH 03/16] metadata api methods --- sdv/metadata.py | 171 ++++++++++++++++++++++++++++-------------------- 1 file changed, 100 insertions(+), 71 deletions(-) diff --git a/sdv/metadata.py b/sdv/metadata.py index 2a08dca09..d36910d0c 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -400,6 +400,17 @@ def get_primary_key(self, table_name): """ return self.get_table_meta(table_name).get('primary_key') + def set_primary_key(self, table_name, field_name): + """Set the primary key name of the indicated table. + + Args: + table_name (str): + Name of table for which to set the primary key field. + field_name (str): + Name of field to set as the primary key field. 
+ """ + self.get_table_meta(table_name)['primary_key'] = field_name + def get_foreign_key(self, parent, child): """Get table foreign key field name. @@ -469,7 +480,7 @@ def _analyze(data, columns=None): data type. """ fields = dict() - for column in fields or data.columns: + for column in columns or data.columns: dtype = data[column].dtype fields[column] = {'name': column} @@ -542,35 +553,34 @@ def add_table(self, name, primary_key=None, fields=None, data=None, parent=None, parent (str): foreign_key (str): """ - if table_name in self.get_table_names(): + if name in self.get_table_names(): raise ValueError('Table "{}" already exists.'.format(name)) table = {'name': name, 'fields': dict()} - if primary_key: + if primary_key: # pk must be in fields or data.columns? table['primary_key'] = primary_key if isinstance(fields, dict): for field_key, field_value in fields.items(): - # fields[field_key]['name'] = field_key self._validate_field(field_value) - self.add_field(field[field_key]) - # table['fields'] = fields - elif data and (not fields or isinstance(fields, list)): + elif data is not None and (not fields or isinstance(fields, list)): if isinstance(data, str): data = data if os.path.exists(data) else os.path.join(root_path, data) data = pd.read_csv(data) - table['fields'] = Metadata._analyze(data, columns=fields) + fields = Metadata._analyze(data, columns=fields) + + elif not data and isinstance(fields, list): + fields = dict() + table['fields'] = fields or dict() self._metadata['tables'][name] = table # Add relationship - if not parent or not foreign_key: - return - - self.add_relationship(name, parent, foreign_key) + if parent and foreign_key: + self.add_relationship(name, parent, foreign_key) def remove_table(self, table): """Remove a table, their childrens and their relationships. 
@@ -590,12 +600,10 @@ def remove_table(self, table): self.remove_table(children) parents = self.get_parents(table) - for parent in parents: - self.get_children(parent).discard(table) + self.remove_relationship(table, list(parents)[0]) del self._metadata['tables'][table] - childrens.clear() - parents.clear() + self._get_relationships() def add_relationship(self, table, parent, foreign_key): """Add a new relationship between a 2 tables. @@ -633,7 +641,7 @@ def add_relationship(self, table, parent, foreign_key): raise ValueError('Parent table {} have not primary key.'.format(primary_key)) ref = {'field': primary_key, 'table': parent} - field = {'name': foreign_key, 'type': 'id', 'ref': ref} + field = {'name': foreign_key, 'type': 'id', 'ref': ref} # ¿subtype? self.add_field(table, foreign_key, field) self._get_relationships() @@ -660,17 +668,31 @@ def remove_relationship(self, table, parent): if parent not in self.get_table_names(): raise ValueError('Table "{}" doesn\'t exists.'.format(parent)) - parents = self.get_parents(table) - if parent in parents: - parents.discard(parent) + if parent in self.get_children(table): + self._delete_foreign_key(table, parent) - childrens = self.get_children(parent) - if table in childrens: - childrens.discard(table) + if table in self.get_children(parent): + self._delete_foreign_key(parent, table) - foreign_key = self.get_foreign_key(parent, table) - fields = self.get_fields(table) - del fields[foreign_key]['ref'] + self._get_relationships() + + def add_primary_key(self, table, field): + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + + if field in self.get_fields(table).keys(): + raise ValueError('Table {}, field {} already exists.'.format(table, field)) + + if self.get_primary_key(table): + raise ValueError('Table {} already have primary_key.'.format(table)) + + field_details = { + 'name': field, + 'type': 'id' + } + + self.get_fields(table)[field] = field_details + 
self.set_primary_key(table, field) def add_field(self, table, field, field_details): """Add a new field into a given table. @@ -706,16 +728,48 @@ def add_field(self, table, field, field_details): field_details['name'] = field self._validate_field(field_details) - type_id = field_details.get('type') - field_ref = field_details.get('ref') - self.get_fields(table)[field] = field_details + has_id = field_details['type'] == 'id' + has_ref = field_details.get('ref') is not None - if type_id and field_ref: - # add foreign key - pass + if has_id and not has_ref: + self.add_primary_key(table, field) + + elif has_id and has_ref: + self.get_fields(table)[field] = field_details + self._get_relationships() + + else: + self.get_fields(table)[field] = field_details + + def add_fields(self, data): + """Add a list or dict of fields into tables. + + List format: + [{ + 'table': 'table_name', + 'field': 'field_name', + 'field_details': {...} + }, ...] + + Dictionary format: + { + ('table_name', 'field_name'): {...}, + ... + } + """ + if isinstance(data, list): + for item in data: + self.add_field(item['table'], item['field'], item['field_details']) + return + + if isinstance(data, dict): + for key, value in data.items(): + add_field[key[0], key[1], value] + return - if type_id and not field_ref: - self.get_table_meta(table)['primary_key'] = field + raise TypeError( + 'Invalid data type {}, only list and dict are supported.'.format(type(data)) + ) def update_field(self, table, field, field_details): """Update a field from a gibven table. @@ -742,37 +796,7 @@ def update_field(self, table, field, field_details): if field not in self.get_fields(table).keys(): raise ValueError('Table {}, field {} doesn\'t exists.'.format(table, field)) - # Rename childrens field reference - new_name = field_details.get('name', field) - renamed = field != new_name - if renamed and new_name in self.get_fields(table).keys(): - raise ValueError( - 'Table {}, field {} already exists. 
Can\'t be renamed.'.format(table, new_name) - ) - - primary_key = self.get_primary_key(table) - if field == primary_key and renamed: - childrens = self.get_children(table) - for children in list(childrens): - foreign_key = self.get_foreign_key(table, children) - self.get_fields(children)[foreign_key]['ref']['field'] = new_name - - # Update table "primary_key" - self.get_table_meta(table)['primary_key'] = new_name - - # Protect edit the relationships directly - if field_details.get('ref'): - del field_details['ref'] - fields = self.get_fields(table) - - # Create the renamed field, remove the old one and - # replace the field name to update it with the field_details - if renamed: - fields[field_details['name']] = fields[field] - del fields[field] - field = field_details['name'] - fields[field].update(field_details) self._validate_field(fields[field]) @@ -797,21 +821,26 @@ def remove_field(self, table, field): if table not in self.get_table_names(): raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - if field not in self.get_fields(table).keys(): + fields = self.get_fields(table) + if field not in fields.keys(): raise ValueError('Table {}, field {} doesn\'t exists.'.format(table, field)) primary_key = self.get_primary_key(table) if field == primary_key: - childrens = self.get_children(table) - for children in list(childrens): - foreign_key = self.get_foreign_key(table, children) - self.remove_field(children, foreign_key) - + self._delete_foreign_key_to(table) del self.get_table_meta(table)['primary_key'] - del self.get_fields(table)[field] + del fields[field] self._get_relationships() + def _delete_foreign_key(self, parent, child): + foreign_key = self.get_foreign_key(parent, child) + del self.get_fields(child)[foreign_key] + + def _delete_foreign_key_to(self, table_name): + for children in list(self.get_children(table_name)): + self._delete_foreign_key(table_name, children) + def to_dict(self): return copy.deepcopy(self._metadata) From 
ff9e1031ea3c8789339248f57e14e9f8dfb32081 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Wed, 20 Nov 2019 10:34:02 +0100 Subject: [PATCH 04/16] update metadata api v1 methods --- sdv/metadata.py | 333 ++++++++++++++++++++++++------------------------ 1 file changed, 165 insertions(+), 168 deletions(-) diff --git a/sdv/metadata.py b/sdv/metadata.py index d36910d0c..a7096d88c 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -400,17 +400,6 @@ def get_primary_key(self, table_name): """ return self.get_table_meta(table_name).get('primary_key') - def set_primary_key(self, table_name, field_name): - """Set the primary key name of the indicated table. - - Args: - table_name (str): - Name of table for which to set the primary key field. - field_name (str): - Name of field to set as the primary key field. - """ - self.get_table_meta(table_name)['primary_key'] = field_name - def get_foreign_key(self, parent, child): """Get table foreign key field name. @@ -482,28 +471,36 @@ def _analyze(data, columns=None): fields = dict() for column in columns or data.columns: dtype = data[column].dtype - fields[column] = {'name': column} + subtype = None if dtype.kind == 'i': - fields[column]['type'] = 'numerical' - fields[column]['subtype'] = 'integer' + type = 'numerical' + subtype = 'integer' elif dtype.kind == 'f': - fields[column]['type'] = 'numerical' - fields[column]['subtype'] = 'float' + type = 'numerical' + subtype = 'float' elif dtype.kind == 'O': - fields[column]['type'] = 'categorical' + type = 'categorical' elif dtype.kind == 'b': - fields[column]['type'] = 'boolean' + type = 'boolean' elif dtype.kind == 'M': - fields[column]['type'] = 'datetime' + type = 'datetime' else: raise ValueError('Unsupported dtype: {} in column {}'.format(dtype, column)) + fields[column] = { + 'name': column, + 'type': type + } + + if subtype: + fields[column]['subtype'] = subtype + return fields def _validate_field(self, field): @@ -528,14 +525,145 @@ def _validate_field(self, field): else: 
raise ValueError('Type {} is not supported.'.format(dtype)) - def add_table(self, name, primary_key=None, fields=None, data=None, parent=None, + def _validate_circular_relationships(self, parent, children=None): + if children is None: + children = self.get_children(parent) + + if parent in children: + raise ValueError('Circular relationship not supported') + + for grandchild in children: + self._validate_circular_relationships(parent, self.get_children(grandchild)) + + def add_field(self, table, field, type, subtype, properties): + """Add a new field into a given table. + + First, assert that the ``table`` exists and the ``field`` does not. + Then, validate the ``field_details`` format. Finally, add the field. + + The error message displayed, when the ``ValueError`` is raised because the ``fields`` + already exists in the table, recommends you to use the ``update_fields`` instead. + + Args: + table (str): + Table name to add the new field, it must exist. + field (str): + Field name to be added, it must not exist. + field_details (dict): + Dictionary with the details for the new table field. + + Raises: + ValueError: + A ``ValueError`` is raised when the ``table`` doesn't exists or the ``field`` + exists in the table. + """ + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + + if field in self.get_fields(table).keys(): + raise ValueError( + 'Table {}, field {} already exists. Use "update_field()" to modify it.' + .format(table, field) + ) + + # Validate type, subtype and properties + field_details = { + 'name': field, + 'type': type + } + + if subtype: + field_details['subtype'] = subtype + + if properties: + for property_name, property_value in properties.items(): + field_details[property_name] = property_value + + self.get_fields(table)[field] = field_details + + def add_primary_key(self, table, field): + """Add a primary key into a given table. 
+ + First, assert that the ``table`` exists and the ``field`` does not. + Then, assert that the ``table`` doesn't have primary key. Finally, add primary key. + + Args: + table (str): + Table name to add the new primary key, it must exist. + field (str): + Field name to be the new primary key, it must not exist. + + Raises: + ValueError: + A ``ValueError`` is raised when the table not exist, + the field already exist or the primary key already exist. + """ + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + + if field in self.get_fields(table).keys(): + raise ValueError('Table {}, field {} already exists.'.format(table, field)) + + if self.get_primary_key(table): + raise ValueError('Table {} already have primary key.'.format(table)) + + self.get_table_meta(table)['primary_key'] = field + self.add_field(table, field, 'id', None, None) + + def add_relationship(self, table, parent, foreign_key=None): + """Add a new relationship between a 2 tables. + + By a given ``table`` and ``parent`` add a new relationship using ``foreign_key`` + to reference the parent field. + + First, assert that the ``table`` and ``parent`` exists. + Then, assert if already exists a relationship between both tables. + If not, add their relationships, in ``table`` add a new parent ``parent`` + and in ``parent`` add a new children ``table``. + + Args: + table (str): + Table name to add a new relationship with a parent table. + parent (str): + Table name to add a new relationship with a children table. + foreign_key (str): + Field name from the parent table to create the reference in the children table. 
+ """ + if table not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + + if parent not in self.get_table_names(): + raise ValueError('Table "{}" doesn\'t exists.'.format(parent)) + + if parent in self.get_parents(table): + raise ValueError('Table {} is the parent table of {}.'.format(parent, table)) + + if parent in self.get_children(table): + raise ValueError('Table {} is the children table of {}.'.format(parent, table)) + + primary_key = self.get_primary_key(parent) + if not primary_key: + raise ValueError('Parent table {} have not primary key.'.format(primary_key)) + + self._validate_circular_relationships(parent, self.get_children(table)) + + properties = {'ref': {'field': primary_key, 'table': parent}} + self.add_field(table, foreign_key or primary_key, 'id', None, properties) + + self._get_relationships() + + def _add_table_load_data(self, data): + data = data if os.path.exists(data) else os.path.join(self.root_path, data) + return pd.read_csv(data) + + def add_table(self, name, primary_key=None, fields=dict(), data=None, parent=None, foreign_key=None): """Add a new table to the metadata. First, assert that the ``name`` table exists. Create the table with the table name and an empty fields. If ``primary_key`` is defined add it to the table. - + When ``fields`` is a ``dict``, use it to set the ``fields`` key from the table (fields are validated). When ``data`` is provided and ``fields`` is not or it's a ``list`` type, @@ -556,30 +684,34 @@ def add_table(self, name, primary_key=None, fields=None, data=None, parent=None, if name in self.get_table_names(): raise ValueError('Table "{}" already exists.'.format(name)) - table = {'name': name, 'fields': dict()} - - if primary_key: # pk must be in fields or data.columns? 
- table['primary_key'] = primary_key - if isinstance(fields, dict): for field_key, field_value in fields.items(): self._validate_field(field_value) - elif data is not None and (not fields or isinstance(fields, list)): + if isinstance(fields, list) and data is None: + raise ValueError() + + if isinstance(fields, list) and data is not None: if isinstance(data, str): - data = data if os.path.exists(data) else os.path.join(root_path, data) - data = pd.read_csv(data) + data = self._add_table_load_data(data) fields = Metadata._analyze(data, columns=fields) - elif not data and isinstance(fields, list): - fields = dict() + if not fields and data is not None: + if isinstance(data, str): + data = self._add_table_load_data(data) + + fields = Metadata._analyze(data) - table['fields'] = fields or dict() + table = {'name': name, 'fields': fields} self._metadata['tables'][name] = table + if primary_key: + self.add_primary_key(name, primary_key) + # table['primary_key'] = primary_key + # Add relationship - if parent and foreign_key: + if parent: self.add_relationship(name, parent, foreign_key) def remove_table(self, table): @@ -605,46 +737,6 @@ def remove_table(self, table): del self._metadata['tables'][table] self._get_relationships() - def add_relationship(self, table, parent, foreign_key): - """Add a new relationship between a 2 tables. - - By a given ``table`` and ``parent`` add a new relationship using ``foreign_key`` - to reference the parent field. - - First, assert that the ``table`` and ``parent`` exists. - Then, assert if already exists a relationship between both tables. - If not, add their relationships, in ``table`` add a new parent ``parent`` - and in ``parent`` add a new children ``table``. - - Args: - table (str): - Table name to add a new relationship with a parent table. - parent (str): - Table name to add a new relationship with a children table. - foreign_key (str): - Field name from the parent table to create the reference in the children table. 
- """ - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - - if parent not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(parent)) - - if parent in self.get_parents(table): - raise ValueError('Table {} is the parent table of {}.'.format(parent, table)) - - if parent in self.get_children(table): - raise ValueError('Table {} is the children table of {}.'.format(parent, table)) - - primary_key = self.get_primary_key(parent) - if not primary_key: - raise ValueError('Parent table {} have not primary key.'.format(primary_key)) - - ref = {'field': primary_key, 'table': parent} - field = {'name': foreign_key, 'type': 'id', 'ref': ref} # ¿subtype? - self.add_field(table, foreign_key, field) - self._get_relationships() - def remove_relationship(self, table, parent): """Remove a relationship between a table and her parent. @@ -676,101 +768,6 @@ def remove_relationship(self, table, parent): self._get_relationships() - def add_primary_key(self, table, field): - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - - if field in self.get_fields(table).keys(): - raise ValueError('Table {}, field {} already exists.'.format(table, field)) - - if self.get_primary_key(table): - raise ValueError('Table {} already have primary_key.'.format(table)) - - field_details = { - 'name': field, - 'type': 'id' - } - - self.get_fields(table)[field] = field_details - self.set_primary_key(table, field) - - def add_field(self, table, field, field_details): - """Add a new field into a given table. - - First, assert that the ``table`` exists and the ``field`` does not. - Then, validate the ``field_details`` format. Finally, add the field. - - The error message displayed, when the ``ValueError`` is raised because the ``fields`` - already exists in the table, recommends you to use the ``update_fields`` instead. 
- - Args: - table (str): - Table name to add the new field, it must exist. - field (str): - Field name to be added, it must not exist. - field_details (dict): - Dictionary with the details for the new table field. - - Raises: - ValueError: - A ``ValueError`` is raised when the ``table`` doesn't exists or the ``field`` - exists in the table. - """ - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - - if field in self.get_fields(table).keys(): - raise ValueError( - 'Table {}, field {} already exists. Use "update_field()" to modify it.' \ - .format(table, field) - ) - - field_details['name'] = field - self._validate_field(field_details) - - has_id = field_details['type'] == 'id' - has_ref = field_details.get('ref') is not None - - if has_id and not has_ref: - self.add_primary_key(table, field) - - elif has_id and has_ref: - self.get_fields(table)[field] = field_details - self._get_relationships() - - else: - self.get_fields(table)[field] = field_details - - def add_fields(self, data): - """Add a list or dict of fields into tables. - - List format: - [{ - 'table': 'table_name', - 'field': 'field_name', - 'field_details': {...} - }, ...] - - Dictionary format: - { - ('table_name', 'field_name'): {...}, - ... - } - """ - if isinstance(data, list): - for item in data: - self.add_field(item['table'], item['field'], item['field_details']) - return - - if isinstance(data, dict): - for key, value in data.items(): - add_field[key[0], key[1], value] - return - - raise TypeError( - 'Invalid data type {}, only list and dict are supported.'.format(type(data)) - ) - def update_field(self, table, field, field_details): """Update a field from a gibven table. 
From 82c2a61e09803b4901d1121a2ebc94480cf3fbaa Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Wed, 20 Nov 2019 12:02:32 +0100 Subject: [PATCH 05/16] fixed metadata and added unittests --- sdv/metadata.py | 6 +- tests/test_metadata.py | 337 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 341 insertions(+), 2 deletions(-) diff --git a/sdv/metadata.py b/sdv/metadata.py index a7096d88c..7d853c488 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -560,7 +560,8 @@ def add_field(self, table, field, type, subtype, properties): if table not in self.get_table_names(): raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - if field in self.get_fields(table).keys(): + fields = self.get_fields(table) + if field in fields: raise ValueError( 'Table {}, field {} already exists. Use "update_field()" to modify it.' .format(table, field) @@ -579,7 +580,7 @@ def add_field(self, table, field, type, subtype, properties): for property_name, property_value in properties.items(): field_details[property_name] = property_value - self.get_fields(table)[field] = field_details + fields[field] = field_details def add_primary_key(self, table, field): """Add a primary key into a given table. 
@@ -686,6 +687,7 @@ def add_table(self, name, primary_key=None, fields=dict(), data=None, parent=Non if isinstance(fields, dict): for field_key, field_value in fields.items(): + field_value['name'] = field_key self._validate_field(field_value) if isinstance(fields, list) and data is None: diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 54a58288f..19b9cbefa 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -659,3 +659,340 @@ def test_reverse_transform(self): ht_mock.reverse_transform.call_args[0][0], expected_call ) + + def test_add_table_already_exist(self): + """Try to add a new table that already exist""" + # Setup + table_names = ['a_table', 'b_table'] + + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = table_names + + with pytest.raises(ValueError): + Metadata.add_table(metadata, 'a_table') + + def test_add_table_only_name(self): + """Add table with only the name""" + # Setup + table_names = ['a_table', 'b_table'] + + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = table_names + metadata._metadata = {'tables': dict()} + + Metadata.add_table(metadata, 'x_table') + + # Asserts + expected_table_meta = { + 'name': 'x_table', + 'fields': dict() + } + + assert metadata._metadata['tables']['x_table'] == expected_table_meta + + metadata.add_primary_key.call_count == 0 + metadata.add_relationship.call_count == 0 + + def test_add_table_with_primary_key(self): + """Add table with primary key""" + # Setup + table_names = ['a_table', 'b_table'] + + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = table_names + metadata._metadata = {'tables': dict()} + + Metadata.add_table(metadata, 'x_table', primary_key='id') + + # Asserts + expected_table_meta = { + 'name': 'x_table', + 'fields': dict() + } + + assert metadata._metadata['tables']['x_table'] == expected_table_meta + + metadata.add_primary_key.assert_called_once_with('x_table', 
'id') + metadata.add_relationship.call_count == 0 + + def test_add_table_with_foreign_key(self): + """Add table with foreign key""" + # Setup + table_names = ['a_table', 'b_table'] + + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = table_names + metadata._metadata = {'tables': dict()} + + Metadata.add_table(metadata, 'x_table', parent='users') + + # Asserts + expected_table_meta = { + 'name': 'x_table', + 'fields': dict() + } + + assert metadata._metadata['tables']['x_table'] == expected_table_meta + + metadata.add_primary_key.call_count == 0 + metadata.add_relationship.assert_called_once_with('x_table', 'users', None) + + def test_add_table_with_fields_dict(self): + """Add table with fields(dict)""" + # Setup + table_names = ['a_table', 'b_table'] + + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = table_names + metadata._metadata = {'tables': dict()} + + fields = { + 'a_field': {'type': 'numerical', 'subtype': 'integer'} + } + + Metadata.add_table(metadata, 'x_table', fields=fields) + + # Asserts + expected_table_meta = { + 'name': 'x_table', + 'fields': { + 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'} + } + } + + assert metadata._metadata['tables']['x_table'] == expected_table_meta + + assert metadata._validate_field.call_args_list == [ + call({'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}) + ] + + metadata.add_primary_key.call_count == 0 + metadata.add_relationship.call_count == 0 + + def test_add_table_with_field_list_no_data(self): + """Add table with fields(list) no data""" + # Setup + table_names = ['a_table', 'b_table'] + + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = table_names + metadata._metadata = {'tables': dict()} + + fields = ['a_field', 'b_field'] + + with pytest.raises(ValueError): + Metadata.add_table(metadata, 'x_table', fields=fields) + + def test_add_table_with_field_list_data(self): + """Add 
table with fields(list) data""" + # Setup + table_names = ['a_table', 'b_table'] + + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = table_names + metadata._metadata = {'tables': dict()} + + fields = ['a_field', 'b_field'] + data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']}) + + Metadata.add_table(metadata, 'x_table', fields=fields, data=data) + + # Asserts + expected_table_meta = { + 'name': 'x_table', + 'fields': { + 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}, + 'b_field': {'name': 'b_field', 'type': 'boolean'} + } + } + + assert metadata._metadata['tables']['x_table'] == expected_table_meta + + metadata.add_primary_key.call_count == 0 + metadata.add_relationship.call_count == 0 + + def test_add_table_with_data_analyze(self): + """Add table with data to analyze all""" + # Setup + table_names = ['a_table', 'b_table'] + + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = table_names + metadata._metadata = {'tables': dict()} + + data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']}) + + Metadata.add_table(metadata, 'x_table', data=data) + + # Asserts + expected_table_meta = { + 'name': 'x_table', + 'fields': { + 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}, + 'b_field': {'name': 'b_field', 'type': 'boolean'}, + 'c_field': {'name': 'c_field', 'type': 'categorical'} + } + } + + assert metadata._metadata['tables']['x_table'] == expected_table_meta + + metadata.add_primary_key.call_count == 0 + metadata.add_relationship.call_count == 0 + + def test_add_relationship_table_no_exist(self): + """Add relationship table no exist""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = list() + + with pytest.raises(ValueError): + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + def test_add_relationship_parent_no_exist(self): + 
"""Add relationship table no exist""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table'] + + with pytest.raises(ValueError): + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + def test_add_relationship_already_exist(self): + """Add relationship already exist""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table', 'b_table'] + metadata.get_parents.return_value = set(['b_table']) + + with pytest.raises(ValueError): + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + def test_add_relationship_parent_is_child_of_table(self): + """Add relationship parent is child of table""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table', 'b_table'] + metadata.get_parents.return_value = set() + metadata.get_children.return_value = set(['b_table']) + + with pytest.raises(ValueError): + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + def test_add_relationship_parent_no_primary_key(self): + """Add relationship parent no primary key""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table', 'b_table'] + metadata.get_parents.return_value = set() + metadata.get_children.return_value = set() + metadata.get_primary_key.return_value = None + + with pytest.raises(ValueError): + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + def test_add_relationship_valid(self): + """Add relationship valid""" + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table', 'b_table'] + metadata.get_parents.return_value = set() + metadata.get_children.return_value = set() + metadata.get_primary_key.return_value = 'pk_field' + + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + # Asserts + metadata._validate_circular_relationships.assert_called_once_with('b_table', set()) + 
metadata.add_field.assert_called_once_with( + 'a_table', 'pk_field', 'id', None, {'ref': {'field': 'pk_field', 'table': 'b_table'}} + ) + metadata._get_relationships.assert_called_once_with() + + def test_add_primary_key_table_no_exist(self): + """Add primary key table no exist""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = list() + + with pytest.raises(ValueError): + Metadata.add_primary_key(metadata, 'a_table', 'a_field') + + def test_add_primary_key_field_exist(self): + """Add primary key field exist""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table'] + metadata.get_fields.return_value = dict() + + with pytest.raises(ValueError): + Metadata.add_primary_key(metadata, 'a_table', 'a_field') + + def test_add_primary_key_primary_key_exist(self): + """Add primary key primary key exist""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table'] + metadata.get_fields.return_value = {'a_field': dict()} + metadata.get_primary_key.return_value = 'some_primary_key' + + with pytest.raises(ValueError): + Metadata.add_primary_key(metadata, 'a_table', 'a_field') + + def test_add_primary_key_valid(self): + """Add primary key valid""" + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table'] + metadata.get_fields.return_value = dict() + metadata.get_primary_key.return_value = None + metadata.get_table_meta.return_value = dict() + + Metadata.add_primary_key(metadata, 'a_table', 'a_field') + + # Asserts + metadata.get_table_names.assert_called_once_with() + metadata.get_fields.assert_called_once_with('a_table') + metadata.get_primary_key.assert_called_once_with('a_table') + + metadata.get_table_meta.assert_called_once_with('a_table') + metadata.add_field.assert_called_once_with('a_table', 'a_field', 'id', None, None) + + def test_add_field_table_no_exist(self): + """Add field table no 
exist""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = list() + + with pytest.raises(ValueError): + Metadata.add_field(metadata, 'a_table', 'a_field', 'id', None, None) + + def test_add_field_field_exist(self): + """Add field already exist""" + # Run and asserts + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table'] + metadata.get_fields.return_value = {'a_field': dict()} + + with pytest.raises(ValueError): + Metadata.add_field(metadata, 'a_table', 'a_field', 'id', None, None) + + def test_add_field_valid(self): + """Add valid field""" + # Run + metadata = Mock(spec=Metadata) + metadata.get_table_names.return_value = ['a_table'] + metadata.get_fields.return_value = dict() + + Metadata.add_field(metadata, 'a_table', 'a_field', 'numerical', 'integer', {'min': 0}) + + # Asserts + metadata.get_table_names.assert_called_once_with() + metadata.get_fields.assert_called_once_with('a_table') From efa3a48c587e5d54f57b32df07c522309d28f5a3 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Wed, 20 Nov 2019 12:09:47 +0100 Subject: [PATCH 06/16] remove v2 methods to reimplement them --- sdv/metadata.py | 124 ------------------------------------------------ 1 file changed, 124 deletions(-) diff --git a/sdv/metadata.py b/sdv/metadata.py index 7d853c488..bc4d6fb60 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -716,130 +716,6 @@ def add_table(self, name, primary_key=None, fields=dict(), data=None, parent=Non if parent: self.add_relationship(name, parent, foreign_key) - def remove_table(self, table): - """Remove a table, their childrens and their relationships. - - First, assert that the ``table`` exists. Then, get their childrens and remove them too, - including their relationships. Finally, remove the given ``table``. - - Args: - table (str): - Table to be removed. 
- """ - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - - childrens = self.get_children(table) - for children in list(childrens): - self.remove_table(children) - - parents = self.get_parents(table) - self.remove_relationship(table, list(parents)[0]) - - del self._metadata['tables'][table] - self._get_relationships() - - def remove_relationship(self, table, parent): - """Remove a relationship between a table and her parent. - - By a given ``table`` and ``parent`` remove their relationship. - Also, remove the ``'ref'`` key-value from the ``foreign_key`` field. - - First, assert that the ``table`` and ``parent`` exists. - Then, discard the ``table`` from the childrens of ``parent`` and the ``parent`` from - the parents of ``table``. - Finally, remove the ``'ref'`` key-value from the ``foreign_key`` field. - - Args: - table (str): - Table name to remove their relationship with a parent table. - parent (str): - Table name to remove their relationship with a children table. - """ - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - - if parent not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(parent)) - - if parent in self.get_children(table): - self._delete_foreign_key(table, parent) - - if table in self.get_children(parent): - self._delete_foreign_key(parent, table) - - self._get_relationships() - - def update_field(self, table, field, field_details): - """Update a field from a gibven table. - - First, assert that the ``table`` and ``field`` exists. - Then, validate the ``field_details`` format. Finally, update the field. - - Args: - table (str): - Table name to update the field, it must exist. - field (str): - Field name to be updated, it must exist. - field_details (dict): - Dictionary with the details to be updated. 
- - Raises: - ValueError: - A ``ValueError`` is raised when the ``table`` doesn't exists or the ``field`` - doesn't exists in the table. - """ - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - - if field not in self.get_fields(table).keys(): - raise ValueError('Table {}, field {} doesn\'t exists.'.format(table, field)) - - fields = self.get_fields(table) - fields[field].update(field_details) - self._validate_field(fields[field]) - - def remove_field(self, table, field): - """Remove a field from a given table. - - First, assert that the ``table`` and``field`` exists. - If the field to be removed is the primary key, get their childrens and remove their - reference field. Finally, remove the field and recomptue the tables relationships. - - Args: - table (str): - Table name to remove the field, it must exist. - field (str): - Field name to be removed, it must exist. - - Raises: - ValueError: - A ``ValueError`` is raised when the ``table`` doesn't exists or the ``field`` - doesn't exists in the table. 
- """ - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - - fields = self.get_fields(table) - if field not in fields.keys(): - raise ValueError('Table {}, field {} doesn\'t exists.'.format(table, field)) - - primary_key = self.get_primary_key(table) - if field == primary_key: - self._delete_foreign_key_to(table) - del self.get_table_meta(table)['primary_key'] - - del fields[field] - self._get_relationships() - - def _delete_foreign_key(self, parent, child): - foreign_key = self.get_foreign_key(parent, child) - del self.get_fields(child)[foreign_key] - - def _delete_foreign_key_to(self, table_name): - for children in list(self.get_children(table_name)): - self._delete_foreign_key(table_name, children) - def to_dict(self): return copy.deepcopy(self._metadata) From 745abaacecbcc898c222f3265a7741191ba0d5e3 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Wed, 20 Nov 2019 12:33:15 +0100 Subject: [PATCH 07/16] docstrings --- sdv/metadata.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/sdv/metadata.py b/sdv/metadata.py index bc4d6fb60..5009a533e 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -535,11 +535,11 @@ def _validate_circular_relationships(self, parent, children=None): for grandchild in children: self._validate_circular_relationships(parent, self.get_children(grandchild)) - def add_field(self, table, field, type, subtype, properties): + def add_field(self, table, field, type, subtype=None, properties=dict()): """Add a new field into a given table. - First, assert that the ``table`` exists and the ``field`` does not. - Then, validate the ``field_details`` format. Finally, add the field. + Before add the field validate the ``table`` already exists, the ``field`` does not exist, + ``type`` and ``subtype`` supporteds, and ``properties`` valid properties. 
The error message displayed, when the ``ValueError`` is raised because the ``fields`` already exists in the table, recommends you to use the ``update_fields`` instead. @@ -549,8 +549,13 @@ def add_field(self, table, field, type, subtype, properties): Table name to add the new field, it must exist. field (str): Field name to be added, it must not exist. - field_details (dict): - Dictionary with the details for the new table field. + type (str): + Data type of field to be added. Required. + subtype (str): + Data subtype of field to be added. Not required. Defaults to ``None``. + properties (dict): + Extra properties of field like: ref, format, min, max, etc. Not required. + Defaults to ``dict()``. Raises: ValueError: @@ -567,7 +572,6 @@ def add_field(self, table, field, type, subtype, properties): .format(table, field) ) - # Validate type, subtype and properties field_details = { 'name': field, 'type': type @@ -580,6 +584,8 @@ def add_field(self, table, field, type, subtype, properties): for property_name, property_value in properties.items(): field_details[property_name] = property_value + self._validate_field(field_details) + fields[field] = field_details def add_primary_key(self, table, field): @@ -629,6 +635,7 @@ def add_relationship(self, table, parent, foreign_key=None): Table name to add a new relationship with a children table. foreign_key (str): Field name from the parent table to create the reference in the children table. + Defaults to ``None``. 
""" if table not in self.get_table_names(): raise ValueError('Table "{}" doesn\'t exists.'.format(table)) @@ -720,4 +727,4 @@ def to_dict(self): return copy.deepcopy(self._metadata) def to_json(self): - pass + return json.dumps(self._metadata, indent=4) From c0c23ee627e278fa283a8ba63efd26fd711b7a14 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Wed, 20 Nov 2019 14:05:47 +0100 Subject: [PATCH 08/16] updated docstrings and added integration tests for metadata --- sdv/metadata.py | 45 +++++++---- tests/integration/__init__.py | 0 tests/integration/test_metadata.py | 121 +++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 13 deletions(-) create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/test_metadata.py diff --git a/sdv/metadata.py b/sdv/metadata.py index 5009a533e..700bd6eba 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -621,7 +621,8 @@ def add_relationship(self, table, parent, foreign_key=None): """Add a new relationship between a 2 tables. By a given ``table`` and ``parent`` add a new relationship using ``foreign_key`` - to reference the parent field. + to reference the parent field, when ``foreign_key`` is ``None`` use the primary key + field name as foreign key. First, assert that the ``table`` and ``parent`` exists. Then, assert if already exists a relationship between both tables. @@ -636,6 +637,12 @@ def add_relationship(self, table, parent, foreign_key=None): foreign_key (str): Field name from the parent table to create the reference in the children table. Defaults to ``None``. + + Raises: + ValueError: + A ``ValueError`` is raised when ``table`` or ``parent`` don't exist, + the relationship already exists, ``parent`` is child of ``table`` of + ``parent`` table does not have primary key. 
""" if table not in self.get_table_names(): raise ValueError('Table "{}" doesn\'t exists.'.format(table)) @@ -664,30 +671,42 @@ def _add_table_load_data(self, data): data = data if os.path.exists(data) else os.path.join(self.root_path, data) return pd.read_csv(data) - def add_table(self, name, primary_key=None, fields=dict(), data=None, parent=None, + def add_table(self, name, primary_key=None, fields=None, data=None, parent=None, foreign_key=None): """Add a new table to the metadata. - First, assert that the ``name`` table exists. - Create the table with the table name and an empty fields. - If ``primary_key`` is defined add it to the table. + First, validate that the ``name`` table already exists. - When ``fields`` is a ``dict``, use it to set the ``fields`` key from the table - (fields are validated). - When ``data`` is provided and ``fields`` is not or it's a ``list`` type, - analyze the data for all the columns or just the ``fields`` columns. - If the ``data`` is a ``str`` it should point to a csv file with the data to analazy. - It may be the relative path, if it's then concat the ``root_path`` with the ``data`` path. + When ``fields`` is a ``dict``, ignore ``data`` and validate those fields. + When ``fields`` is a ``list`` and ``data`` is ``None``, raise a ``ValueError``. + When ``fields`` is a ``list`` and ``data`` is not ``None``, analyze data columns in list. + When ``fields`` is ``None`` and ``data`` is not ``None``, analyze all columns. - Finally, if ``parent`` and ``foreign_key`` are provided, create their relationship. + Use the ``fields`` to create the table and add primary key or relationship if needed. Args: name (str): + Table name to be created, it must not exists. primary_key (str): + Field name to add as primary key, it must not exists. Defaults to ``None``. fields (dict or list): + If it's a ``dict``, data is ignored. + If it's a ``list``, indicate which columns will be analized. + Defaults to ``None``. 
data (str or pandas.DataFrame): + Table to be analyzed or path to the csv file. + If it's a relative path, use ``root_path`` to find the file. + Only used if fields is a ``list`` or ``None``. + Defaults to ``None``. parent (str): + Table name to refere a foreign key field. Defaults to ``None``. foreign_key (str): + Foreing key field name to ``parent`` table primary key. Defaults to ``None``. + + Raises: + ValueError: + A ``ValueError`` is raised when the table ``name`` already exists + or ``fields`` is a ``list`` and ``data`` is ``None`` """ if name in self.get_table_names(): raise ValueError('Table "{}" already exists.'.format(name)) @@ -712,7 +731,7 @@ def add_table(self, name, primary_key=None, fields=dict(), data=None, parent=Non fields = Metadata._analyze(data) - table = {'name': name, 'fields': fields} + table = {'name': name, 'fields': fields or dict()} self._metadata['tables'][name] = table if primary_key: diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/integration/test_metadata.py b/tests/integration/test_metadata.py new file mode 100644 index 000000000..c17f67e42 --- /dev/null +++ b/tests/integration/test_metadata.py @@ -0,0 +1,121 @@ +import pandas as pd +import pytest +from sdv import Metadata + + +def get_metadata(): + return Metadata({'tables': dict()}) + + +def test_add_fields_and_primary_key(): + metadata = get_metadata() + + metadata.add_table('a_table') + + metadata.add_field('a_table', 'categoricals', 'categorical') + metadata.add_field('a_table', 'integers', 'numerical', 'integer', {'min': 0, 'max': 10}) + metadata.add_field('a_table', 'floats', 'numerical', 'float') + metadata.add_field('a_table', 'booleans', 'boolean') + + metadata.add_primary_key('a_table', 'index') + + expected_metadata = { + 'tables': { + 'a_table': { + 'name': 'a_table', + 'primary_key': 'index', + 'fields': { + 'categoricals': { + 'name': 'categoricals', + 'type': 'categorical' + 
}, + 'integers': { + 'name': 'integers', + 'type': 'numerical', + 'subtype': 'integer', + 'min': 0, + 'max': 10 + }, + 'floats': { + 'name': 'floats', + 'type': 'numerical', + 'subtype': 'float' + }, + 'booleans': { + 'name': 'booleans', + 'type': 'boolean' + }, + 'index': { + 'name': 'index', + 'type': 'id' + } + } + } + } + } + + assert metadata._metadata == expected_metadata + + +def test_add_table_analyze_all(): + metadata = get_metadata() + + data = pd.DataFrame({ + 'a_field': [0, 1, 2], + 'b_field': ['a', 'b', 'c'], + 'c_field': [True, False, False], + 'd_field': [0., 1., 2.] + }) + + metadata.add_table('a_table', data=data) + + expected_metadata = { + 'tables': { + 'a_table': { + 'name': 'a_table', + 'fields': { + 'a_field': { + 'name': 'a_field', + 'type': 'numerical', + 'subtype': 'integer' + }, + 'b_field': { + 'name': 'b_field', + 'type': 'categorical' + }, + 'c_field': { + 'name': 'c_field', + 'type': 'boolean' + }, + 'd_field': { + 'name': 'd_field', + 'type': 'numerical', + 'subtype': 'float' + } + } + } + } + } + + assert metadata._metadata == expected_metadata + + +def test_add_relationships(): + metadata = get_metadata() + + metadata.add_table('foo', primary_key='index_foo') + metadata.add_table('bar', primary_key='index_bar', parent='foo') + + assert metadata.get_children('foo') == set(['bar']) + assert metadata.get_parents('bar') == set(['foo']) + + +def test_cirtular_dependence_validation(): + metadata = get_metadata() + + metadata.add_table('foo', primary_key='index_foo') + metadata.add_table('bar', primary_key='index_bar', parent='foo') + metadata.add_table('tar', primary_key='index_tar', parent='bar') + + with pytest.raises(ValueError): + metadata.add_relationship('foo', 'tar') From 4e4f2a1071f4988af1826d25d084cc7fa21efee1 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Wed, 20 Nov 2019 14:15:44 +0100 Subject: [PATCH 09/16] fix lint --- tests/integration/test_metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/tests/integration/test_metadata.py b/tests/integration/test_metadata.py index c17f67e42..d8055cdae 100644 --- a/tests/integration/test_metadata.py +++ b/tests/integration/test_metadata.py @@ -1,5 +1,6 @@ import pandas as pd import pytest + from sdv import Metadata From 52422e7ca3bc1ee12703e6d43914ceff3071e857 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 22 Nov 2019 22:14:25 +0100 Subject: [PATCH 10/16] Finish generate metadata development --- docs/metadata.rst | 2 +- ...uickstart - Single Table - In Memory.ipynb | 78 +- .... Quickstart - Single Table - Census.ipynb | 170 ++-- .../3. Quickstart - Multitable - Files.ipynb | 142 +-- examples/4. Quickstart - Anonymization.ipynb | 76 +- ...5. Generate Metadata from Dataframes.ipynb | 278 +++++ examples/README Demo.ipynb | 232 ----- sdv/demo.py | 77 +- sdv/metadata.py | 735 +++++++------- sdv/modeler.py | 2 +- sdv/sampler.py | 7 +- sdv/sdv.py | 2 +- tests/integration/test_metadata.py | 167 +-- tests/test_metadata.py | 954 +++++++++--------- tests/test_modeler.py | 44 +- tests/test_sampler.py | 222 ++-- tests/test_sdv.py | 50 +- 17 files changed, 1599 insertions(+), 1639 deletions(-) create mode 100644 examples/5. Generate Metadata from Dataframes.ipynb delete mode 100644 examples/README Demo.ipynb diff --git a/docs/metadata.rst b/docs/metadata.rst index 5704ff862..0b47b2602 100644 --- a/docs/metadata.rst +++ b/docs/metadata.rst @@ -118,7 +118,7 @@ The available types and subtypes are in this table: +---------------+---------------+ | numerical | float | +---------------+---------------+ -| datetime | datetime | +| datetime | | +---------------+---------------+ | categorical | | +---------------+---------------+ diff --git a/examples/1. Quickstart - Single Table - In Memory.ipynb b/examples/1. Quickstart - Single Table - In Memory.ipynb index 2040ce8b6..142241475 100644 --- a/examples/1. Quickstart - Single Table - In Memory.ipynb +++ b/examples/1. 
Quickstart - Single Table - In Memory.ipynb @@ -232,14 +232,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-03 16:07:31,541 - INFO - modeler - Modeling data\n", - "2019-11-03 16:07:31,542 - INFO - metadata - Loading transformer NumericalTransformer for field integer\n", - "2019-11-03 16:07:31,543 - INFO - metadata - Loading transformer NumericalTransformer for field float\n", - "2019-11-03 16:07:31,543 - INFO - metadata - Loading transformer CategoricalTransformer for field categorical\n", - "2019-11-03 16:07:31,543 - INFO - metadata - Loading transformer BooleanTransformer for field bool\n", - "2019-11-03 16:07:31,544 - INFO - metadata - Loading transformer NumericalTransformer for field nullable\n", - "2019-11-03 16:07:31,544 - INFO - metadata - Loading transformer DatetimeTransformer for field datetime\n", - "2019-11-03 16:07:31,594 - INFO - modeler - Modeling Complete\n" + "2019-11-22 22:10:48,396 - INFO - modeler - Modeling data\n", + "2019-11-22 22:10:48,397 - INFO - metadata - Loading transformer NumericalTransformer for field integer\n", + "2019-11-22 22:10:48,397 - INFO - metadata - Loading transformer NumericalTransformer for field float\n", + "2019-11-22 22:10:48,398 - INFO - metadata - Loading transformer CategoricalTransformer for field categorical\n", + "2019-11-22 22:10:48,398 - INFO - metadata - Loading transformer BooleanTransformer for field bool\n", + "2019-11-22 22:10:48,399 - INFO - metadata - Loading transformer NumericalTransformer for field nullable\n", + "2019-11-22 22:10:48,399 - INFO - metadata - Loading transformer DatetimeTransformer for field datetime\n", + "2019-11-22 22:10:48,450 - INFO - modeler - Modeling Complete\n" ] } ], @@ -292,51 +292,51 @@ " 0\n", " 0\n", " 1\n", - " 0.155202\n", + " 0.155489\n", " a\n", " False\n", - " 5.632725\n", - " 2010-01-14 15:20:28.968422912\n", + " 2.922163\n", + " 2010-01-16 23:55:00.530385152\n", " \n", " \n", " 1\n", " 1\n", " 2\n", - " 0.148088\n", - " b\n", - " True\n", - " 
4.338519\n", - " 2010-01-23 02:27:17.721717760\n", + " 0.190969\n", + " c\n", + " NaN\n", + " NaN\n", + " NaT\n", " \n", " \n", " 2\n", " 2\n", - " 2\n", - " 0.201357\n", + " 1\n", + " 0.112750\n", " a\n", " True\n", - " 3.055583\n", - " 2010-01-27 13:49:01.067935232\n", + " NaN\n", + " 2010-01-13 13:30:29.267090688\n", " \n", " \n", " 3\n", " 3\n", " 2\n", - " 0.192696\n", - " b\n", + " 0.217101\n", + " NaN\n", " False\n", - " 3.399388\n", - " 2010-01-26 18:17:43.376063232\n", + " NaN\n", + " 2010-02-09 05:40:33.603006208\n", " \n", " \n", " 4\n", " 4\n", - " 1\n", - " 0.106991\n", - " a\n", + " 2\n", + " 0.120487\n", + " b\n", " False\n", - " 3.495486\n", - " 2010-01-09 14:46:37.969550592\n", + " NaN\n", + " 2010-01-16 04:21:47.058566656\n", " \n", " \n", "\n", @@ -344,18 +344,18 @@ ], "text/plain": [ " index integer float categorical bool nullable \\\n", - "0 0 1 0.155202 a False 5.632725 \n", - "1 1 2 0.148088 b True 4.338519 \n", - "2 2 2 0.201357 a True 3.055583 \n", - "3 3 2 0.192696 b False 3.399388 \n", - "4 4 1 0.106991 a False 3.495486 \n", + "0 0 1 0.155489 a False 2.922163 \n", + "1 1 2 0.190969 c NaN NaN \n", + "2 2 1 0.112750 a True NaN \n", + "3 3 2 0.217101 NaN False NaN \n", + "4 4 2 0.120487 b False NaN \n", "\n", " datetime \n", - "0 2010-01-14 15:20:28.968422912 \n", - "1 2010-01-23 02:27:17.721717760 \n", - "2 2010-01-27 13:49:01.067935232 \n", - "3 2010-01-26 18:17:43.376063232 \n", - "4 2010-01-09 14:46:37.969550592 " + "0 2010-01-16 23:55:00.530385152 \n", + "1 NaT \n", + "2 2010-01-13 13:30:29.267090688 \n", + "3 2010-02-09 05:40:33.603006208 \n", + "4 2010-01-16 04:21:47.058566656 " ] }, "execution_count": 5, diff --git a/examples/2. Quickstart - Single Table - Census.ipynb b/examples/2. Quickstart - Single Table - Census.ipynb index 003b5f449..00c47050c 100644 --- a/examples/2. Quickstart - Single Table - Census.ipynb +++ b/examples/2. 
Quickstart - Single Table - Census.ipynb @@ -309,23 +309,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-03 16:07:39,430 - INFO - modeler - Modeling census\n", - "2019-11-03 16:07:39,431 - INFO - metadata - Loading transformer NumericalTransformer for field age\n", - "2019-11-03 16:07:39,431 - INFO - metadata - Loading transformer CategoricalTransformer for field workclass\n", - "2019-11-03 16:07:39,432 - INFO - metadata - Loading transformer NumericalTransformer for field fnlwgt\n", - "2019-11-03 16:07:39,432 - INFO - metadata - Loading transformer CategoricalTransformer for field education\n", - "2019-11-03 16:07:39,433 - INFO - metadata - Loading transformer NumericalTransformer for field education-num\n", - "2019-11-03 16:07:39,433 - INFO - metadata - Loading transformer CategoricalTransformer for field marital-status\n", - "2019-11-03 16:07:39,433 - INFO - metadata - Loading transformer CategoricalTransformer for field occupation\n", - "2019-11-03 16:07:39,434 - INFO - metadata - Loading transformer CategoricalTransformer for field relationship\n", - "2019-11-03 16:07:39,434 - INFO - metadata - Loading transformer CategoricalTransformer for field race\n", - "2019-11-03 16:07:39,435 - INFO - metadata - Loading transformer CategoricalTransformer for field sex\n", - "2019-11-03 16:07:39,435 - INFO - metadata - Loading transformer NumericalTransformer for field capital-gain\n", - "2019-11-03 16:07:39,436 - INFO - metadata - Loading transformer NumericalTransformer for field capital-loss\n", - "2019-11-03 16:07:39,436 - INFO - metadata - Loading transformer NumericalTransformer for field hours-per-week\n", - "2019-11-03 16:07:39,437 - INFO - metadata - Loading transformer CategoricalTransformer for field native-country\n", - "2019-11-03 16:07:39,437 - INFO - metadata - Loading transformer CategoricalTransformer for field income\n", - "2019-11-03 16:07:39,681 - INFO - modeler - Modeling Complete\n" + "2019-11-22 22:11:07,542 - INFO - modeler - 
Modeling census\n", + "2019-11-22 22:11:07,543 - INFO - metadata - Loading transformer NumericalTransformer for field age\n", + "2019-11-22 22:11:07,543 - INFO - metadata - Loading transformer CategoricalTransformer for field workclass\n", + "2019-11-22 22:11:07,544 - INFO - metadata - Loading transformer NumericalTransformer for field fnlwgt\n", + "2019-11-22 22:11:07,544 - INFO - metadata - Loading transformer CategoricalTransformer for field education\n", + "2019-11-22 22:11:07,544 - INFO - metadata - Loading transformer NumericalTransformer for field education-num\n", + "2019-11-22 22:11:07,545 - INFO - metadata - Loading transformer CategoricalTransformer for field marital-status\n", + "2019-11-22 22:11:07,545 - INFO - metadata - Loading transformer CategoricalTransformer for field occupation\n", + "2019-11-22 22:11:07,545 - INFO - metadata - Loading transformer CategoricalTransformer for field relationship\n", + "2019-11-22 22:11:07,546 - INFO - metadata - Loading transformer CategoricalTransformer for field race\n", + "2019-11-22 22:11:07,546 - INFO - metadata - Loading transformer CategoricalTransformer for field sex\n", + "2019-11-22 22:11:07,547 - INFO - metadata - Loading transformer NumericalTransformer for field capital-gain\n", + "2019-11-22 22:11:07,548 - INFO - metadata - Loading transformer NumericalTransformer for field capital-loss\n", + "2019-11-22 22:11:07,549 - INFO - metadata - Loading transformer NumericalTransformer for field hours-per-week\n", + "2019-11-22 22:11:07,549 - INFO - metadata - Loading transformer CategoricalTransformer for field native-country\n", + "2019-11-22 22:11:07,550 - INFO - metadata - Loading transformer CategoricalTransformer for field income\n", + "2019-11-22 22:11:07,799 - INFO - modeler - Modeling Complete\n" ] } ], @@ -382,93 +382,93 @@ " \n", " \n", " 0\n", - " 37\n", + " 54\n", " Private\n", - " 301988\n", - " Bachelors\n", - " 13\n", + " 4837\n", + " HS-grad\n", + " 12\n", " Married-civ-spouse\n", - " 
Adm-clerical\n", - " Own-child\n", + " Protective-serv\n", + " Husband\n", " White\n", - " Female\n", - " 3539\n", - " 266\n", - " 36\n", + " Male\n", + " 1541\n", + " 20\n", + " 52\n", " United-States\n", " <=50K\n", " \n", " \n", " 1\n", - " 18\n", + " 41\n", " Private\n", - " 303023\n", - " HS-grad\n", - " 15\n", + " 260756\n", + " Some-college\n", + " 10\n", " Never-married\n", - " Craft-repair\n", + " Transport-moving\n", " Not-in-family\n", " White\n", " Male\n", - " -2356\n", - " 154\n", - " 48\n", + " 1958\n", + " -656\n", + " 43\n", " United-States\n", - " <=50K\n", + " >50K\n", " \n", " \n", " 2\n", - " 23\n", + " 38\n", " Private\n", - " 194623\n", - " Some-college\n", - " 7\n", + " 211042\n", + " Bachelors\n", + " 10\n", " Never-married\n", " Sales\n", - " Not-in-family\n", + " Own-child\n", " White\n", - " Male\n", - " 2494\n", - " 337\n", - " 61\n", + " Female\n", + " -2265\n", + " -107\n", + " 42\n", " United-States\n", " <=50K\n", " \n", " \n", " 3\n", - " 41\n", - " Private\n", - " 33751\n", - " Assoc-voc\n", - " 5\n", - " Widowed\n", - " Prof-specialty\n", + " 45\n", + " Self-emp-not-inc\n", + " 393251\n", + " Bachelors\n", + " 10\n", + " Never-married\n", + " Craft-repair\n", " Husband\n", " White\n", - " Female\n", - " -2441\n", - " -431\n", - " 21\n", + " Male\n", + " 5538\n", + " -128\n", + " 27\n", " United-States\n", " <=50K\n", " \n", " \n", " 4\n", - " 46\n", - " Self-emp-not-inc\n", - " 180144\n", - " Bachelors\n", - " 13\n", + " 45\n", + " Private\n", + " 153962\n", + " HS-grad\n", + " 9\n", " Never-married\n", - " Prof-specialty\n", + " Exec-managerial\n", " Husband\n", " White\n", " Male\n", - " 3570\n", - " 546\n", - " 40\n", + " 2032\n", + " 780\n", + " 27\n", " United-States\n", - " >50K\n", + " <=50K\n", " \n", " \n", "\n", @@ -476,25 +476,25 @@ ], "text/plain": [ " age workclass fnlwgt education education-num \\\n", - "0 37 Private 301988 Bachelors 13 \n", - "1 18 Private 303023 HS-grad 15 \n", - "2 23 Private 194623 Some-college 
7 \n", - "3 41 Private 33751 Assoc-voc 5 \n", - "4 46 Self-emp-not-inc 180144 Bachelors 13 \n", + "0 54 Private 4837 HS-grad 12 \n", + "1 41 Private 260756 Some-college 10 \n", + "2 38 Private 211042 Bachelors 10 \n", + "3 45 Self-emp-not-inc 393251 Bachelors 10 \n", + "4 45 Private 153962 HS-grad 9 \n", "\n", - " marital-status occupation relationship race sex \\\n", - "0 Married-civ-spouse Adm-clerical Own-child White Female \n", - "1 Never-married Craft-repair Not-in-family White Male \n", - "2 Never-married Sales Not-in-family White Male \n", - "3 Widowed Prof-specialty Husband White Female \n", - "4 Never-married Prof-specialty Husband White Male \n", + " marital-status occupation relationship race sex \\\n", + "0 Married-civ-spouse Protective-serv Husband White Male \n", + "1 Never-married Transport-moving Not-in-family White Male \n", + "2 Never-married Sales Own-child White Female \n", + "3 Never-married Craft-repair Husband White Male \n", + "4 Never-married Exec-managerial Husband White Male \n", "\n", " capital-gain capital-loss hours-per-week native-country income \n", - "0 3539 266 36 United-States <=50K \n", - "1 -2356 154 48 United-States <=50K \n", - "2 2494 337 61 United-States <=50K \n", - "3 -2441 -431 21 United-States <=50K \n", - "4 3570 546 40 United-States >50K " + "0 1541 20 52 United-States <=50K \n", + "1 1958 -656 43 United-States >50K \n", + "2 -2265 -107 42 United-States <=50K \n", + "3 5538 -128 27 United-States <=50K \n", + "4 2032 780 27 United-States <=50K " ] }, "execution_count": 7, @@ -523,9 +523,9 @@ { "data": { "text/plain": [ - "mse 438.13273527481414\n", - "rmse 20.93162046461798\n", - "r2_score 0.9994061495940457\n", + "mse 403.8003241991141\n", + "rmse 20.094783507147174\n", + "r2_score 0.9994524190196646\n", "dtype: object" ] }, diff --git a/examples/3. Quickstart - Multitable - Files.ipynb b/examples/3. Quickstart - Multitable - Files.ipynb index 7ba0ca259..cad162e9b 100644 --- a/examples/3. 
Quickstart - Multitable - Files.ipynb +++ b/examples/3. Quickstart - Multitable - Files.ipynb @@ -11,21 +11,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-05 20:59:38,774 - INFO - modeler - Modeling customers\n", - "2019-11-05 20:59:38,775 - INFO - metadata - Loading table customers\n", - "2019-11-05 20:59:38,783 - INFO - metadata - Loading transformer CategoricalTransformer for field cust_postal_code\n", - "2019-11-05 20:59:38,784 - INFO - metadata - Loading transformer NumericalTransformer for field phone_number1\n", - "2019-11-05 20:59:38,784 - INFO - metadata - Loading transformer NumericalTransformer for field credit_limit\n", - "2019-11-05 20:59:38,785 - INFO - metadata - Loading transformer CategoricalTransformer for field country\n", - "2019-11-05 20:59:38,802 - INFO - modeler - Modeling orders\n", - "2019-11-05 20:59:38,803 - INFO - metadata - Loading table orders\n", - "2019-11-05 20:59:38,805 - INFO - metadata - Loading transformer NumericalTransformer for field order_total\n", - "2019-11-05 20:59:38,809 - INFO - modeler - Modeling order_items\n", - "2019-11-05 20:59:38,809 - INFO - metadata - Loading table order_items\n", - "2019-11-05 20:59:38,813 - INFO - metadata - Loading transformer CategoricalTransformer for field product_id\n", - "2019-11-05 20:59:38,814 - INFO - metadata - Loading transformer NumericalTransformer for field unit_price\n", - "2019-11-05 20:59:38,814 - INFO - metadata - Loading transformer NumericalTransformer for field quantity\n", - "2019-11-05 20:59:39,477 - INFO - modeler - Modeling Complete\n" + "2019-11-22 22:11:44,765 - INFO - modeler - Modeling customers\n", + "2019-11-22 22:11:44,765 - INFO - metadata - Loading table customers\n", + "2019-11-22 22:11:44,772 - INFO - metadata - Loading transformer CategoricalTransformer for field cust_postal_code\n", + "2019-11-22 22:11:44,772 - INFO - metadata - Loading transformer NumericalTransformer for field phone_number1\n", + "2019-11-22 22:11:44,773 - INFO - 
metadata - Loading transformer NumericalTransformer for field credit_limit\n", + "2019-11-22 22:11:44,773 - INFO - metadata - Loading transformer CategoricalTransformer for field country\n", + "2019-11-22 22:11:44,790 - INFO - modeler - Modeling orders\n", + "2019-11-22 22:11:44,791 - INFO - metadata - Loading table orders\n", + "2019-11-22 22:11:44,795 - INFO - metadata - Loading transformer NumericalTransformer for field order_total\n", + "2019-11-22 22:11:44,799 - INFO - modeler - Modeling order_items\n", + "2019-11-22 22:11:44,799 - INFO - metadata - Loading table order_items\n", + "2019-11-22 22:11:44,804 - INFO - metadata - Loading transformer CategoricalTransformer for field product_id\n", + "2019-11-22 22:11:44,805 - INFO - metadata - Loading transformer NumericalTransformer for field unit_price\n", + "2019-11-22 22:11:44,805 - INFO - metadata - Loading transformer NumericalTransformer for field quantity\n", + "2019-11-22 22:11:45,470 - INFO - modeler - Modeling Complete\n" ] } ], @@ -45,14 +45,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-05 20:59:40,118 - INFO - metadata - Loading table customers\n", - "2019-11-05 20:59:40,124 - INFO - metadata - Loading table orders\n", - "2019-11-05 20:59:40,130 - INFO - metadata - Loading table order_items\n" + "2019-11-22 22:11:45,521 - INFO - metadata - Loading table customers\n", + "2019-11-22 22:11:45,524 - INFO - metadata - Loading table orders\n", + "2019-11-22 22:11:45,526 - INFO - metadata - Loading table order_items\n" ] } ], "source": [ - "real = sdv.metadata.get_tables()\n", + "real = sdv.metadata.load_tables()\n", "\n", "samples = sdv.sample_all(len(real['customers']), reset_primary_keys=True)" ] @@ -97,41 +97,41 @@ " 0\n", " 0\n", " 6096\n", - " 5540335729\n", - " 727\n", - " SPAIN\n", + " 5887096836\n", + " 822\n", + " FRANCE\n", " \n", " \n", " 1\n", " 1\n", - " 63145\n", - " 7098501621\n", - " 1469\n", + " 11371\n", + " 6798799259\n", + " 901\n", " UK\n", " \n", " \n", " 2\n", " 
2\n", - " 63145\n", - " 4805710737\n", - " 934\n", - " UK\n", + " 11371\n", + " 5947789567\n", + " 535\n", + " SPAIN\n", " \n", " \n", " 3\n", " 3\n", - " 63145\n", - " 9245642905\n", - " 1302\n", - " US\n", + " 20166\n", + " 6100829678\n", + " 1016\n", + " CANADA\n", " \n", " \n", " 4\n", " 4\n", " 63145\n", - " 8347089849\n", - " 890\n", - " UK\n", + " 8061087361\n", + " 917\n", + " US\n", " \n", " \n", "\n", @@ -139,11 +139,11 @@ ], "text/plain": [ " customer_id cust_postal_code phone_number1 credit_limit country\n", - "0 0 6096 5540335729 727 SPAIN\n", - "1 1 63145 7098501621 1469 UK\n", - "2 2 63145 4805710737 934 UK\n", - "3 3 63145 9245642905 1302 US\n", - "4 4 63145 8347089849 890 UK" + "0 0 6096 5887096836 822 FRANCE\n", + "1 1 11371 6798799259 901 UK\n", + "2 2 11371 5947789567 535 SPAIN\n", + "3 3 20166 6100829678 1016 CANADA\n", + "4 4 63145 8061087361 917 US" ] }, "execution_count": 3, @@ -287,31 +287,31 @@ " 0\n", " 0\n", " 0\n", - " 871\n", + " 1201\n", " \n", " \n", " 1\n", " 1\n", " 0\n", - " 531\n", + " 1963\n", " \n", " \n", " 2\n", " 2\n", " 1\n", - " 654\n", + " 740\n", " \n", " \n", " 3\n", " 3\n", - " 3\n", - " 49\n", + " 1\n", + " 978\n", " \n", " \n", " 4\n", " 4\n", " 3\n", - " -2557\n", + " 897\n", " \n", " \n", "\n", @@ -319,11 +319,11 @@ ], "text/plain": [ " order_id customer_id order_total\n", - "0 0 0 871\n", - "1 1 0 531\n", - "2 2 1 654\n", - "3 3 3 49\n", - "4 4 3 -2557" + "0 0 0 1201\n", + "1 1 0 1963\n", + "2 2 1 740\n", + "3 3 1 978\n", + "4 4 3 897" ] }, "execution_count": 5, @@ -457,41 +457,41 @@ " 0\n", " 0\n", " 0\n", - " 6\n", - " 147\n", - " 2\n", + " 9\n", + " 58\n", + " 1\n", " \n", " \n", " 1\n", " 1\n", " 0\n", " 10\n", - " 147\n", - " 3\n", + " 79\n", + " 4\n", " \n", " \n", " 2\n", " 2\n", " 0\n", - " 8\n", - " 137\n", - " -1\n", + " 6\n", + " -4\n", + " 2\n", " \n", " \n", " 3\n", " 3\n", " 0\n", - " 2\n", - " 159\n", - " 3\n", + " 10\n", + " 108\n", + " 5\n", " \n", " \n", " 4\n", " 4\n", - " 1\n", - " 6\n", - " 
44\n", + " 0\n", + " 10\n", " 4\n", + " 2\n", " \n", " \n", "\n", @@ -499,11 +499,11 @@ ], "text/plain": [ " order_item_id order_id product_id unit_price quantity\n", - "0 0 0 6 147 2\n", - "1 1 0 10 147 3\n", - "2 2 0 8 137 -1\n", - "3 3 0 2 159 3\n", - "4 4 1 6 44 4" + "0 0 0 9 58 1\n", + "1 1 0 10 79 4\n", + "2 2 0 6 -4 2\n", + "3 3 0 10 108 5\n", + "4 4 0 10 4 2" ] }, "execution_count": 7, diff --git a/examples/4. Quickstart - Anonymization.ipynb b/examples/4. Quickstart - Anonymization.ipynb index dfa1c8d92..a397d7e03 100644 --- a/examples/4. Quickstart - Anonymization.ipynb +++ b/examples/4. Quickstart - Anonymization.ipynb @@ -86,7 +86,7 @@ " 3\n", " 8888888888888888\n", " 4\n", - " Jeff\n", + " Joe\n", " \n", " \n", "\n", @@ -97,7 +97,7 @@ "0 1111222233334444 1 Bill\n", "1 0000000000000000 2 Jeff\n", "2 9999999999999999 3 Bill\n", - "3 8888888888888888 4 Jeff" + "3 8888888888888888 4 Joe" ] }, "execution_count": 2, @@ -185,13 +185,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-05 21:41:17,816 - INFO - modeler - Modeling anonymized\n", - "2019-11-05 21:41:17,817 - INFO - metadata - Loading transformer CategoricalTransformer for field name\n", - "2019-11-05 21:41:17,817 - INFO - metadata - Loading transformer CategoricalTransformer for field credit_card_number\n", - "2019-11-05 21:41:17,861 - INFO - modeler - Modeling normal\n", - "2019-11-05 21:41:17,862 - INFO - metadata - Loading transformer CategoricalTransformer for field name\n", - "2019-11-05 21:41:17,862 - INFO - metadata - Loading transformer CategoricalTransformer for field credit_card_number\n", - "2019-11-05 21:41:17,882 - INFO - modeler - Modeling Complete\n" + "2019-11-22 22:11:58,851 - INFO - modeler - Modeling anonymized\n", + "2019-11-22 22:11:58,852 - INFO - metadata - Loading transformer CategoricalTransformer for field name\n", + "2019-11-22 22:11:58,852 - INFO - metadata - Loading transformer CategoricalTransformer for field credit_card_number\n", + "2019-11-22 
22:11:58,901 - INFO - modeler - Modeling normal\n", + "2019-11-22 22:11:58,901 - INFO - metadata - Loading transformer CategoricalTransformer for field name\n", + "2019-11-22 22:11:58,902 - INFO - metadata - Loading transformer CategoricalTransformer for field credit_card_number\n", + "2019-11-22 22:11:58,922 - INFO - modeler - Modeling Complete\n" ] } ], @@ -250,44 +250,44 @@ " \n", " 0\n", " 0\n", - " John\n", - " 4234470288705895\n", + " Karen\n", + " 4296901743681193\n", " \n", " \n", " 1\n", " 1\n", - " John\n", - " 4141782321030111\n", + " Samantha\n", + " 4296901743681193\n", " \n", " \n", " 2\n", " 2\n", - " David\n", - " 4478981940903962\n", + " Karen\n", + " 4271300383612346\n", " \n", " \n", " 3\n", " 3\n", - " David\n", - " 4478981940903962\n", + " Karen\n", + " 4087112820119247\n", " \n", " \n", " 4\n", " 4\n", - " David\n", - " 4478981940903962\n", + " Samantha\n", + " 4306519196126983\n", " \n", " \n", "\n", "" ], "text/plain": [ - " index name credit_card_number\n", - "0 0 John 4234470288705895\n", - "1 1 John 4141782321030111\n", - "2 2 David 4478981940903962\n", - "3 3 David 4478981940903962\n", - "4 4 David 4478981940903962" + " index name credit_card_number\n", + "0 0 Karen 4296901743681193\n", + "1 1 Samantha 4296901743681193\n", + "2 2 Karen 4271300383612346\n", + "3 3 Karen 4087112820119247\n", + "4 4 Samantha 4306519196126983" ] }, "execution_count": 7, @@ -336,32 +336,32 @@ " \n", " 0\n", " 0\n", - " Jeff\n", - " 9999999999999999\n", + " Bill\n", + " 1111222233334444\n", " \n", " \n", " 1\n", " 1\n", - " Bill\n", - " 9999999999999999\n", + " Joe\n", + " 1111222233334444\n", " \n", " \n", " 2\n", " 2\n", - " Jeff\n", - " 1111222233334444\n", + " Bill\n", + " 8888888888888888\n", " \n", " \n", " 3\n", " 3\n", " Jeff\n", - " 0000000000000000\n", + " 1111222233334444\n", " \n", " \n", " 4\n", " 4\n", " Bill\n", - " 9999999999999999\n", + " 8888888888888888\n", " \n", " \n", "\n", @@ -369,11 +369,11 @@ ], "text/plain": [ " index name 
credit_card_number\n", - "0 0 Jeff 9999999999999999\n", - "1 1 Bill 9999999999999999\n", - "2 2 Jeff 1111222233334444\n", - "3 3 Jeff 0000000000000000\n", - "4 4 Bill 9999999999999999" + "0 0 Bill 1111222233334444\n", + "1 1 Joe 1111222233334444\n", + "2 2 Bill 8888888888888888\n", + "3 3 Jeff 1111222233334444\n", + "4 4 Bill 8888888888888888" ] }, "execution_count": 8, diff --git a/examples/5. Generate Metadata from Dataframes.ipynb b/examples/5. Generate Metadata from Dataframes.ipynb new file mode 100644 index 000000000..8285cf460 --- /dev/null +++ b/examples/5. Generate Metadata from Dataframes.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sdv import load_demo" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "metadata, tables = load_demo()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'tables': {'users': {'primary_key': 'user_id',\n", + " 'fields': {'user_id': {'type': 'id', 'subtype': 'integer'},\n", + " 'country': {'type': 'categorical'},\n", + " 'gender': {'type': 'categorical'},\n", + " 'age': {'type': 'numerical', 'subtype': 'integer'}}},\n", + " 'sessions': {'primary_key': 'session_id',\n", + " 'fields': {'session_id': {'type': 'id', 'subtype': 'integer'},\n", + " 'user_id': {'ref': {'field': 'user_id', 'table': 'users'},\n", + " 'type': 'id',\n", + " 'subtype': 'integer'},\n", + " 'device': {'type': 'categorical'},\n", + " 'os': {'type': 'categorical'}}},\n", + " 'transactions': {'primary_key': 'transaction_id',\n", + " 'fields': {'transaction_id': {'type': 'id', 'subtype': 'integer'},\n", + " 'session_id': {'ref': {'field': 'session_id', 'table': 'sessions'},\n", + " 'type': 'id',\n", + " 'subtype': 'integer'},\n", + " 'timestamp': {'type': 'datetime', 'format': '%Y-%m-%d'},\n", + " 'amount': {'type': 
'numerical', 'subtype': 'float'},\n", + " 'approved': {'type': 'boolean'}}}}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'users': user_id country gender age\n", + " 0 0 USA M 34\n", + " 1 1 UK F 23\n", + " 2 2 ES None 44\n", + " 3 3 UK M 22\n", + " 4 4 USA F 54\n", + " 5 5 DE M 57\n", + " 6 6 BG F 45\n", + " 7 7 ES None 41\n", + " 8 8 FR F 23\n", + " 9 9 UK None 30,\n", + " 'sessions': session_id user_id device os\n", + " 0 0 0 mobile android\n", + " 1 1 1 tablet ios\n", + " 2 2 1 tablet android\n", + " 3 3 2 mobile android\n", + " 4 4 4 mobile ios\n", + " 5 5 5 mobile android\n", + " 6 6 6 mobile ios\n", + " 7 7 6 tablet ios\n", + " 8 8 6 mobile ios\n", + " 9 9 8 tablet ios,\n", + " 'transactions': transaction_id session_id timestamp amount approved\n", + " 0 0 0 2019-01-01 12:34:32 100.0 True\n", + " 1 1 0 2019-01-01 12:42:21 55.3 True\n", + " 2 2 1 2019-01-07 17:23:11 79.5 True\n", + " 3 3 3 2019-01-10 11:08:57 112.1 False\n", + " 4 4 5 2019-01-10 21:54:08 110.0 False\n", + " 5 5 5 2019-01-11 11:21:20 76.3 True\n", + " 6 6 7 2019-01-22 14:44:10 89.5 True\n", + " 7 7 8 2019-01-23 10:14:09 132.1 False\n", + " 8 8 9 2019-01-27 16:09:17 68.0 True\n", + " 9 9 9 2019-01-29 12:10:48 99.9 True}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tables" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sdv import Metadata\n", + "\n", + "new_meta = Metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "new_meta.add_table('users', data=tables['users'], primary_key='user_id')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + 
"source": [ + "new_meta.add_table('sessions', data=tables['sessions'], primary_key='session_id',\n", + " parent='users', foreign_key='user_id')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "transactions_fields = {\n", + " 'timestamp': {\n", + " 'type': 'datetime',\n", + " 'format': '%Y-%m-%d'\n", + " }\n", + "}\n", + "new_meta.add_table('transactions', tables['transactions'], fields_metadata=transactions_fields,\n", + " primary_key='transaction_id', parent='sessions')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'tables': {'users': {'fields': {'gender': {'type': 'categorical'},\n", + " 'country': {'type': 'categorical'},\n", + " 'age': {'type': 'numerical', 'subtype': 'integer'},\n", + " 'user_id': {'type': 'id', 'subtype': 'integer'}},\n", + " 'primary_key': 'user_id'},\n", + " 'sessions': {'fields': {'device': {'type': 'categorical'},\n", + " 'session_id': {'type': 'id', 'subtype': 'integer'},\n", + " 'os': {'type': 'categorical'},\n", + " 'user_id': {'type': 'id',\n", + " 'subtype': 'integer',\n", + " 'ref': {'table': 'users', 'field': 'user_id'}}},\n", + " 'primary_key': 'session_id'},\n", + " 'transactions': {'fields': {'timestamp': {'type': 'datetime',\n", + " 'format': '%Y-%m-%d'},\n", + " 'amount': {'type': 'numerical', 'subtype': 'float'},\n", + " 'transaction_id': {'type': 'id', 'subtype': 'integer'},\n", + " 'session_id': {'type': 'id',\n", + " 'subtype': 'integer',\n", + " 'ref': {'table': 'sessions', 'field': 'session_id'}},\n", + " 'approved': {'type': 'boolean'}},\n", + " 'primary_key': 'transaction_id'}}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_meta.to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 
10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_meta.to_dict() == metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "new_meta.to_json('demo_metadata.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "loaded = Metadata('demo_metadata.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loaded.to_dict() == new_meta.to_dict()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/README Demo.ipynb b/examples/README Demo.ipynb deleted file mode 100644 index 43deb4d41..000000000 --- a/examples/README Demo.ipynb +++ /dev/null @@ -1,232 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from sdv import load_demo" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "metadata, tables = load_demo()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'tables': [{'name': 'users',\n", - " 'primary_key': 'user_id',\n", - " 'fields': [{'name': 'user_id', 'type': 'id'},\n", - " {'name': 'country', 'type': 'categorical'},\n", - " {'name': 'gender', 'type': 'categorical'},\n", - " {'name': 'age', 'type': 
'numerical', 'subtype': 'integer'}]},\n", - " {'name': 'sessions',\n", - " 'primary_key': 'session_id',\n", - " 'fields': [{'name': 'session_id', 'type': 'id'},\n", - " {'name': 'user_id',\n", - " 'ref': {'field': 'user_id', 'table': 'users'},\n", - " 'type': 'id'},\n", - " {'name': 'device', 'type': 'categorical'},\n", - " {'name': 'os', 'type': 'categorical'}]},\n", - " {'name': 'transactions',\n", - " 'primary_key': 'transaction_id',\n", - " 'fields': [{'name': 'transaction_id', 'type': 'id'},\n", - " {'name': 'session_id',\n", - " 'ref': {'field': 'session_id', 'table': 'sessions'},\n", - " 'type': 'id'},\n", - " {'name': 'timestamp', 'type': 'datetime', 'format': '%Y-%m-%d'},\n", - " {'name': 'amount', 'type': 'numerical', 'subtype': 'float'},\n", - " {'name': 'approved', 'type': 'boolean'}]}]}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'users': user_id country gender age\n", - " 0 0 USA M 34\n", - " 1 1 UK F 23\n", - " 2 2 ES None 44\n", - " 3 3 UK M 22\n", - " 4 4 USA F 54\n", - " 5 5 DE M 57\n", - " 6 6 BG F 45\n", - " 7 7 ES None 41\n", - " 8 8 FR F 23\n", - " 9 9 UK None 30,\n", - " 'sessions': session_id user_id device os\n", - " 0 0 0 mobile android\n", - " 1 1 1 tablet ios\n", - " 2 2 1 tablet android\n", - " 3 3 2 mobile android\n", - " 4 4 4 mobile ios\n", - " 5 5 5 mobile android\n", - " 6 6 6 mobile ios\n", - " 7 7 6 table ios\n", - " 8 8 6 mobile ios\n", - " 9 9 8 tablet ios,\n", - " 'transactions': transaction_id session_id timestamp amount approved\n", - " 0 0 0 2019-01-01 12:34:32 100.0 True\n", - " 1 1 0 2019-01-01 12:42:21 55.3 True\n", - " 2 2 1 2019-01-07 17:23:11 79.5 True\n", - " 3 3 3 2019-01-10 11:08:57 112.1 False\n", - " 4 4 5 2019-01-10 21:54:08 110.0 False\n", - " 5 5 5 2019-01-11 11:21:20 76.3 True\n", - " 6 6 7 2019-01-22 
14:44:10 89.5 True\n", - " 7 7 8 2019-01-23 10:14:09 132.1 False\n", - " 8 8 9 2019-01-27 16:09:17 68.0 True\n", - " 9 9 9 2019-01-29 12:10:48 99.9 True}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tables" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2019-11-03 16:08:01,089 - INFO - modeler - Modeling users\n", - "2019-11-03 16:08:01,090 - INFO - metadata - Loading transformer CategoricalTransformer for field country\n", - "2019-11-03 16:08:01,091 - INFO - metadata - Loading transformer CategoricalTransformer for field gender\n", - "2019-11-03 16:08:01,092 - INFO - metadata - Loading transformer NumericalTransformer for field age\n", - "2019-11-03 16:08:01,108 - INFO - modeler - Modeling sessions\n", - "2019-11-03 16:08:01,109 - INFO - metadata - Loading transformer CategoricalTransformer for field device\n", - "2019-11-03 16:08:01,109 - INFO - metadata - Loading transformer CategoricalTransformer for field os\n", - "2019-11-03 16:08:01,121 - INFO - modeler - Modeling transactions\n", - "2019-11-03 16:08:01,122 - INFO - metadata - Loading transformer DatetimeTransformer for field timestamp\n", - "2019-11-03 16:08:01,122 - INFO - metadata - Loading transformer NumericalTransformer for field amount\n", - "2019-11-03 16:08:01,122 - INFO - metadata - Loading transformer BooleanTransformer for field approved\n", - "/home/xals/.virtualenvs/SDV/lib/python3.6/site-packages/pandas/core/frame.py:7143: RuntimeWarning: Degrees of freedom <= 0 for slice\n", - " baseCov = np.cov(mat.T)\n", - "/home/xals/.virtualenvs/SDV/lib/python3.6/site-packages/numpy/lib/function_base.py:2451: RuntimeWarning: divide by zero encountered in true_divide\n", - " c *= np.true_divide(1, fact)\n", - "/home/xals/.virtualenvs/SDV/lib/python3.6/site-packages/numpy/lib/function_base.py:2451: RuntimeWarning: invalid value 
encountered in multiply\n", - " c *= np.true_divide(1, fact)\n", - "2019-11-03 16:08:01,883 - INFO - modeler - Modeling Complete\n" - ] - } - ], - "source": [ - "from sdv import SDV\n", - "\n", - "sdv = SDV()\n", - "sdv.fit(metadata, tables)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'users': user_id country gender age\n", - " 0 0 UK M 38\n", - " 1 1 FR F 49\n", - " 2 2 BG F 53\n", - " 3 3 UK NaN 40\n", - " 4 4 ES M 37,\n", - " 'sessions': session_id user_id device os\n", - " 0 0 0 mobile android\n", - " 1 1 1 mobile ios\n", - " 2 2 1 table ios\n", - " 3 3 2 mobile ios\n", - " 4 4 4 tablet ios,\n", - " 'transactions': transaction_id session_id timestamp amount \\\n", - " 0 0 2 2019-01-11 20:04:41.045274880 91.528609 \n", - " 1 1 2 2019-01-11 20:04:41.295124992 81.582317 \n", - " 2 2 2 2019-01-11 20:04:40.718006016 101.129837 \n", - " 3 3 2 2019-01-11 20:04:41.010093824 87.197737 \n", - " 4 4 2 2019-01-11 20:04:41.037239040 97.864023 \n", - " 5 5 2 2019-01-11 20:04:41.007366656 92.988464 \n", - " 6 6 2 2019-01-11 20:04:41.101156864 85.344030 \n", - " 7 7 2 2019-01-11 20:04:40.705912064 102.165910 \n", - " 8 8 2 2019-01-11 20:04:41.270179584 96.879576 \n", - " 9 9 2 2019-01-11 20:04:41.050921216 78.120009 \n", - " 10 10 2 2019-01-11 20:04:40.803478016 90.263667 \n", - " 11 11 2 2019-01-11 20:04:40.929881600 78.976379 \n", - " \n", - " approved \n", - " 0 True \n", - " 1 True \n", - " 2 True \n", - " 3 True \n", - " 4 True \n", - " 5 True \n", - " 6 True \n", - " 7 True \n", - " 8 True \n", - " 9 True \n", - " 10 True \n", - " 11 True }" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sdv.sample_all()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - 
"file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/sdv/demo.py b/sdv/demo.py index 749cb4cf4..7f9c895dc 100644 --- a/sdv/demo.py +++ b/sdv/demo.py @@ -1,89 +1,78 @@ import pandas as pd DEMO_METADATA = { - 'tables': [ - { - 'name': 'users', + 'tables': { + 'users': { 'primary_key': 'user_id', - 'fields': [ - { - 'name': 'user_id', + 'fields': { + 'user_id': { 'type': 'id', + 'subtype': 'integer' }, - { - 'name': 'country', - 'type': 'categorical', + 'country': { + 'type': 'categorical' }, - { - 'name': 'gender', - 'type': 'categorical', + 'gender': { + 'type': 'categorical' }, - { - 'name': 'age', + 'age': { 'type': 'numerical', 'subtype': 'integer' } - ] + } }, - { - 'name': 'sessions', + 'sessions': { 'primary_key': 'session_id', - 'fields': [ - { - 'name': 'session_id', + 'fields': { + 'session_id': { 'type': 'id', + 'subtype': 'integer' }, - { - 'name': 'user_id', + 'user_id': { 'ref': { 'field': 'user_id', 'table': 'users' }, 'type': 'id', + 'subtype': 'integer' }, - { - 'name': 'device', - 'type': 'categorical', + 'device': { + 'type': 'categorical' }, - { - 'name': 'os', - 'type': 'categorical', + 'os': { + 'type': 'categorical' } - ] + } }, - { - 'name': 'transactions', + 'transactions': { 'primary_key': 'transaction_id', - 'fields': [ - { - 'name': 'transaction_id', + 'fields': { + 'transaction_id': { 'type': 'id', + 'subtype': 'integer' }, - { - 'name': 'session_id', + 'session_id': { 'ref': { 'field': 'session_id', 'table': 'sessions' }, 'type': 'id', + 'subtype': 'integer' }, - { - 'name': 'timestamp', + 'timestamp': { 'type': 'datetime', 'format': '%Y-%m-%d' }, - { - 'name': 'amount', + 'amount': { 'type': 'numerical', 'subtype': 'float' }, - { - 'name': 'approved', + 'approved': { 'type': 'boolean' } - ] + } } - ] + } } diff --git a/sdv/metadata.py b/sdv/metadata.py index 
700bd6eba..3937cdcff 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -29,7 +29,8 @@ def _parse_dtypes(data, table_meta): for name, field in table_meta['fields'].items(): field_type = field['type'] if field_type == 'datetime': - data[name] = pd.to_datetime(data[name], format=field['format'], exact=False) + datetime_format = field.get('format') + data[name] = pd.to_datetime(data[name], format=datetime_format, exact=False) elif field_type == 'numerical' and field.get('subtype') == 'integer': data[name] = data[name].dropna().astype(int) elif field_type == 'id' and field.get('subtype', 'integer') == 'integer': @@ -65,8 +66,39 @@ class Metadata: The path where the ``metadata.json`` is located. Defaults to ``None``. """ - def _get_relationships(self): - """Exttract information about child-parent relationships. + _FIELD_TEMPLATES = { + 'i': { + 'type': 'numerical', + 'subtype': 'integer', + }, + 'f': { + 'type': 'numerical', + 'subtype': 'float', + }, + 'O': { + 'type': 'categorical', + }, + 'b': { + 'type': 'boolean', + }, + 'M': { + 'type': 'datetime', + } + } + _DTYPES = { + ('categorical', None): np.object, + ('boolean', None): bool, + ('numerical', None): float, + ('numerical', 'float'): float, + ('numerical', 'integer'): int, + ('datetime', None): np.datetime64, + ('id', None): int, + ('id', 'integer'): int, + ('id', 'string'): str + } + + def _analyze_relationships(self): + """Extract information about child-parent relationships. Creates the following attributes: * ``_child_map``: set of child tables that each table has. 
@@ -75,9 +107,8 @@ def _get_relationships(self): self._child_map = defaultdict(set) self._parent_map = defaultdict(set) - for table_meta in self._metadata['tables'].values(): + for table, table_meta in self._metadata['tables'].items(): if table_meta.get('use', True): - table = table_meta['name'] for field_meta in table_meta['fields'].values(): ref = field_meta.get('ref') if ref: @@ -101,23 +132,31 @@ def _dict_metadata(metadata): """ new_metadata = copy.deepcopy(metadata) tables = new_metadata['tables'] - new_tables = dict() + if isinstance(tables, dict): + new_metadata['tables'] = { + table: meta + for table, meta in tables.items() + if meta.pop('use', True) + } + return new_metadata + new_tables = dict() for table in tables: - new_tables[table['name']] = table + if table.pop('use', True): + new_tables[table.pop('name')] = table - fields = table['fields'] - new_fields = dict() - for field in fields: - new_fields[field['name']] = field + fields = table['fields'] + new_fields = dict() + for field in fields: + new_fields[field.pop('name')] = field - table['fields'] = new_fields + table['fields'] = new_fields new_metadata['tables'] = new_tables return new_metadata - def __init__(self, metadata, root_path=None): + def __init__(self, metadata=None, root_path=None): if isinstance(metadata, str): self.root_path = root_path or os.path.dirname(metadata) with open(metadata) as metadata_file: @@ -125,12 +164,16 @@ def __init__(self, metadata, root_path=None): else: self.root_path = root_path or '.' - self._metadata = self._dict_metadata(metadata) + if metadata is not None: + self._metadata = self._dict_metadata(metadata) + else: + self._metadata = {'tables': {}} + self._hyper_transformers = dict() - self._get_relationships() + self._analyze_relationships() def get_children(self, table_name): - """Get table children. + """Get tables for which the given table is parent. 
Args: table_name (str): @@ -143,7 +186,7 @@ def get_children(self, table_name): return self._child_map[table_name] def get_parents(self, table_name): - """Get table parents. + """Get tables for with the given table is child. Args: table_name (str): @@ -156,7 +199,7 @@ def get_parents(self, table_name): return self._parent_map[table_name] def get_table_meta(self, table_name): - """Get the metadata dict for a table. + """Get the metadata dict for a table. Args: table_name (str): @@ -165,24 +208,124 @@ def get_table_meta(self, table_name): Returns: dict: table metadata + + Raises: + ValueError: + If table does not exist in this metadata. + """ + table = self._metadata['tables'].get(table_name) + if table is None: + raise ValueError('Table "{}" does not exist'.format(table_name)) + + return copy.deepcopy(table) + + def get_tables(self): + """Get the list of table names. + + Returns: + list: + table names. + """ + return list(self._metadata['tables'].keys()) + + def get_fields(self, table_name): + """Get table fields metadata. + + Args: + table_name (str): + Name of the table to get the fields from. + + Returns: + dict: + Mapping of field names and their metadata dicts. + + Raises: + ValueError: + If table does not exist in this metadata. + """ + return self.get_table_meta(table_name)['fields'] + + def get_primary_key(self, table_name): + """Get the primary key name of the indicated table. + + Args: + table_name (str): + Name of table for which to get the primary key field. + + Returns: + str or None: + Primary key field name. ``None`` if the table has no primary key. + + Raises: + ValueError: + If table does not exist in this metadata. """ - return self._metadata['tables'][table_name] + return self.get_table_meta(table_name).get('primary_key') + + def get_foreign_key(self, parent, child): + """Get table foreign key field name. + + Args: + parent (str): + Name of the parent table. + child (str): + Name of the child table. 
+ + Returns: + str or None: + Foreign key field name. + + Raises: + ValueError: + If the relationship does not exist. + """ + primary = self.get_primary_key(parent) + + for name, field in self.get_fields(child).items(): + ref = field.get('ref') + if ref and ref['field'] == primary: + return name + + raise ValueError('{} is not parent of {}'.format(parent, child)) def load_table(self, table_name): """Load table data. Args: table_name (str): - Name of the table that we want to load. + Name of the table to load. Returns: pandas.DataFrame: DataFrame with the contents of the table. + + Raises: + ValueError: + If table does not exist in this metadata. """ LOGGER.info('Loading table %s', table_name) table_meta = self.get_table_meta(table_name) return _load_csv(self.root_path, table_meta) + def load_tables(self, tables=None): + """Get a dictionary with data from multiple tables. + + If a ``tables`` list is given, only load the indicated tables. + Otherwise, load all the tables from this metadata. + + Args: + tables (list): + List of table names. Defaults to ``None``. + + Returns: + dict(str, pandasd.DataFrame): + mapping of table names and their data loaded as ``pandas.DataFrame`` instances. + """ + return { + table_name: self.load_table(table_name) + for table_name in tables or self.get_tables() + } + def _get_dtypes(self, table_name, ids=False): """Get a ``dict`` with the ``dtypes`` for each field of a given table. @@ -198,48 +341,28 @@ def _get_dtypes(self, table_name, ids=False): Raises: ValueError: - If a field has an invalid type or subtype. + If a field has an invalid type or subtype or if the table does not + exist in this metadata. 
""" dtypes = dict() table_meta = self.get_table_meta(table_name) for name, field in table_meta['fields'].items(): field_type = field['type'] - if field_type == 'categorical': - dtypes[name] = np.object - - elif field_type == 'boolean': - dtypes[name] = bool - - elif field_type == 'numerical': - field_subtype = field.get('subtype', 'float') - if field_subtype == 'integer': - dtypes[name] = int - elif field_subtype == 'float': - dtypes[name] = float - else: - raise ValueError('Invalid {} subtype {} - {}'.format( - field_type, field_subtype, name)) - - elif field_type == 'datetime': - dtypes[name] = np.datetime64 - - elif field_type == 'id': - if ids: - if (name != table_meta.get('primary_key')) and not field.get('ref'): - raise ValueError( - 'id field `{}` is neither a primary or a foreign key'.format(name)) - - field_subtype = field.get('subtype', 'integer') - if field_subtype == 'integer': - dtypes[name] = int - elif field_subtype == 'string': - dtypes[name] = str - else: - raise ValueError('Invalid {} subtype: {} - {}'.format( - field_type, field_subtype, name)) - - else: - raise ValueError('Invalid field type: {} - '.format(field_type, name)) + field_subtype = field.get('subtype') + dtype = self._DTYPES.get((field_type, field_subtype)) + if not dtype: + raise ValueError( + 'Invalid type and subtype combination for field {}: ({}, {})'.format( + name, field_type, field_subtype) + ) + + if ids and field_type == 'id': + if (name != table_meta.get('primary_key')) and not field.get('ref'): + raise ValueError( + 'id field `{}` is neither a primary or a foreign key'.format(name)) + + if ids or (field_type != 'id'): + dtypes[name] = dtype return dtypes @@ -346,86 +469,6 @@ def transform(self, table_name, data): fields = list(hyper_transformer.transformers.keys()) return hyper_transformer.transform(data[fields]) - def get_table_names(self): - """Get the list of table names. - - Returns: - list: - table names. 
- """ - return list(self._metadata['tables'].keys()) - - def get_tables(self, tables=None): - """Get a dictionary with data from multiple tables. - - If a ``tables`` list is given, only load the indicated tables. - Otherwise, load all the tables from this metadata. - - Args: - tables (list): - List of table names. Defaults to ``None``. - - Returns: - dict(str, pandasd.DataFrame): - mapping of table names and their data loaded as ``pandas.DataFrame`` instances. - """ - return { - table_name: self.load_table(table_name) - for table_name in tables or self.get_table_names() - } - - def get_fields(self, table_name): - """Get table fields metadata. - - Args: - table_name (str): - Name of the table to get the fields from. - - Returns: - dict: - Mapping of field names and their metadata dicts. - """ - return self.get_table_meta(table_name)['fields'] - - def get_primary_key(self, table_name): - """Get the primary key name of the indicated table. - - Args: - table_name (str): - Name of table for which to get the primary key field. - - Returns: - str or None: - Primary key field name. ``None`` if the table has no primary key. - """ - return self.get_table_meta(table_name).get('primary_key') - - def get_foreign_key(self, parent, child): - """Get table foreign key field name. - - Args: - parent (str): - Name of the parent table. - child (str): - Name of the child table. - - Returns: - str or None: - Foreign key field name. - - Raises: - ValueError: - If the relationship does not exist. - """ - primary = self.get_primary_key(parent) - - for field in self.get_fields(child).values(): - ref = field.get('ref') - if ref and ref['field'] == primary: - return field['name'] - - raise ValueError('{} is not parent of {}'.format(parent, child)) - def reverse_transform(self, table_name, data): """Reverse the transformed data for a given table. 
@@ -446,258 +489,232 @@ def reverse_transform(self, table_name, data): return reversed_data - def _analyze(data, columns=None): - """Get a dictionary with the metadata analyzed from a dictionary. - - Analyze a ``pandas.DataFrame`` to build a ``dict`` with the name of the column, and - their data type and subtype. If ``columns`` are provided, only those columns will be - analyzed. - - Args: - data (pandas.DataFrame): - Table to be analyzed. - columns(list): - List of columns used to specify which fields analyze from the data. - - Returns: - dict: - Generated metadata from a ``pandas.DataFrame``. - - Raises: - ValueError: - A ``ValueError`` is raised when a column from the data analyzed is an unsupported - data type. - """ - fields = dict() - for column in columns or data.columns: - dtype = data[column].dtype - subtype = None - - if dtype.kind == 'i': - type = 'numerical' - subtype = 'integer' + def _check_field(self, table, field, exists=False): + """Validate the existance of the table and existance (or not) of field.""" + table_fields = self.get_fields(table) + if exists and (field not in table_fields): + raise ValueError('Field "{}" does not exist in table "{}"'.format(field, table)) - elif dtype.kind == 'f': - type = 'numerical' - subtype = 'float' + if not exists and (field in table_fields): + raise ValueError('Field "{}" already exists in table "{}"'.format(field, table)) - elif dtype.kind == 'O': - type = 'categorical' - - elif dtype.kind == 'b': - type = 'boolean' - - elif dtype.kind == 'M': - type = 'datetime' - - else: - raise ValueError('Unsupported dtype: {} in column {}'.format(dtype, column)) - - fields[column] = { - 'name': column, - 'type': type - } - - if subtype: - fields[column]['subtype'] = subtype - - return fields - - def _validate_field(self, field): - dtype = field['type'] - if dtype == 'categorical': - pass - - elif dtype == 'id': - pass - - elif dtype == 'numerical': - subtype = field.get('subtype') - if subtype and subtype != 'integer' and 
subtype != 'float': - raise ValueError() - - elif dtype == 'boolean': - pass - - elif dtype == 'datetime': - pass - - else: - raise ValueError('Type {} is not supported.'.format(dtype)) - - def _validate_circular_relationships(self, parent, children=None): - if children is None: - children = self.get_children(parent) - - if parent in children: - raise ValueError('Circular relationship not supported') - - for grandchild in children: - self._validate_circular_relationships(parent, self.get_children(grandchild)) - - def add_field(self, table, field, type, subtype=None, properties=dict()): - """Add a new field into a given table. - - Before add the field validate the ``table`` already exists, the ``field`` does not exist, - ``type`` and ``subtype`` supporteds, and ``properties`` valid properties. - - The error message displayed, when the ``ValueError`` is raised because the ``fields`` - already exists in the table, recommends you to use the ``update_fields`` instead. + def add_field(self, table, field, field_type, field_subtype=None, properties=None): + """Add a new field to the indicated table. Args: table (str): Table name to add the new field, it must exist. field (str): Field name to be added, it must not exist. - type (str): + field_type (str): Data type of field to be added. Required. - subtype (str): - Data subtype of field to be added. Not required. Defaults to ``None``. + field_subtype (str): + Data subtype of field to be added. Optional. + Defaults to ``None``. properties (dict): - Extra properties of field like: ref, format, min, max, etc. Not required. - Defaults to ``dict()``. + Extra properties of field like: ref, format, min, max, etc. Optional. + Defaults to ``None``. Raises: ValueError: - A ``ValueError`` is raised when the ``table`` doesn't exists or the ``field`` - exists in the table. + If the table does not exist or it already contains the field. 
""" - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) - - fields = self.get_fields(table) - if field in fields: - raise ValueError( - 'Table {}, field {} already exists. Use "update_field()" to modify it.' - .format(table, field) - ) + self._check_field(table, field, exists=False) field_details = { - 'name': field, - 'type': type + 'type': field_type } - if subtype: - field_details['subtype'] = subtype + if field_subtype: + field_details['subtype'] = field_subtype if properties: - for property_name, property_value in properties.items(): - field_details[property_name] = property_value + field_details.update(properties) - self._validate_field(field_details) + self._metadata['tables'][table]['fields'][field] = field_details + + @staticmethod + def _get_key_subtype(field_meta): + """Get the appropriate key subtype.""" + field_type = field_meta['type'] + if field_type == 'categorical': + field_subtype = 'string' + elif field_type in ('numerical', 'id'): + field_subtype = field_meta['subtype'] + if field_subtype not in ('integer', 'string'): + raise ValueError( + 'Invalid field "subtype" for key field: "{}"'.format(field_subtype) + ) + else: + raise ValueError( + 'Invalid field "type" for key field: "{}"'.format(field_type) + ) - fields[field] = field_details + return field_subtype - def add_primary_key(self, table, field): - """Add a primary key into a given table. + def set_primary_key(self, table, field): + """Set the primary key field of the indicated table. - First, assert that the ``table`` exists and the ``field`` does not. - Then, assert that the ``table`` doesn't have primary key. Finally, add primary key. + The field must exist and either be an integer or categorical field. Args: table (str): - Table name to add the new primary key, it must exist. + Name of the table where the primary key will be set. field (str): - Field name to be the new primary key, it must not exist. 
+ Field to be used as the new primary key. Raises: ValueError: - A ``ValueError`` is raised when the table not exist, - the field already exist or the primary key already exist. + If the table or the field do not exist or if the field has an + invalid type or subtype. """ - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + self._check_field(table, field, exists=True) - if field in self.get_fields(table).keys(): - raise ValueError('Table {}, field {} already exists.'.format(table, field)) + field_meta = self.get_fields(table).get(field) + field_subtype = self._get_key_subtype(field_meta) - if self.get_primary_key(table): - raise ValueError('Table {} already have primary key.'.format(table)) + table_meta = self._metadata['tables'][table] + table_meta['fields'][field] = { + 'type': 'id', + 'subtype': field_subtype + } + table_meta['primary_key'] = field - self.get_table_meta(table)['primary_key'] = field - self.add_field(table, field, 'id', None, None) + def _validate_circular_relationships(self, parent, children=None): + """Validate that there is no circular relatioship in the metadata.""" + if children is None: + children = self.get_children(parent) - def add_relationship(self, table, parent, foreign_key=None): - """Add a new relationship between a 2 tables. + if parent in children: + raise ValueError('Circular relationship found for table "{}"'.format(parent)) - By a given ``table`` and ``parent`` add a new relationship using ``foreign_key`` - to reference the parent field, when ``foreign_key`` is ``None`` use the primary key - field name as foreign key. + for child in children: + self._validate_circular_relationships(parent, self.get_children(child)) - First, assert that the ``table`` and ``parent`` exists. - Then, assert if already exists a relationship between both tables. - If not, add their relationships, in ``table`` add a new parent ``parent`` - and in ``parent`` add a new children ``table``. 
+ def add_relationship(self, parent, child, foreign_key=None): + """Add a new relationship between the parent and child tables. + + The relationship is created by adding a reference (``ref``) on the ``foreign_key`` + field of the ``child`` table pointing at the ``parent`` primary key. Args: - table (str): - Table name to add a new relationship with a parent table. parent (str): - Table name to add a new relationship with a children table. + Name of the parent table. + child (str): + Name of the child table. foreign_key (str): - Field name from the parent table to create the reference in the children table. - Defaults to ``None``. + Field in the child table through which the relationship is created. + If ``None``, use the parent primary key name. Raises: ValueError: - A ``ValueError`` is raised when ``table`` or ``parent`` don't exist, - the relationship already exists, ``parent`` is child of ``table`` of - ``parent`` table does not have primary key. + If any of the following happens: + * The parent table does not exist. + * The child table does not exist. + * The parent table does not have a primary key. + * The foreign_key field already exists in the child table. + * The child table already has a parent. + * The new relationship closes a relationship circle. 
""" - if table not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(table)) + # Validate table and field names + primary_key = self.get_primary_key(parent) + if not primary_key: + raise ValueError('Parent table "{}" does not have a primary key'.format(parent)) - if parent not in self.get_table_names(): - raise ValueError('Table "{}" doesn\'t exists.'.format(parent)) + if foreign_key is None: + foreign_key = primary_key - if parent in self.get_parents(table): - raise ValueError('Table {} is the parent table of {}.'.format(parent, table)) + # Validate relationships + if self.get_parents(child): + raise ValueError('Table "{}" already has a parent'.format(child)) - if parent in self.get_children(table): - raise ValueError('Table {} is the children table of {}.'.format(parent, table)) + grandchildren = self.get_children(child) + if grandchildren: + self._validate_circular_relationships(parent, grandchildren) - primary_key = self.get_primary_key(parent) - if not primary_key: - raise ValueError('Parent table {} have not primary key.'.format(primary_key)) + # Copy primary key details over to the foreign key + foreign_key_details = copy.deepcopy(self.get_fields(parent)[primary_key]) + foreign_key_details['ref'] = { + 'table': parent, + 'field': primary_key + } - self._validate_circular_relationships(parent, self.get_children(table)) + # Make sure that key subtypes are the same + foreign_meta = self.get_fields(child).get(foreign_key) + if foreign_meta: + foreign_subtype = self._get_key_subtype(foreign_meta) + if foreign_subtype != foreign_key_details['subtype']: + raise ValueError('Primary and Foreign key subtypes mismatch') - properties = {'ref': {'field': primary_key, 'table': parent}} - self.add_field(table, foreign_key or primary_key, 'id', None, properties) + self._metadata['tables'][child]['fields'][foreign_key] = foreign_key_details - self._get_relationships() + # Re-analyze the relationships + self._analyze_relationships() - def 
_add_table_load_data(self, data): - data = data if os.path.exists(data) else os.path.join(self.root_path, data) - return pd.read_csv(data) + def _get_field_details(self, data, fields): + """Get or build all the fields metadata. - def add_table(self, name, primary_key=None, fields=None, data=None, parent=None, - foreign_key=None): - """Add a new table to the metadata. + Analyze a ``pandas.DataFrame`` to build a ``dict`` with the name of the column, and + their data type and subtype. If ``columns`` are provided, only those columns will be + analyzed. - First, validate that the ``name`` table already exists. + Args: + data (pandas.DataFrame): + Table to be analyzed. + fields (set): + Set of field names or field specifications. - When ``fields`` is a ``dict``, ignore ``data`` and validate those fields. - When ``fields`` is a ``list`` and ``data`` is ``None``, raise a ``ValueError``. - When ``fields`` is a ``list`` and ``data`` is not ``None``, analyze data columns in list. - When ``fields`` is ``None`` and ``data`` is not ``None``, analyze all columns. + Returns: + dict: + Dict of valid fields. - Use the ``fields`` to create the table and add primary key or relationship if needed. + Raises: + TypeError: + If a field specification is not a str or a dict. + ValueError: + If a column from the data analyzed is an unsupported data type or + """ + fields_metadata = dict() + for field in fields: + dtype = data[field].dtype + field_template = self._FIELD_TEMPLATES.get(dtype.kind) + if not field_template: + raise ValueError('Unsupported dtype {} in column {}'.format(dtype, field)) + + field_details = copy.deepcopy(field_template) + fields_metadata[field] = field_details + + return fields_metadata + + def add_table(self, name, data=None, fields=None, fields_metadata=None, + primary_key=None, parent=None, foreign_key=None): + """Add a new table to this metadata. 
+ + ``fields`` list can be a mixture of field names, which will be build automatically + from the data, or dictionaries specifying the field details. If a field needs to be + analyzed, data has to be also passed. + + If ``parent`` is given, a relationship will be established between this table + and the specified parent. Args: name (str): - Table name to be created, it must not exists. - primary_key (str): - Field name to add as primary key, it must not exists. Defaults to ``None``. - fields (dict or list): - If it's a ``dict``, data is ignored. - If it's a ``list``, indicate which columns will be analized. - Defaults to ``None``. + Name of the new table. data (str or pandas.DataFrame): Table to be analyzed or path to the csv file. If it's a relative path, use ``root_path`` to find the file. - Only used if fields is a ``list`` or ``None``. + Only used if fields is not ``None``. + Defaults to ``None``. + fields (list): + List of field names to build. If ``None`` is given, all the fields + found in the data will be used. + Defaults to ``None``. + fields_metadata (dict): + Metadata to be used when creating fields. This will overwrite the + metadata built from the fields found in data. Defaults to ``None``. + primary_key (str): + Field name to add as primary key, it must not exists. Defaults to ``None``. parent (str): Table name to refere a foreign key field. Defaults to ``None``. foreign_key (str): @@ -705,45 +722,61 @@ def add_table(self, name, primary_key=None, fields=None, data=None, parent=None, Raises: ValueError: - A ``ValueError`` is raised when the table ``name`` already exists - or ``fields`` is a ``list`` and ``data`` is ``None`` + If the table ``name`` already exists or ``data`` is not passed and + fields need to be built from it. 
""" - if name in self.get_table_names(): + if name in self.get_tables(): raise ValueError('Table "{}" already exists.'.format(name)) - if isinstance(fields, dict): - for field_key, field_value in fields.items(): - field_value['name'] = field_key - self._validate_field(field_value) + if data is not None: + if isinstance(data, str): + if not os.path.isabs(data): + data = os.path.join(self.root_path, data) - if isinstance(fields, list) and data is None: - raise ValueError() + data = pd.read_csv(data) - if isinstance(fields, list) and data is not None: - if isinstance(data, str): - data = self._add_table_load_data(data) + fields = set(fields or data.columns) + if fields_metadata: + fields = fields - set(fields_metadata.keys()) + else: + fields_metadata = dict() - fields = Metadata._analyze(data, columns=fields) + fields_metadata.update(self._get_field_details(data, fields)) - if not fields and data is not None: - if isinstance(data, str): - data = self._add_table_load_data(data) + elif fields_metadata is None: + fields_metadata = dict() - fields = Metadata._analyze(data) + self._metadata['tables'][name] = { + 'fields': fields_metadata + } - table = {'name': name, 'fields': fields or dict()} - self._metadata['tables'][name] = table + try: + if primary_key: + self.set_primary_key(name, primary_key) - if primary_key: - self.add_primary_key(name, primary_key) - # table['primary_key'] = primary_key + if parent: + self.add_relationship(parent, name, foreign_key) - # Add relationship - if parent: - self.add_relationship(name, parent, foreign_key) + except ValueError: + # Cleanup + del self._metadata['tables'][name] + raise def to_dict(self): + """Get a dict representation of this metadata. + + Returns: + dict: + dict representation of this metadata. + """ return copy.deepcopy(self._metadata) - def to_json(self): - return json.dumps(self._metadata, indent=4) + def to_json(self, path): + """Dump this metadata into a JSON file. 
+ + Args: + path (str): + Path of the JSON file where this metadata will be stored. + """ + with open(path, 'w') as out_file: + json.dump(self._metadata, out_file, indent=4) diff --git a/sdv/modeler.py b/sdv/modeler.py index 88c2f912e..52cd9d883 100644 --- a/sdv/modeler.py +++ b/sdv/modeler.py @@ -238,7 +238,7 @@ def model_database(self, tables=None): If not given, the tables will be loaded using the dataset metadata specification. """ - for table_name in self.metadata.get_table_names(): + for table_name in self.metadata.get_tables(): if not self.metadata.get_parents(table_name): self.cpa(table_name, tables) diff --git a/sdv/sampler.py b/sdv/sampler.py index af875358e..1f07c6491 100644 --- a/sdv/sampler.py +++ b/sdv/sampler.py @@ -14,6 +14,11 @@ class Sampler: models (dict): Table models. """ + metadata = None + models = None + primary_key = None + remaining_primary_key = None + def __init__(self, metadata, models): self.metadata = metadata self.models = models @@ -453,7 +458,7 @@ def sample_all(self, num_rows=5, reset_primary_keys=False): self._reset_primary_keys_generators() sampled_data = dict() - for table in self.metadata.get_table_names(): + for table in self.metadata.get_tables(): if not self.metadata.get_parents(table): sampled_data.update(self.sample(table, num_rows)) diff --git a/sdv/sdv.py b/sdv/sdv.py index db010ff58..2718ffbeb 100644 --- a/sdv/sdv.py +++ b/sdv/sdv.py @@ -43,7 +43,7 @@ def __init__(self, model=DEFAULT_MODEL, model_kwargs=None): def _validate_dataset_structure(self): """Make sure that all the tables have at most one parent.""" - for table in self.metadata.get_table_names(): + for table in self.metadata.get_tables(): if len(self.metadata.get_parents(table)) > 1: raise ValueError('Some tables have multiple parents, which is not supported yet.') diff --git a/tests/integration/test_metadata.py b/tests/integration/test_metadata.py index d8055cdae..83b8b1f85 100644 --- a/tests/integration/test_metadata.py +++ 
b/tests/integration/test_metadata.py @@ -1,122 +1,57 @@ -import pandas as pd -import pytest +from sdv import Metadata, load_demo -from sdv import Metadata +def test_build_demo_metadata_from_tables(): + """Build metadata from the demo tables. -def get_metadata(): - return Metadata({'tables': dict()}) + Then compare the built metadata with the demo one + to make sure that they are the same. + """ + metadata, tables = load_demo() - -def test_add_fields_and_primary_key(): - metadata = get_metadata() - - metadata.add_table('a_table') - - metadata.add_field('a_table', 'categoricals', 'categorical') - metadata.add_field('a_table', 'integers', 'numerical', 'integer', {'min': 0, 'max': 10}) - metadata.add_field('a_table', 'floats', 'numerical', 'float') - metadata.add_field('a_table', 'booleans', 'boolean') - - metadata.add_primary_key('a_table', 'index') - - expected_metadata = { - 'tables': { - 'a_table': { - 'name': 'a_table', - 'primary_key': 'index', - 'fields': { - 'categoricals': { - 'name': 'categoricals', - 'type': 'categorical' - }, - 'integers': { - 'name': 'integers', - 'type': 'numerical', - 'subtype': 'integer', - 'min': 0, - 'max': 10 - }, - 'floats': { - 'name': 'floats', - 'type': 'numerical', - 'subtype': 'float' - }, - 'booleans': { - 'name': 'booleans', - 'type': 'boolean' - }, - 'index': { - 'name': 'index', - 'type': 'id' - } - } - } - } - } - - assert metadata._metadata == expected_metadata - - -def test_add_table_analyze_all(): - metadata = get_metadata() - - data = pd.DataFrame({ - 'a_field': [0, 1, 2], - 'b_field': ['a', 'b', 'c'], - 'c_field': [True, False, False], - 'd_field': [0., 1., 2.] 
- }) - - metadata.add_table('a_table', data=data) - - expected_metadata = { - 'tables': { - 'a_table': { - 'name': 'a_table', - 'fields': { - 'a_field': { - 'name': 'a_field', - 'type': 'numerical', - 'subtype': 'integer' - }, - 'b_field': { - 'name': 'b_field', - 'type': 'categorical' - }, - 'c_field': { - 'name': 'c_field', - 'type': 'boolean' - }, - 'd_field': { - 'name': 'd_field', - 'type': 'numerical', - 'subtype': 'float' - } - } - } + new_meta = Metadata() + new_meta.add_table('users', data=tables['users'], primary_key='user_id') + new_meta.add_table('sessions', data=tables['sessions'], primary_key='session_id', + parent='users', foreign_key='user_id') + transactions_fields = { + 'timestamp': { + 'type': 'datetime', + 'format': '%Y-%m-%d' } } - - assert metadata._metadata == expected_metadata - - -def test_add_relationships(): - metadata = get_metadata() - - metadata.add_table('foo', primary_key='index_foo') - metadata.add_table('bar', primary_key='index_bar', parent='foo') - - assert metadata.get_children('foo') == set(['bar']) - assert metadata.get_parents('bar') == set(['foo']) - - -def test_cirtular_dependence_validation(): - metadata = get_metadata() - - metadata.add_table('foo', primary_key='index_foo') - metadata.add_table('bar', primary_key='index_bar', parent='foo') - metadata.add_table('tar', primary_key='index_tar', parent='bar') - - with pytest.raises(ValueError): - metadata.add_relationship('foo', 'tar') + new_meta.add_table('transactions', tables['transactions'], + fields_metadata=transactions_fields, + primary_key='transaction_id', parent='sessions') + + assert metadata == new_meta.to_dict() + + +def test_build_demo_metadata_without_tables(): + metadata = Metadata() + + metadata.add_table('users') + metadata.add_field('users', 'user_id', 'id', 'integer') + metadata.add_field('users', 'country', 'categorical') + metadata.add_field('users', 'gender', 'categorical') + metadata.add_field('users', 'age', 'numerical', 'integer') + 
metadata.set_primary_key('users', 'user_id') + + metadata.add_table('sessions') + metadata.add_field('sessions', 'session_id', 'id', 'integer') + metadata.add_field('sessions', 'user_id', 'id', 'integer') + metadata.add_field('sessions', 'device', 'categorical') + metadata.add_field('sessions', 'os', 'categorical') + metadata.set_primary_key('sessions', 'session_id') + metadata.add_relationship('users', 'sessions') + + metadata.add_table('transactions') + metadata.add_field('transactions', 'transaction_id', 'id', 'integer') + metadata.add_field('transactions', 'session_id', 'id', 'integer') + metadata.add_field('transactions', 'timestamp', 'datetime', properties={'format': '%Y-%m-%d'}) + metadata.add_field('transactions', 'amount', 'numerical', 'float') + metadata.add_field('transactions', 'approved', 'boolean') + metadata.set_primary_key('transactions', 'transaction_id') + metadata.add_relationship('sessions', 'transactions') + + demo_metadata = load_demo()[0] + assert demo_metadata == metadata.to_dict() diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 19b9cbefa..d349af79d 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from sdv.metadata import Metadata, _parse_dtypes, _read_csv_dtypes +from sdv.metadata import Metadata, _load_csv, _parse_dtypes, _read_csv_dtypes def test__read_csv_dtypes(): @@ -32,13 +32,10 @@ def test__read_csv_dtypes(): } } } - result = _read_csv_dtypes(table_meta) # Asserts - expected = {'a_field': str, 'd_field': str} - - assert result == expected + assert result == {'a_field': str, 'd_field': str} def test__parse_dtypes(): @@ -69,7 +66,6 @@ def test__parse_dtypes(): } } } - result = _parse_dtypes(data, table_meta) # Asserts @@ -79,16 +75,35 @@ def test__parse_dtypes(): 'c_field': [1, 2], 'd_field': ['other', 'data'] }) - pd.testing.assert_frame_equal(result, expected) +@patch('sdv.metadata._parse_dtypes') +@patch('sdv.metadata.pd.read_csv') 
+@patch('sdv.metadata._read_csv_dtypes') +def test__load_csv(rcdtypes_mock, read_csv_mock, pdtypes_mock): + # Run + table_meta = { + 'path': 'filename.csv', + 'other': 'stuff' + } + result = _load_csv('a/path', table_meta) + + # Asserts + assert result == pdtypes_mock.return_value + rcdtypes_mock.assert_called_once_with(table_meta) + dtypes = rcdtypes_mock.return_value + read_csv_mock.assert_called_once_with('a/path/filename.csv', dtype=dtypes) + pdtypes_mock.assert_called_once_with(read_csv_mock.return_value, table_meta) + + class TestMetadata(TestCase): """Test Metadata class.""" - def test__get_relationships(self): + def test__analyze_relationships(self): """Test get relationships""" # Setup + metadata = Mock(spec=Metadata) _metadata = { 'tables': { 'test': { @@ -113,61 +128,102 @@ def test__get_relationships(self): } } } - - # Run - metadata = Mock() metadata._metadata = _metadata - Metadata._get_relationships(metadata) + # Run + Metadata._analyze_relationships(metadata) # Asserts - expected__child_map = {'table_ref': {'test'}} - expected__parent_map = {'test': {'table_ref'}} + assert metadata._child_map == {'table_ref': {'test'}} + assert metadata._parent_map == {'test': {'table_ref'}} - assert metadata._child_map == expected__child_map - assert metadata._parent_map == expected__parent_map - - def test__dict_metadata(self): + def test__dict_metadata_list(self): """Test dict_metadata""" # Run metadata = { - 'tables': [{ - 'name': 'test', - 'use': True, - 'fields': [{ - 'ref': {'table': 'table_ref', 'field': 'field_ref'}, - 'name': 'test_field' - }] - }] + 'tables': [ + { + 'name': 'test', + 'fields': [ + { + 'ref': { + 'table': 'table_ref', + 'field': 'field_ref' + }, + 'name': 'test_field' + } + ] + }, + { + 'name': 'other', + 'use': False, + } + ] } - result = Metadata._dict_metadata(metadata) # Asserts expected = { 'tables': { 'test': { - 'use': True, - 'name': 'test', 'fields': { 'test_field': { - 'ref': {'table': 'table_ref', 'field': 'field_ref'}, - 
'name': 'test_field' + 'ref': { + 'table': 'table_ref', + 'field': 'field_ref' + } } } } } } + assert result == expected + def test__dict_metadata_dict(self): + """Test dict_metadata""" + # Run + metadata = { + 'tables': { + 'test': { + 'fields': { + 'test_field': { + 'ref': { + 'table': 'table_ref', + 'field': 'field_ref' + } + } + } + }, + 'other': { + 'use': False, + } + } + } + result = Metadata._dict_metadata(metadata) + + # Asserts + expected = { + 'tables': { + 'test': { + 'fields': { + 'test_field': { + 'ref': { + 'table': 'table_ref', + 'field': 'field_ref' + } + } + } + } + } + } assert result == expected - @patch('sdv.metadata.Metadata._get_relationships') + @patch('sdv.metadata.Metadata._analyze_relationships') @patch('sdv.metadata.Metadata._dict_metadata') def test___init__default_metadata_dict(self, mock_meta, mock_relationships): """Test create Metadata instance default with a dict""" # Run - metadata_dict = {'some': 'meta'} - metadata = Metadata(metadata_dict) + metadata = Metadata({'some': 'meta'}) # Asserts mock_meta.assert_called_once_with({'some': 'meta'}) @@ -177,75 +233,66 @@ def test___init__default_metadata_dict(self, mock_meta, mock_relationships): def test_get_children(self): """Test get children""" - # Run - metadata = Mock() + # Setup + metadata = Mock(spec=Metadata) metadata._child_map = { 'test': 'child_table' } - table_name = 'test' - - result = Metadata.get_children(metadata, table_name) + # Run + result = Metadata.get_children(metadata, 'test') # Asserts assert result == 'child_table' def test_get_parents(self): """Test get parents""" - # Run - metadata = Mock() + # Setup + metadata = Mock(spec=Metadata) metadata._parent_map = { 'test': 'parent_table' } - table_name = 'test' - - result = Metadata.get_parents(metadata, table_name) + # Run + result = Metadata.get_parents(metadata, 'test') # Asserts assert result == 'parent_table' def test_get_table_meta(self): """Test get table meta""" - # Run - metadata = Mock() + # Setup + metadata 
= Mock(spec=Metadata) metadata._metadata = { 'tables': { 'test': {'some': 'data'} } } - table_name = 'test' - - result = Metadata.get_table_meta(metadata, table_name) + # Run + result = Metadata.get_table_meta(metadata, 'test') # Asserts - expected = {'some': 'data'} - - assert result == expected + assert result == {'some': 'data'} @patch('sdv.metadata._load_csv') def test_load_table(self, mock_load_csv): """Test load table""" # Setup - root_path = '.' - table_meta = {'some': 'data'} - - # Run - metadata = Mock() - metadata.root_path = root_path - metadata.get_table_meta.return_value = table_meta + metadata = Mock(spec=Metadata) + metadata.root_path = 'a/path' + metadata.get_table_meta.return_value = {'some': 'data'} mock_load_csv.return_value = 'data' - table_name = 'test' - - result = Metadata.load_table(metadata, table_name) + # Run + result = Metadata.load_table(metadata, 'test') # Asserts - metadata.get_table_meta.assert_called_once_with('test') - mock_load_csv.assert_called_once_with('.', {'some': 'data'}) assert result == 'data' + metadata.get_table_meta.assert_called_once_with('test') + mock_load_csv.assert_called_once_with('a/path', {'some': 'data'}) + def test__get_dtypes_with_ids(self): """Test get data types including ids.""" # Setup @@ -260,11 +307,11 @@ def test__get_dtypes_with_ids(self): }, 'primary_key': 'item 0' } - - # Run metadata = Mock(spec=Metadata) metadata.get_table_meta.return_value = table_meta + metadata._DTYPES = Metadata._DTYPES + # Run result = Metadata._get_dtypes(metadata, 'test', ids=True) # Asserts @@ -276,7 +323,6 @@ def test__get_dtypes_with_ids(self): 'item 4': bool, 'item 5': np.datetime64, } - assert result == expected def test__get_dtypes_no_ids(self): @@ -292,11 +338,11 @@ def test__get_dtypes_no_ids(self): 'item 5': {'type': 'datetime'}, } } - - # Run metadata = Mock(spec=Metadata) metadata.get_table_meta.return_value = table_meta + metadata._DTYPES = Metadata._DTYPES + # Run result = Metadata._get_dtypes(metadata, 'test') 
# Asserts @@ -307,7 +353,6 @@ def test__get_dtypes_no_ids(self): 'item 4': bool, 'item 5': np.datetime64, } - assert result == expected def test__get_dtypes_error_invalid_type(self): @@ -318,11 +363,11 @@ def test__get_dtypes_error_invalid_type(self): 'item': {'type': 'unknown'} } } - - # Run and asserts metadata = Mock(spec=Metadata) metadata.get_table_meta.return_value = table_meta + metadata._DTYPES = Metadata._DTYPES + # Run with pytest.raises(ValueError): Metadata._get_dtypes(metadata, 'test') @@ -334,11 +379,11 @@ def test__get_dtypes_error_id(self): 'item': {'type': 'id'} } } - - # Run and asserts metadata = Mock(spec=Metadata) metadata.get_table_meta.return_value = table_meta + metadata._DTYPES = Metadata._DTYPES + # Run with pytest.raises(ValueError): Metadata._get_dtypes(metadata, 'test', ids=True) @@ -350,11 +395,11 @@ def test__get_dtypes_error_subtype_numerical(self): 'item': {'type': 'numerical', 'subtype': 'boolean'} } } - - # Run and asserts metadata = Mock(spec=Metadata) metadata.get_table_meta.return_value = table_meta + metadata._DTYPES = Metadata._DTYPES + # Run with pytest.raises(ValueError): Metadata._get_dtypes(metadata, 'test') @@ -366,11 +411,11 @@ def test__get_dtypes_error_subtype_id(self): 'item': {'type': 'id', 'subtype': 'boolean'} } } - - # Run and asserts metadata = Mock(spec=Metadata) metadata.get_table_meta.return_value = table_meta + metadata._DTYPES = Metadata._DTYPES + # Run with pytest.raises(ValueError): Metadata._get_dtypes(metadata, 'test', ids=True) @@ -390,26 +435,21 @@ def test__get_pii_fields(self): } } } - - # Run metadata = Mock(spec=Metadata) metadata.get_table_meta.return_value = table_meta - table_name = 'test' - - result = Metadata._get_pii_fields(metadata, table_name) + # Run + result = Metadata._get_pii_fields(metadata, 'test') # Asserts - expected = {'foo': 'email'} - - assert result == expected + assert result == {'foo': 'email'} @patch('sdv.metadata.transformers.DatetimeTransformer') 
@patch('sdv.metadata.transformers.BooleanTransformer') @patch('sdv.metadata.transformers.CategoricalTransformer') @patch('sdv.metadata.transformers.NumericalTransformer') - def test__get_transformers_no_error( - self, numerical_mock, categorical_mock, boolean_mock, datetime_mock): + def test__get_transformers_no_error(self, numerical_mock, categorical_mock, + boolean_mock, datetime_mock): """Test get transformers dict for each data type.""" # Setup numerical_mock.return_value = 'NumericalTransformer' @@ -425,11 +465,9 @@ def test__get_transformers_no_error( 'boolean': bool, 'datetime': np.datetime64 } - pii_fields = { 'categorical': 'email' } - result = Metadata._get_transformers(dtypes, pii_fields) # Asserts @@ -446,6 +484,7 @@ def test__get_transformers_no_error( assert len(numerical_mock.call_args_list) == len(expected_numerical_calls) for item in numerical_mock.call_args_list: assert item in expected_numerical_calls + assert categorical_mock.call_args == call(anonymize='email') assert boolean_mock.call_args == call() assert datetime_mock.call_args == call() @@ -456,35 +495,33 @@ def test__get_transformers_raise_valueerror(self): dtypes = { 'string': str } - with pytest.raises(ValueError): Metadata._get_transformers(dtypes, None) @patch('sdv.metadata.HyperTransformer') def test__load_hyper_transformer(self, mock_ht): """Test load HyperTransformer""" - # Run + # Setup metadata = Mock(spec=Metadata) metadata._get_dtypes.return_value = {'meta': 'dtypes'} metadata._get_pii_fields.return_value = {'meta': 'pii_fields'} metadata._get_transformers.return_value = {'meta': 'transformers'} mock_ht.return_value = 'hypertransformer' - table_name = 'test' - - result = Metadata._load_hyper_transformer(metadata, table_name) + # Run + result = Metadata._load_hyper_transformer(metadata, 'test') # Asserts + assert result == 'hypertransformer' metadata._get_dtypes.assert_called_once_with('test') metadata._get_pii_fields.assert_called_once_with('test') - 
metadata._get_transformers.assert_called_once_with( - {'meta': 'dtypes'}, {'meta': 'pii_fields'}) - + {'meta': 'dtypes'}, + {'meta': 'pii_fields'} + ) mock_ht.assert_called_once_with(transformers={'meta': 'transformers'}) - assert result == 'hypertransformer' - def test_get_table_names(self): + def test_get_tables(self): """Test get table names""" # Setup _metadata = { @@ -494,19 +531,16 @@ def test_get_table_names(self): 'table 3': None } } - - # Run - metadata = Mock() + metadata = Mock(spec=Metadata) metadata._metadata = _metadata - result = Metadata.get_table_names(metadata) + # Run + result = Metadata.get_tables(metadata) # Asserts - expected = ['table 1', 'table 2', 'table 3'] - - assert sorted(result) == sorted(expected) + assert sorted(result) == ['table 1', 'table 2', 'table 3'] - def test_get_tables(self): + def test_load_tables(self): """Test get tables""" # Setup table_names = ['foo', 'bar', 'tar'] @@ -515,15 +549,13 @@ def test_get_tables(self): pd.DataFrame({'bar': [3, 4]}), pd.DataFrame({'tar': [5, 6]}) ] - - # Run metadata = Mock(spec=Metadata) - metadata.get_table_names.side_effect = table_names + metadata.get_tables.side_effect = table_names metadata.load_table.side_effect = table_data + # Run tables = ['table 1', 'table 2', 'table 3'] - - result = Metadata.get_tables(metadata, tables=tables) + result = Metadata.load_tables(metadata, tables=tables) # Asserts expected = { @@ -531,7 +563,6 @@ def test_get_tables(self): 'table 2': pd.DataFrame({'bar': [3, 4]}), 'table 3': pd.DataFrame({'tar': [5, 6]}) } - assert result.keys() == expected.keys() for k, v in result.items(): @@ -546,91 +577,67 @@ def test_get_fields(self): 'b_field': 'other data' } } - - # Run - metadata = Mock() + metadata = Mock(spec=Metadata) metadata.get_table_meta.return_value = table_meta - table_name = 'test' - - result = Metadata.get_fields(metadata, table_name) + # Run + result = Metadata.get_fields(metadata, 'test') # Asserts expected = {'a_field': 'some data', 'b_field': 
'other data'} + assert result == expected metadata.get_table_meta.assert_called_once_with('test') - assert result == expected - def test_get_primary_key(self): """Test get primary key""" # Setup table_meta = { - 'primary_key': 'pk' + 'primary_key': 'a_primary_key' } - - # Run - metadata = Mock() + metadata = Mock(spec=Metadata) metadata.get_table_meta.return_value = table_meta - table_name = 'test' - - result = Metadata.get_primary_key(metadata, table_name) + # Run + result = Metadata.get_primary_key(metadata, 'test') # Asserts - expected = 'pk' - + assert result == 'a_primary_key' metadata.get_table_meta.assert_called_once_with('test') - assert result == expected - def test_get_foreign_key(self): """Test get foreign key""" # Setup - primary_key = 'pk' + primary_key = 'a_primary_key' fields = { 'a_field': { 'ref': { - 'field': 'pk' + 'field': 'a_primary_key' }, 'name': 'a_field' }, 'p_field': { 'ref': { - 'field': 'kk' + 'field': 'another_key_field' }, 'name': 'p_field' } } - - # Run - metadata = Mock() + metadata = Mock(spec=Metadata) metadata.get_primary_key.return_value = primary_key metadata.get_fields.return_value = fields - parent = 'parent_table' - child = 'child_table' - - result = Metadata.get_foreign_key(metadata, parent, child) + # Run + result = Metadata.get_foreign_key(metadata, 'parent', 'child') # Asserts - expected = 'a_field' - - metadata.get_primary_key.assert_called_once_with('parent_table') - metadata.get_fields.assert_called_once_with('child_table') - - assert result == expected + assert result == 'a_field' + metadata.get_primary_key.assert_called_once_with('parent') + metadata.get_fields.assert_called_once_with('child') def test_reverse_transform(self): """Test reverse transform""" # Setup - data_types = { - 'item 1': int, - 'item 2': float, - 'item 3': np.object, - 'item 4': bool, - } - ht_mock = Mock() ht_mock.reverse_transform.return_value = { 'item 1': pd.Series([1.0, 2.0, None, 4.0, 5.0]), @@ -639,19 +646,20 @@ def 
test_reverse_transform(self): 'item 4': pd.Series([True, False, None, False, True]) } - _hyper_transformers = { + metadata = Mock(spec=Metadata) + metadata._hyper_transformers = { 'test': ht_mock } + metadata._get_dtypes.return_value = { + 'item 1': int, + 'item 2': float, + 'item 3': np.object, + 'item 4': bool, + } # Run - metadata = Mock() - metadata._hyper_transformers = _hyper_transformers - metadata._get_dtypes.return_value = data_types - - table_name = 'test' data = pd.DataFrame({'foo': [0, 1]}) - - Metadata.reverse_transform(metadata, table_name, data) + Metadata.reverse_transform(metadata, 'test', data) # Asserts expected_call = pd.DataFrame({'foo': [0, 1]}) @@ -660,339 +668,339 @@ def test_reverse_transform(self): expected_call ) - def test_add_table_already_exist(self): - """Try to add a new table that already exist""" - # Setup - table_names = ['a_table', 'b_table'] + # def test_add_table_already_exist(self): + # """Try to add a new table that already exist""" + # # Setup + # table_names = ['a_table', 'b_table'] - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = table_names + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = table_names - with pytest.raises(ValueError): - Metadata.add_table(metadata, 'a_table') + # with pytest.raises(ValueError): + # Metadata.add_table(metadata, 'a_table') - def test_add_table_only_name(self): - """Add table with only the name""" - # Setup - table_names = ['a_table', 'b_table'] + # def test_add_table_only_name(self): + # """Add table with only the name""" + # # Setup + # table_names = ['a_table', 'b_table'] - # Run - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = table_names - metadata._metadata = {'tables': dict()} + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = table_names + # metadata._metadata = {'tables': dict()} - Metadata.add_table(metadata, 'x_table') + # 
Metadata.add_table(metadata, 'x_table') - # Asserts - expected_table_meta = { - 'name': 'x_table', - 'fields': dict() - } + # # Asserts + # expected_table_meta = { + # 'name': 'x_table', + # 'fields': dict() + # } - assert metadata._metadata['tables']['x_table'] == expected_table_meta + # assert metadata._metadata['tables']['x_table'] == expected_table_meta - metadata.add_primary_key.call_count == 0 - metadata.add_relationship.call_count == 0 + # metadata.add_primary_key.call_count == 0 + # metadata.add_relationship.call_count == 0 - def test_add_table_with_primary_key(self): - """Add table with primary key""" - # Setup - table_names = ['a_table', 'b_table'] + # def test_add_table_with_primary_key(self): + # """Add table with primary key""" + # # Setup + # table_names = ['a_table', 'b_table'] - # Run - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = table_names - metadata._metadata = {'tables': dict()} + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = table_names + # metadata._metadata = {'tables': dict()} - Metadata.add_table(metadata, 'x_table', primary_key='id') + # Metadata.add_table(metadata, 'x_table', primary_key='id') - # Asserts - expected_table_meta = { - 'name': 'x_table', - 'fields': dict() - } + # # Asserts + # expected_table_meta = { + # 'name': 'x_table', + # 'fields': dict() + # } - assert metadata._metadata['tables']['x_table'] == expected_table_meta + # assert metadata._metadata['tables']['x_table'] == expected_table_meta - metadata.add_primary_key.assert_called_once_with('x_table', 'id') - metadata.add_relationship.call_count == 0 + # metadata.add_primary_key.assert_called_once_with('x_table', 'id') + # metadata.add_relationship.call_count == 0 - def test_add_table_with_foreign_key(self): - """Add table with foreign key""" - # Setup - table_names = ['a_table', 'b_table'] + # def test_add_table_with_foreign_key(self): + # """Add table with foreign key""" + # # Setup + # table_names = 
['a_table', 'b_table'] - # Run - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = table_names - metadata._metadata = {'tables': dict()} + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = table_names + # metadata._metadata = {'tables': dict()} - Metadata.add_table(metadata, 'x_table', parent='users') + # Metadata.add_table(metadata, 'x_table', parent='users') - # Asserts - expected_table_meta = { - 'name': 'x_table', - 'fields': dict() - } - - assert metadata._metadata['tables']['x_table'] == expected_table_meta - - metadata.add_primary_key.call_count == 0 - metadata.add_relationship.assert_called_once_with('x_table', 'users', None) - - def test_add_table_with_fields_dict(self): - """Add table with fields(dict)""" - # Setup - table_names = ['a_table', 'b_table'] - - # Run - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = table_names - metadata._metadata = {'tables': dict()} - - fields = { - 'a_field': {'type': 'numerical', 'subtype': 'integer'} - } - - Metadata.add_table(metadata, 'x_table', fields=fields) - - # Asserts - expected_table_meta = { - 'name': 'x_table', - 'fields': { - 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'} - } - } - - assert metadata._metadata['tables']['x_table'] == expected_table_meta + # # Asserts + # expected_table_meta = { + # 'name': 'x_table', + # 'fields': dict() + # } - assert metadata._validate_field.call_args_list == [ - call({'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}) - ] - - metadata.add_primary_key.call_count == 0 - metadata.add_relationship.call_count == 0 - - def test_add_table_with_field_list_no_data(self): - """Add table with fields(list) no data""" - # Setup - table_names = ['a_table', 'b_table'] - - # Run - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = table_names - metadata._metadata = {'tables': dict()} - - fields = ['a_field', 'b_field'] - - with 
pytest.raises(ValueError): - Metadata.add_table(metadata, 'x_table', fields=fields) + # assert metadata._metadata['tables']['x_table'] == expected_table_meta - def test_add_table_with_field_list_data(self): - """Add table with fields(list) data""" - # Setup - table_names = ['a_table', 'b_table'] + # metadata.add_primary_key.call_count == 0 + # metadata.add_relationship.assert_called_once_with('x_table', 'users', None) - # Run - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = table_names - metadata._metadata = {'tables': dict()} + # def test_add_table_with_fields_dict(self): + # """Add table with fields(dict)""" + # # Setup + # table_names = ['a_table', 'b_table'] - fields = ['a_field', 'b_field'] - data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']}) + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = table_names + # metadata._metadata = {'tables': dict()} - Metadata.add_table(metadata, 'x_table', fields=fields, data=data) + # fields = { + # 'a_field': {'type': 'numerical', 'subtype': 'integer'} + # } - # Asserts - expected_table_meta = { - 'name': 'x_table', - 'fields': { - 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}, - 'b_field': {'name': 'b_field', 'type': 'boolean'} - } - } + # Metadata.add_table(metadata, 'x_table', fields=fields) - assert metadata._metadata['tables']['x_table'] == expected_table_meta + # # Asserts + # expected_table_meta = { + # 'name': 'x_table', + # 'fields': { + # 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'} + # } + # } - metadata.add_primary_key.call_count == 0 - metadata.add_relationship.call_count == 0 + # assert metadata._metadata['tables']['x_table'] == expected_table_meta - def test_add_table_with_data_analyze(self): - """Add table with data to analyze all""" - # Setup - table_names = ['a_table', 'b_table'] - - # Run - metadata = Mock(spec=Metadata) - 
metadata.get_table_names.return_value = table_names - metadata._metadata = {'tables': dict()} - - data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']}) - - Metadata.add_table(metadata, 'x_table', data=data) - - # Asserts - expected_table_meta = { - 'name': 'x_table', - 'fields': { - 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}, - 'b_field': {'name': 'b_field', 'type': 'boolean'}, - 'c_field': {'name': 'c_field', 'type': 'categorical'} - } - } - - assert metadata._metadata['tables']['x_table'] == expected_table_meta - - metadata.add_primary_key.call_count == 0 - metadata.add_relationship.call_count == 0 - - def test_add_relationship_table_no_exist(self): - """Add relationship table no exist""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = list() - - with pytest.raises(ValueError): - Metadata.add_relationship(metadata, 'a_table', 'b_table') - - def test_add_relationship_parent_no_exist(self): - """Add relationship table no exist""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = ['a_table'] - - with pytest.raises(ValueError): - Metadata.add_relationship(metadata, 'a_table', 'b_table') - - def test_add_relationship_already_exist(self): - """Add relationship already exist""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = ['a_table', 'b_table'] - metadata.get_parents.return_value = set(['b_table']) - - with pytest.raises(ValueError): - Metadata.add_relationship(metadata, 'a_table', 'b_table') - - def test_add_relationship_parent_is_child_of_table(self): - """Add relationship parent is child of table""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = ['a_table', 'b_table'] - metadata.get_parents.return_value = set() - metadata.get_children.return_value = set(['b_table']) - - with pytest.raises(ValueError): - 
Metadata.add_relationship(metadata, 'a_table', 'b_table') - - def test_add_relationship_parent_no_primary_key(self): - """Add relationship parent no primary key""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = ['a_table', 'b_table'] - metadata.get_parents.return_value = set() - metadata.get_children.return_value = set() - metadata.get_primary_key.return_value = None - - with pytest.raises(ValueError): - Metadata.add_relationship(metadata, 'a_table', 'b_table') - - def test_add_relationship_valid(self): - """Add relationship valid""" - # Run - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = ['a_table', 'b_table'] - metadata.get_parents.return_value = set() - metadata.get_children.return_value = set() - metadata.get_primary_key.return_value = 'pk_field' - - Metadata.add_relationship(metadata, 'a_table', 'b_table') - - # Asserts - metadata._validate_circular_relationships.assert_called_once_with('b_table', set()) - metadata.add_field.assert_called_once_with( - 'a_table', 'pk_field', 'id', None, {'ref': {'field': 'pk_field', 'table': 'b_table'}} - ) - metadata._get_relationships.assert_called_once_with() - - def test_add_primary_key_table_no_exist(self): - """Add primary key table no exist""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = list() - - with pytest.raises(ValueError): - Metadata.add_primary_key(metadata, 'a_table', 'a_field') - - def test_add_primary_key_field_exist(self): - """Add primary key field exist""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = ['a_table'] - metadata.get_fields.return_value = dict() - - with pytest.raises(ValueError): - Metadata.add_primary_key(metadata, 'a_table', 'a_field') - - def test_add_primary_key_primary_key_exist(self): - """Add primary key primary key exist""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = 
['a_table'] - metadata.get_fields.return_value = {'a_field': dict()} - metadata.get_primary_key.return_value = 'some_primary_key' - - with pytest.raises(ValueError): - Metadata.add_primary_key(metadata, 'a_table', 'a_field') - - def test_add_primary_key_valid(self): - """Add primary key valid""" - # Run - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = ['a_table'] - metadata.get_fields.return_value = dict() - metadata.get_primary_key.return_value = None - metadata.get_table_meta.return_value = dict() - - Metadata.add_primary_key(metadata, 'a_table', 'a_field') - - # Asserts - metadata.get_table_names.assert_called_once_with() - metadata.get_fields.assert_called_once_with('a_table') - metadata.get_primary_key.assert_called_once_with('a_table') - - metadata.get_table_meta.assert_called_once_with('a_table') - metadata.add_field.assert_called_once_with('a_table', 'a_field', 'id', None, None) - - def test_add_field_table_no_exist(self): - """Add field table no exist""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = list() - - with pytest.raises(ValueError): - Metadata.add_field(metadata, 'a_table', 'a_field', 'id', None, None) - - def test_add_field_field_exist(self): - """Add field already exist""" - # Run and asserts - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = ['a_table'] - metadata.get_fields.return_value = {'a_field': dict()} - - with pytest.raises(ValueError): - Metadata.add_field(metadata, 'a_table', 'a_field', 'id', None, None) - - def test_add_field_valid(self): - """Add valid field""" - # Run - metadata = Mock(spec=Metadata) - metadata.get_table_names.return_value = ['a_table'] - metadata.get_fields.return_value = dict() - - Metadata.add_field(metadata, 'a_table', 'a_field', 'numerical', 'integer', {'min': 0}) - - # Asserts - metadata.get_table_names.assert_called_once_with() - metadata.get_fields.assert_called_once_with('a_table') + # assert 
metadata._validate_field.call_args_list == [ + # call({'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}) + # ] + + # metadata.add_primary_key.call_count == 0 + # metadata.add_relationship.call_count == 0 + + # def test_add_table_with_field_list_no_data(self): + # """Add table with fields(list) no data""" + # # Setup + # table_names = ['a_table', 'b_table'] + + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = table_names + # metadata._metadata = {'tables': dict()} + + # fields = ['a_field', 'b_field'] + + # with pytest.raises(ValueError): + # Metadata.add_table(metadata, 'x_table', fields=fields) + + # def test_add_table_with_field_list_data(self): + # """Add table with fields(list) data""" + # # Setup + # table_names = ['a_table', 'b_table'] + + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = table_names + # metadata._metadata = {'tables': dict()} + + # fields = ['a_field', 'b_field'] + # data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']}) + + # Metadata.add_table(metadata, 'x_table', fields=fields, data=data) + + # # Asserts + # expected_table_meta = { + # 'name': 'x_table', + # 'fields': { + # 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}, + # 'b_field': {'name': 'b_field', 'type': 'boolean'} + # } + # } + + # assert metadata._metadata['tables']['x_table'] == expected_table_meta + + # metadata.add_primary_key.call_count == 0 + # metadata.add_relationship.call_count == 0 + + # def test_add_table_with_data_analyze(self): + # """Add table with data to analyze all""" + # # Setup + # table_names = ['a_table', 'b_table'] + + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = table_names + # metadata._metadata = {'tables': dict()} + + # data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']}) + + # Metadata.add_table(metadata, 'x_table', data=data) + + # # Asserts 
+ # expected_table_meta = { + # 'name': 'x_table', + # 'fields': { + # 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}, + # 'b_field': {'name': 'b_field', 'type': 'boolean'}, + # 'c_field': {'name': 'c_field', 'type': 'categorical'} + # } + # } + + # assert metadata._metadata['tables']['x_table'] == expected_table_meta + + # metadata.add_primary_key.call_count == 0 + # metadata.add_relationship.call_count == 0 + + # def test_add_relationship_table_no_exist(self): + # """Add relationship table no exist""" + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = list() + + # with pytest.raises(ValueError): + # Metadata.add_relationship(metadata, 'a_table', 'b_table') + + # def test_add_relationship_parent_no_exist(self): + # """Add relationship table no exist""" + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table'] + + # with pytest.raises(ValueError): + # Metadata.add_relationship(metadata, 'a_table', 'b_table') + + # def test_add_relationship_already_exist(self): + # """Add relationship already exist""" + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table', 'b_table'] + # metadata.get_parents.return_value = set(['b_table']) + + # with pytest.raises(ValueError): + # Metadata.add_relationship(metadata, 'a_table', 'b_table') + + # def test_add_relationship_parent_is_child_of_table(self): + # """Add relationship parent is child of table""" + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table', 'b_table'] + # metadata.get_parents.return_value = set() + # metadata.get_children.return_value = set(['b_table']) + + # with pytest.raises(ValueError): + # Metadata.add_relationship(metadata, 'a_table', 'b_table') + + # def test_add_relationship_parent_no_primary_key(self): + # """Add relationship parent no primary key""" + # # Run and asserts + # metadata = 
Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table', 'b_table'] + # metadata.get_parents.return_value = set() + # metadata.get_children.return_value = set() + # metadata.get_primary_key.return_value = None + + # with pytest.raises(ValueError): + # Metadata.add_relationship(metadata, 'a_table', 'b_table') + + # def test_add_relationship_valid(self): + # """Add relationship valid""" + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table', 'b_table'] + # metadata.get_parents.return_value = set() + # metadata.get_children.return_value = set() + # metadata.get_primary_key.return_value = 'pk_field' + + # Metadata.add_relationship(metadata, 'a_table', 'b_table') + + # # Asserts + # metadata._validate_circular_relationships.assert_called_once_with('b_table', set()) + # metadata.add_field.assert_called_once_with( + # 'a_table', 'pk_field', 'id', None, {'ref': {'field': 'pk_field', 'table': 'b_table'}} + # ) + # metadata._analyze_relationships.assert_called_once_with() + + # def test_add_primary_key_table_no_exist(self): + # """Add primary key table no exist""" + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = list() + + # with pytest.raises(ValueError): + # Metadata.add_primary_key(metadata, 'a_table', 'a_field') + + # def test_add_primary_key_field_exist(self): + # """Add primary key field exist""" + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table'] + # metadata.get_fields.return_value = dict() + + # with pytest.raises(ValueError): + # Metadata.add_primary_key(metadata, 'a_table', 'a_field') + + # def test_add_primary_key_primary_key_exist(self): + # """Add primary key primary key exist""" + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table'] + # metadata.get_fields.return_value = {'a_field': dict()} + # metadata.get_primary_key.return_value = 'some_primary_key' + 
+ # with pytest.raises(ValueError): + # Metadata.add_primary_key(metadata, 'a_table', 'a_field') + + # def test_add_primary_key_valid(self): + # """Add primary key valid""" + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table'] + # metadata.get_fields.return_value = dict() + # metadata.get_primary_key.return_value = None + # metadata.get_table_meta.return_value = dict() + + # Metadata.add_primary_key(metadata, 'a_table', 'a_field') + + # # Asserts + # metadata.get_tables.assert_called_once_with() + # metadata.get_fields.assert_called_once_with('a_table') + # metadata.get_primary_key.assert_called_once_with('a_table') + + # metadata.get_table_meta.assert_called_once_with('a_table') + # metadata.add_field.assert_called_once_with('a_table', 'a_field', 'id', None, None) + + # def test_add_field_table_no_exist(self): + # """Add field table no exist""" + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = list() + + # with pytest.raises(ValueError): + # Metadata.add_field(metadata, 'a_table', 'a_field', 'id', None, None) + + # def test_add_field_field_exist(self): + # """Add field already exist""" + # # Run and asserts + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table'] + # metadata.get_fields.return_value = {'a_field': dict()} + + # with pytest.raises(ValueError): + # Metadata.add_field(metadata, 'a_table', 'a_field', 'id', None, None) + + # def test_add_field_valid(self): + # """Add valid field""" + # # Run + # metadata = Mock(spec=Metadata) + # metadata.get_tables.return_value = ['a_table'] + # metadata.get_fields.return_value = dict() + + # Metadata.add_field(metadata, 'a_table', 'a_field', 'numerical', 'integer', {'min': 0}) + + # # Asserts + # metadata.get_tables.assert_called_once_with() + # metadata.get_fields.assert_called_once_with('a_table') diff --git a/tests/test_modeler.py b/tests/test_modeler.py index a8857d3b7..a480ff1f4 100644 --- 
a/tests/test_modeler.py +++ b/tests/test_modeler.py @@ -2,7 +2,7 @@ from unittest.mock import Mock, call, patch import pandas as pd -from copulas.multivariate import GaussianMultivariate, VineCopula +from copulas.multivariate import GaussianMultivariate from sdv.metadata import Metadata from sdv.modeler import Modeler @@ -23,21 +23,20 @@ def test___init__default(self): def test___init__with_arguments(self): # Run - modeler = Modeler({'some': 'metadata'}, model=VineCopula, model_kwargs={'some': 'kwargs'}) + model = Mock() + modeler = Modeler({'some': 'metadata'}, model=model, model_kwargs={'some': 'kwargs'}) # Asserts assert modeler.models == dict() assert modeler.metadata == {'some': 'metadata'} - assert modeler.model == VineCopula + assert modeler.model == model assert modeler.model_kwargs == {'some': 'kwargs'} def test__flatten_array(self): """Test get flatten array""" # Run nested = [['foo', 'bar'], 'tar'] - prefix = 'test' - - result = Modeler._flatten_array(nested, prefix=prefix) + result = Modeler._flatten_array(nested, prefix='test') # Asserts expected = { @@ -45,7 +44,6 @@ def test__flatten_array(self): 'test__0__1': 'bar', 'test__1': 'tar' } - assert result == expected def test__flatten_dict(self): @@ -59,7 +57,6 @@ def test__flatten_dict(self): 'distribution': 'value_2', 'type': 'value_3' } - result = Modeler._flatten_dict(nested, prefix='test') # Asserts @@ -69,7 +66,6 @@ def test__flatten_dict(self): 'test__tar__0': 'value_tar_list_0', 'test__tar__1': 'value_tar_list_1' } - assert result == expected @patch('numpy.log') @@ -99,23 +95,21 @@ def test__get_model_dict_default_model(self, log_mock): 'distrib4': distrib4 } - # Run modeler = Mock(spec=Modeler) modeler._fit_model.return_value = model_fitted - modeler._flatten_dict.return_value = 'result' + # Run data = pd.DataFrame({'data': [1, 2, 3]}) - result = Modeler._get_model_dict(modeler, data) # Asserts + assert result == modeler._flatten_dict.return_value + expected_log_mock_call = [ call(0.2), 
call(0.5), call(0.3) ] - - assert result == 'result' assert sorted(log_mock.call_args_list) == sorted(expected_log_mock_call) pd.testing.assert_frame_equal( @@ -126,44 +120,38 @@ def test__get_model_dict_default_model(self, log_mock): def test__get_extensions(self): """Test get list of extensions from childs""" # Setup + modeler = Mock() model_dict = [ {'model': 'data 1'}, {'model': 'data 2'}, {'model': 'data 3'} ] - - # Run - modeler = Mock() modeler._get_model_dict.side_effect = model_dict - child_name = 'some_name' + # Run child_table = pd.DataFrame({'foo': ['aaa', 'bbb', 'ccc']}) - - result = Modeler._get_extension(modeler, child_name, child_table, 'foo') + result = Modeler._get_extension(modeler, 'some_name', child_table, 'foo') # Asserts expected = pd.DataFrame({ '__some_name__model': ['data 1', 'data 2', 'data 3'], '__some_name__child_rows': [1, 1, 1] }, index=['aaa', 'bbb', 'ccc']) - pd.testing.assert_frame_equal(result, expected) assert modeler._get_model_dict.call_count == 3 def test_cpa_with_tables_no_primary_key(self): """Test CPA with tables and no primary key.""" - # Run + # Setup modeler = Mock(spec=Modeler) - modeler.metadata = Mock(spec=Metadata) modeler.models = dict() - modeler.metadata.transform.return_value = pd.DataFrame({'data': [1, 2, 3]}) modeler.metadata.get_primary_key.return_value = None modeler._fit_model.return_value = 'fitted model' + # Run tables = {'test': pd.DataFrame({'data': ['a', 'b', 'c']})} - result = Modeler.cpa(modeler, 'test', tables) # Asserts @@ -189,7 +177,6 @@ def test__impute(self): # Asserts expected = pd.DataFrame({'foo': [0, 0.5, 1], 'bar': ['a', 'a', 'b']}) - pd.testing.assert_frame_equal(result, expected) def test_model_database(self): @@ -201,18 +188,17 @@ def rcpa_side_effect(table_name, tables): metadata_table_names = ['foo', 'bar', 'tar'] metadata_parents = [None, 'bar_parent', None] - # Run modeler = Mock() - modeler.metadata.get_table_names.return_value = metadata_table_names + 
modeler.metadata.get_tables.return_value = metadata_table_names modeler.metadata.get_parents.side_effect = metadata_parents modeler.rcpa.side_effect = rcpa_side_effect modeler.models = dict() + # Run Modeler.model_database(modeler) # Asserts expected_metadata_parents_call_count = 3 expected_metadata_parents_call = [call('foo'), call('bar'), call('tar')] - assert modeler.metadata.get_parents.call_count == expected_metadata_parents_call_count assert modeler.metadata.get_parents.call_args_list == expected_metadata_parents_call diff --git a/tests/test_sampler.py b/tests/test_sampler.py index e62a19cfe..da043c382 100644 --- a/tests/test_sampler.py +++ b/tests/test_sampler.py @@ -27,33 +27,30 @@ def test__square_matrix(self): """Test fill zeros a triangular matrix""" # Run matrix = [[0.1, 0.5], [0.3]] - result = Sampler._square_matrix(matrix) # Asserts expected = [[0.1, 0.5], [0.3, 0.0]] - assert result == expected def test__prepare_sampled_covariance(self): """Test prepare_sampler_covariante""" # Run covariance = [[0, 1], [1]] - result = Sampler(None, None)._prepare_sampled_covariance(covariance) # Asserts expected = np.array([[1., 1.], [1., 1.0]]) - np.testing.assert_almost_equal(result, expected) def test__reset_primary_keys_generators(self): """Test reset values""" - # Run - sampler = Mock() + # Setup + sampler = Mock(spec=Sampler) sampler.primary_key = 'something' sampler.remaining_primary_key = 'else' + # Run Sampler._reset_primary_keys_generators(sampler) # Asserts @@ -65,87 +62,92 @@ def test__transform_synthesized_rows(self): # Setup metadata_reverse_transform = pd.DataFrame({'foo': [0, 1], 'bar': [2, 3], 'tar': [4, 5]}) - # Run sampler = Mock(spec=Sampler) sampler.metadata = Mock(spec=Metadata) - sampler.metadata.reverse_transform.return_value = metadata_reverse_transform sampler.metadata.get_fields.return_value = {'foo': 'some data', 'tar': 'some data'} + # Run synthesized = pd.DataFrame({'data': [1, 2, 3]}) - result = 
Sampler._transform_synthesized_rows(sampler, synthesized, 'test') # Asserts expected = pd.DataFrame({'foo': [0, 1], 'tar': [4, 5]}) - pd.testing.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test__get_primary_keys_none(self): """Test returns a tuple of none when a table doesn't have a primary key""" - # Run + # Setup sampler = Mock(spec=Sampler) sampler.metadata = Mock(spec=Metadata) sampler.metadata.get_primary_key.return_value = None + # Run result = Sampler._get_primary_keys(sampler, 'test', 5) # Asserts - expected = (None, None) - assert result == expected + assert result == (None, None) def test__get_primary_keys_raise_value_error_field_not_id(self): """Test a ValueError is raised when generator is None and field type not id.""" - # Run & asserts + # Setup sampler = Mock(spec=Sampler) sampler.metadata = Mock(spec=Metadata) - sampler.metadata.get_primary_key.return_value = 'pk_field' sampler.metadata.get_fields.return_value = {'pk_field': {'type': 'not id'}} sampler.primary_key = {'test': None} + # Run with pytest.raises(ValueError): Sampler._get_primary_keys(sampler, 'test', 5) def test__get_primary_keys_raise_value_error_field_not_supported(self): """Test a ValueError is raised when a field subtype is not supported.""" - # Run & asserts + # Setup sampler = Mock(spec=Sampler) sampler.metadata = Mock(spec=Metadata) - sampler.metadata.get_primary_key.return_value = 'pk_field' sampler.metadata.get_fields.return_value = {'pk_field': {'type': 'id', 'subtype': 'X'}} sampler.primary_key = {'test': None} + # Run with pytest.raises(ValueError): Sampler._get_primary_keys(sampler, 'test', 5) def test__get_primary_keys_raises_not_implemented_error_datetime(self): """Test a NotImplementedError is raised when pk field is datetime.""" - # Run & asserts + # Setup sampler = Mock(spec=Sampler) sampler.metadata = Mock(spec=Metadata) - sampler.metadata.get_primary_key.return_value = 'pk_field' sampler.metadata.get_fields.return_value = { - 
'pk_field': {'type': 'id', 'subtype': 'datetime'}} + 'pk_field': { + 'type': 'id', + 'subtype': 'datetime' + } + } sampler.primary_key = {'test': None} + # Run with pytest.raises(NotImplementedError): Sampler._get_primary_keys(sampler, 'test', 5) def test__get_primary_keys_raises_value_error_remaining(self): """Test a ValueError is raised when there are not enough uniques values""" - # Run & asserts + # Setup sampler = Mock(spec=Sampler) sampler.metadata = Mock(spec=Metadata) - sampler.metadata.get_primary_key.return_value = 'pk_field' sampler.metadata.get_fields.return_value = { - 'pk_field': {'type': 'id', 'subtype': 'datetime'}} + 'pk_field': { + 'type': 'id', + 'subtype': 'datetime' + } + } sampler.primary_key = {'test': 'generator'} sampler.remaining_primary_key = {'test': 4} + # Run with pytest.raises(ValueError): Sampler._get_primary_keys(sampler, 'test', 5) @@ -153,19 +155,15 @@ def test__key_order(self): """Test key order""" # Run key_value = ['foo__0__1'] - result = Sampler._key_order(key_value) # Asserts - expected = ['foo', 0, 1] - - assert result == expected + assert result == ['foo', 0, 1] def test__unflatten_dict_raises_error_row_index(self): """Test unflatten dict raises error row_index""" # Setup sampler = Mock(autospec=Sampler) - flat = { 'foo__0__1': 'some value' } @@ -177,8 +175,7 @@ def test__unflatten_dict_raises_error_row_index(self): def test__unflatten_dict_raises_error_column_index(self): """Test unflatten dict raises error column_index""" # Setup - sampler = Mock() - + sampler = Mock(spec=Sampler) flat = { 'foo__1__0': 'some value' } @@ -190,16 +187,15 @@ def test__unflatten_dict_raises_error_column_index(self): def test__unflatten_dict(self): """Test unflatten_dict""" # Setup - sampler = Mock() + sampler = Mock(spec=Sampler) sampler._key_order = None + # Run flat = { 'foo__0__foo': 'foo value', 'bar__0__0': 'bar value', 'tar': 'tar value' } - - # Run result = Sampler._unflatten_dict(sampler, flat) # Asserts @@ -208,89 +204,83 @@ def 
test__unflatten_dict(self): 'bar': [['bar value']], 'tar': 'tar value', } - assert result == expected def test__make_positive_definite(self): """Test find the nearest positive-definite matrix""" - # Run - sampler = Mock() + # Setup + sampler = Mock(spec=Sampler) sampler._check_matrix_symmetric_positive_definite.return_value = True + # Run matrix = np.array([[0, 1], [1, 0]]) - result = Sampler._make_positive_definite(sampler, matrix) # Asserts expected = np.array([[0.5, 0.5], [0.5, 0.5]]) - np.testing.assert_equal(result, expected) + assert sampler._check_matrix_symmetric_positive_definite.call_count == 1 def test__make_positive_definite_iterate(self): """Test find the nearest positive-definite matrix iterating""" # Setup - check_matrix = [False, False, True] - # Run - sampler = Mock() - sampler._check_matrix_symmetric_positive_definite.side_effect = check_matrix + sampler = Mock(spec=Sampler) + sampler._check_matrix_symmetric_positive_definite.side_effect = [False, False, True] + # Run matrix = np.array([[-1, -5], [-3, -7]]) - result = Sampler._make_positive_definite(sampler, matrix) # Asserts expected = np.array([[0.8, -0.4], [-0.4, 0.2]]) - np.testing.assert_array_almost_equal(result, expected) + assert sampler._check_matrix_symmetric_positive_definite.call_count == 3 def test__check_matrix_symmetric_positive_definite_shape_error(self): """Test check matrix shape error""" + # Setup + sampler = Mock(spec=Sampler) + # Run - sampler = Mock() matrix = np.array([]) - result = Sampler._check_matrix_symmetric_positive_definite(sampler, matrix) # Asserts - expected = False - - assert result == expected + assert not result def test__check_matrix_symmetric_positive_definite_np_error(self): """Test check matrix numpy raise error""" + # Setup + sampler = Mock(spec=Sampler) + # Run - sampler = Mock() matrix = np.array([[-1, 0], [0, 0]]) - result = Sampler._check_matrix_symmetric_positive_definite(sampler, matrix) # Asserts - expected = False - - assert result == expected + 
assert not result def test__check_matrix_symmetric_positive_definite(self): """Test check matrix numpy""" + # Setup + sampler = Mock(spec=Sampler) + # Run - sampler = Mock() matrix = np.array([[0.5, 0.5], [0.5, 0.5]]) - result = Sampler._check_matrix_symmetric_positive_definite(sampler, matrix) # Asserts - expected = True - - assert result is expected + assert result def test__unflatten_gaussian_copula(self): """Test unflatte gaussian copula""" # Setup - fixed_covariance = [[0.4, 0.2], [0.2, 0.0]] sampler = Mock(autospec=Sampler) - sampler._prepare_sampled_covariance.return_value = fixed_covariance + sampler._prepare_sampled_covariance.return_value = [[0.4, 0.2], [0.2, 0.0]] + # Run model_parameters = { 'distribs': { 'foo': {'std': 0.5} @@ -316,28 +306,24 @@ def test__unflatten_gaussian_copula(self): def test__get_extension(self): """Test get extension""" - # Run - sampler = Mock() + # Setup + sampler = Mock(spec=Sampler) + # Run parent_row = pd.Series([[0, 1], [1, 0]], index=['__foo__field', '__foo__field2']) table_name = 'foo' - result = Sampler._get_extension(sampler, parent_row, table_name) # Asserts expected = {'field': [0, 1], 'field2': [1, 0]} - assert result == expected def test__get_model(self): """Test get model""" # Setup - unflatten_dict = {'unflatten': 'dict'} - unflatten_gaussian = {'unflatten': 'gaussian'} - - sampler = Mock() - sampler._unflatten_dict.return_value = unflatten_dict - sampler._unflatten_gaussian_copula.return_value = unflatten_gaussian + sampler = Mock(spec=Sampler) + sampler._unflatten_dict.return_value = {'unflatten': 'dict'} + sampler._unflatten_gaussian_copula.return_value = {'unflatten': 'gaussian'} table_model = Mock() table_model.to_dict.return_value = { 'distribution': 'copulas.multivariate.gaussian.GaussianMultivariate' @@ -348,61 +334,52 @@ def test__get_model(self): Sampler._get_model(sampler, extension, table_model) # Asserts - expected_unflatten_dict_call = {'extension': 'dict'} + 
sampler._unflatten_dict.assert_called_once_with({'extension': 'dict'}) + expected_unflatten_gaussian_call = { 'unflatten': 'dict', 'fitted': True, 'distribution': 'copulas.multivariate.gaussian.GaussianMultivariate' } - expected_from_dict_call = {'unflatten': 'gaussian'} - - sampler._unflatten_dict.assert_called_once_with(expected_unflatten_dict_call) sampler._unflatten_gaussian_copula.assert_called_once_with( expected_unflatten_gaussian_call) - table_model.from_dict.assert_called_once_with(expected_from_dict_call) + + table_model.from_dict.assert_called_once_with({'unflatten': 'gaussian'}) def test__sample_rows(self): """Test sample rows from model""" # Setup - primary_keys = ('pk', [1, 2, 3, 4]) - model_sample = dict() - - # Run - sampler = Mock() - sampler._get_primary_keys.return_value = primary_keys + sampler = Mock(spec=Sampler) + sampler._get_primary_keys.return_value = ('pk', [1, 2, 3, 4]) model = Mock() - model.sample.return_value = model_sample - num_rows = 5 - table_name = 'test' + model.sample.return_value = dict() - result = Sampler._sample_rows(sampler, model, num_rows, table_name) + # Run + result = Sampler._sample_rows(sampler, model, num_rows=5, table_name='test') # Asserts - expected = {'pk': [1, 2, 3, 4]} + assert result == {'pk': [1, 2, 3, 4]} - assert result == expected sampler._get_primary_keys.assert_called_once_with('test', 5) model.sample.called_once_with(5) def test__sample_children(self): """Test sample children""" # Setup - metadata_children = ['child A', 'child B', 'child C'] + sampler = Mock(spec=Sampler) + sampler.metadata.get_children.return_value = ['child A', 'child B', 'child C'] # Run - sampler = Mock() - sampler.metadata.get_children.return_value = metadata_children - - table_name = 'test' sampled = { 'test': pd.DataFrame({'field': [11, 22, 33]}) } - - Sampler._sample_children(sampler, table_name, sampled) + Sampler._sample_children(sampler, 'test', sampled) # Asserts - expected__sample_table_call_args = [ + 
sampler.metadata.get_children.assert_called_once_with('test') + + expected_calls = [ ['child A', 'test', pd.Series([11], index=['field'], name=0), sampled], ['child A', 'test', pd.Series([22], index=['field'], name=1), sampled], ['child A', 'test', pd.Series([33], index=['field'], name=2), sampled], @@ -413,11 +390,8 @@ def test__sample_children(self): ['child C', 'test', pd.Series([22], index=['field'], name=1), sampled], ['child C', 'test', pd.Series([33], index=['field'], name=2), sampled], ] - - sampler.metadata.get_children.assert_called_once_with('test') - - for result_call, expected_call in zip( - sampler._sample_table.call_args_list, expected__sample_table_call_args): + actual_calls = sampler._sample_table.call_args_list + for result_call, expected_call in zip(actual_calls, expected_calls): assert result_call[0][0] == expected_call[0] assert result_call[0][1] == expected_call[1] assert result_call[0][3] == expected_call[3] @@ -426,16 +400,17 @@ def test__sample_children(self): def test__sample_table_sampled_empty(self): """Test sample table when sampled is still an empty dict.""" # Setup - sampler = Mock(autospec=Sampler) + sampler = Mock(spec=Sampler) sampler._get_extension.return_value = {'child_rows': 5} + table_model_mock = Mock() sampler.models = {'test': table_model_mock} + model_mock = Mock() sampler._get_model.return_value = model_mock sampler._sample_rows.return_value = pd.DataFrame({ 'value': [1, 2, 3, 4, 5] }) - sampler.metadata.get_primary_key.return_value = 'id' sampler.metadata.get_foreign_key.return_value = 'parent_id' @@ -464,16 +439,17 @@ def test__sample_table_sampled_empty(self): def test__sample_table_sampled_not_empty(self): """Test sample table when sampled previous sampled rows exist.""" # Setup - sampler = Mock(autospec=Sampler) + sampler = Mock(spec=Sampler) sampler._get_extension.return_value = {'child_rows': 5} + table_model_mock = Mock() sampler.models = {'test': table_model_mock} + model_mock = Mock() 
sampler._get_model.return_value = model_mock sampler._sample_rows.return_value = pd.DataFrame({ 'value': [6, 7, 8, 9, 10] }) - sampler.metadata.get_primary_key.return_value = 'id' sampler.metadata.get_foreign_key.return_value = 'parent_id' @@ -510,40 +486,26 @@ def test_sample_all(self): def sample_side_effect(table, num_rows): return {table: pd.DataFrame({'foo': range(num_rows)})} - metadata_parents_side_effect = [False, True, False] - - metadata_table_names = ['table a', 'table b', 'table c'] - - # Run - sampler = Mock() - sampler.metadata.get_table_names.return_value = metadata_table_names - sampler.metadata.get_parents.side_effect = metadata_parents_side_effect + sampler = Mock(spec=Sampler) + sampler.metadata.get_tables.return_value = ['table a', 'table b', 'table c'] + sampler.metadata.get_parents.side_effect = [False, True, False] sampler.sample.side_effect = sample_side_effect - num_rows = 3 - reset_primary_keys = True - - result = Sampler.sample_all( - sampler, num_rows=num_rows, reset_primary_keys=reset_primary_keys) + # Run + result = Sampler.sample_all(sampler, num_rows=3, reset_primary_keys=True) # Asserts assert sampler.metadata.get_parents.call_count == 3 assert sampler._reset_primary_keys_generators.call_count == 1 - pd.testing.assert_frame_equal(result['table a'], pd.DataFrame({'foo': range(num_rows)})) - pd.testing.assert_frame_equal(result['table c'], pd.DataFrame({'foo': range(num_rows)})) + pd.testing.assert_frame_equal(result['table a'], pd.DataFrame({'foo': range(3)})) + pd.testing.assert_frame_equal(result['table c'], pd.DataFrame({'foo': range(3)})) def test_sample_no_sample_children(self): """Test sample no sample children""" # Setup - models = {'test': 'model'} - - # Run - sampler = Mock() - sampler.models = models + sampler = Mock(spec=Sampler) + sampler.models = {'test': 'model'} sampler.metadata.get_parents.return_value = None - table_name = 'test' - num_rows = 5 - Sampler.sample(sampler, table_name, num_rows, sample_children=False) - 
- # Asserts + # Run + Sampler.sample(sampler, 'test', 5, sample_children=False) diff --git a/tests/test_sdv.py b/tests/test_sdv.py index 0b8848f11..e6844a4d1 100644 --- a/tests/test_sdv.py +++ b/tests/test_sdv.py @@ -11,9 +11,11 @@ class TestSDV(TestCase): @patch('sdv.sdv.open') @patch('sdv.sdv.pickle') def test_save(self, pickle_mock, open_mock): + # Run sdv = SDV() sdv.save('save/path.pkl') + # Asserts open_mock.assert_called_once_with('save/path.pkl', 'wb') output = open_mock.return_value.__enter__.return_value pickle_mock.dump.assert_called_once_with(sdv, output) @@ -21,8 +23,10 @@ def test_save(self, pickle_mock, open_mock): @patch('sdv.sdv.open') @patch('sdv.sdv.pickle') def test_load(self, pickle_mock, open_mock): + # Run returned = SDV.load('save/path.pkl') + # Asserts open_mock.assert_called_once_with('save/path.pkl', 'rb') output = open_mock.return_value.__enter__.return_value pickle_mock.load.assert_called_once_with(output) @@ -50,78 +54,70 @@ def test____init__users_params(self): def test__validate_dataset_structure_no_error(self): """Test that any error is raised with a supported structure""" # Setup - table_names = ['foo', 'bar', 'tar'] - parents = [[], ['foo'], ['bar']] - - # Run sdv = Mock() - sdv.metadata.get_table_names.return_value = table_names - sdv.metadata.get_parents.side_effect = parents + sdv.metadata.get_tables.return_value = ['foo', 'bar', 'tar'] + sdv.metadata.get_parents.side_effect = [[], ['foo'], ['bar']] + # Run SDV._validate_dataset_structure(sdv) # Asserts - expect_get_parents_call_count = 3 - assert sdv.metadata.get_parents.call_count == expect_get_parents_call_count + assert sdv.metadata.get_parents.call_count == 3 def test__validate_dataset_structure_raise_error(self): """Test that a ValueError is raised because the bad structure""" # Setup - table_names = ['foo', 'bar', 'tar'] - parents = [[], [], ['foo', 'bar']] - - # Run & assert sdv = Mock() - sdv.metadata.get_table_names.return_value = table_names - 
sdv.metadata.get_parents.side_effect = parents + sdv.metadata.get_tables.return_value = ['foo', 'bar', 'tar'] + sdv.metadata.get_parents.side_effect = [[], [], ['foo', 'bar']] + # Run with pytest.raises(ValueError): SDV._validate_dataset_structure(sdv) def test_sample_fitted(self): """Check that the sample is called.""" - # Run + # Sample sdv = Mock() - table_name = 'DEMO' - num_rows = 5 sdv.sampler.sample.return_value = 'test' - result = SDV.sample(sdv, table_name, num_rows) + # Run + result = SDV.sample(sdv, 'DEMO', 5) # Asserts + assert result == 'test' sdv.sampler.sample.assert_called_once_with( 'DEMO', 5, sample_children=True, reset_primary_keys=False) - assert result == 'test' - def test_sample_not_fitted(self): """Check that the sample raise an exception when is not fitted.""" - # Run and asserts + # Setup sdv = Mock() sdv.sampler = None - table_name = 'DEMO' - num_rows = 5 + # Run with pytest.raises(NotFittedError): - SDV.sample(sdv, table_name, num_rows) + SDV.sample(sdv, 'DEMO', 5) def test_sample_all_fitted(self): """Check that the sample_all is called""" - # Run + # Setup sdv = Mock() sdv.sampler.sample_all.return_value = 'test' + # Run result = SDV.sample_all(sdv) # Asserts - sdv.sampler.sample_all.assert_called_once_with(5, reset_primary_keys=False) assert result == 'test' + sdv.sampler.sample_all.assert_called_once_with(5, reset_primary_keys=False) def test_sample_all_not_fitted(self): """Check that the sample_all raise an exception when is not fitted.""" - # Run & asserts + # Setup sdv = Mock() sdv.sampler = None + # Run with pytest.raises(NotFittedError): SDV.sample_all(sdv) From 8f1ed93150943b4e80167568807d7d1e5f0073e2 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Mon, 25 Nov 2019 15:49:30 +0100 Subject: [PATCH 11/16] Updates tests, load_demo use metadata=False by default and Metadata can retrieve metadata as Metadata instance itself --- README.md | 2 +- sdv/demo.py | 7 +- sdv/metadata.py | 4 +- tests/integration/test_metadata.py | 4 +- 
tests/test_metadata.py | 551 +++++++++++++---------------- 5 files changed, 252 insertions(+), 316 deletions(-) diff --git a/README.md b/README.md index 12bb683f9..64c304c01 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ function: ```python from sdv import load_demo -metadata, tables = load_demo() +metadata, tables = load_demo(metadata=True) ``` This will return two objects: diff --git a/sdv/demo.py b/sdv/demo.py index 7f9c895dc..9bdcc0f79 100644 --- a/sdv/demo.py +++ b/sdv/demo.py @@ -76,7 +76,7 @@ } -def load_demo(): +def load_demo(metadata=False): """Load demo data. The demo data consists of the metadata and tables dict for a a toy dataset with @@ -122,4 +122,7 @@ def load_demo(): 'transactions': transactions } - return DEMO_METADATA.copy(), tables + if metadata: + return DEMO_METADATA.copy(), tables + + return tables diff --git a/sdv/metadata.py b/sdv/metadata.py index 3937cdcff..694376cff 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -164,7 +164,9 @@ def __init__(self, metadata=None, root_path=None): else: self.root_path = root_path or '.' - if metadata is not None: + if isinstance(metadata, Metadata): + self._metadata = metadata + elif metadata is not None: self._metadata = self._dict_metadata(metadata) else: self._metadata = {'tables': {}} diff --git a/tests/integration/test_metadata.py b/tests/integration/test_metadata.py index 83b8b1f85..289788028 100644 --- a/tests/integration/test_metadata.py +++ b/tests/integration/test_metadata.py @@ -7,7 +7,7 @@ def test_build_demo_metadata_from_tables(): Then compare the built metadata with the demo one to make sure that they are the same. 
""" - metadata, tables = load_demo() + metadata, tables = load_demo(metadata=True) new_meta = Metadata() new_meta.add_table('users', data=tables['users'], primary_key='user_id') @@ -53,5 +53,5 @@ def test_build_demo_metadata_without_tables(): metadata.set_primary_key('transactions', 'transaction_id') metadata.add_relationship('sessions', 'transactions') - demo_metadata = load_demo()[0] + demo_metadata = load_demo(metadata=True)[0] assert demo_metadata == metadata.to_dict() diff --git a/tests/test_metadata.py b/tests/test_metadata.py index d349af79d..58601111c 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -668,339 +668,270 @@ def test_reverse_transform(self): expected_call ) - # def test_add_table_already_exist(self): - # """Try to add a new table that already exist""" - # # Setup - # table_names = ['a_table', 'b_table'] + def test_add_table_already_exist(self): + """Try to add a new table that already exist""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table', 'b_table'] - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = table_names + # Run + with pytest.raises(ValueError): + Metadata.add_table(metadata, 'a_table') - # with pytest.raises(ValueError): - # Metadata.add_table(metadata, 'a_table') + def test_add_table_only_name(self): + """Add table with only the name""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table', 'b_table'] + metadata._metadata = {'tables': dict()} - # def test_add_table_only_name(self): - # """Add table with only the name""" - # # Setup - # table_names = ['a_table', 'b_table'] + # Run + Metadata.add_table(metadata, 'x_table') - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = table_names - # metadata._metadata = {'tables': dict()} + # Asserts + expected_table_meta = { + 'fields': dict() + } - # Metadata.add_table(metadata, 'x_table') + assert 
metadata._metadata['tables']['x_table'] == expected_table_meta - # # Asserts - # expected_table_meta = { - # 'name': 'x_table', - # 'fields': dict() - # } + metadata.set_primary_key.call_count == 0 + metadata.add_relationship.call_count == 0 - # assert metadata._metadata['tables']['x_table'] == expected_table_meta + def test_add_table_with_primary_key(self): + """Add table with primary key""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table', 'b_table'] + metadata._metadata = {'tables': dict()} - # metadata.add_primary_key.call_count == 0 - # metadata.add_relationship.call_count == 0 + # Run + Metadata.add_table(metadata, 'x_table', primary_key='id') - # def test_add_table_with_primary_key(self): - # """Add table with primary key""" - # # Setup - # table_names = ['a_table', 'b_table'] + # Asserts + expected_table_meta = { + 'fields': dict() + } - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = table_names - # metadata._metadata = {'tables': dict()} + assert metadata._metadata['tables']['x_table'] == expected_table_meta - # Metadata.add_table(metadata, 'x_table', primary_key='id') + metadata.set_primary_key.assert_called_once_with('x_table', 'id') + metadata.add_relationship.call_count == 0 - # # Asserts - # expected_table_meta = { - # 'name': 'x_table', - # 'fields': dict() - # } + def test_add_table_with_foreign_key(self): + """Add table with foreign key""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table', 'b_table'] + metadata._metadata = {'tables': dict()} - # assert metadata._metadata['tables']['x_table'] == expected_table_meta + # Run + Metadata.add_table(metadata, 'x_table', parent='users') - # metadata.add_primary_key.assert_called_once_with('x_table', 'id') - # metadata.add_relationship.call_count == 0 + # Asserts + expected_table_meta = { + 'fields': dict() + } - # def test_add_table_with_foreign_key(self): - # """Add table with foreign 
key""" - # # Setup - # table_names = ['a_table', 'b_table'] + assert metadata._metadata['tables']['x_table'] == expected_table_meta - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = table_names - # metadata._metadata = {'tables': dict()} + metadata.set_primary_key.call_count == 0 + metadata.add_relationship.assert_called_once_with('users', 'x_table', None) - # Metadata.add_table(metadata, 'x_table', parent='users') + def test_add_table_with_fields_metadata(self): + """Add table with fields metadata""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table', 'b_table'] + metadata._metadata = {'tables': dict()} - # # Asserts - # expected_table_meta = { - # 'name': 'x_table', - # 'fields': dict() - # } + # Run + fields_metadata = { + 'a_field': {'type': 'numerical', 'subtype': 'integer'} + } - # assert metadata._metadata['tables']['x_table'] == expected_table_meta + Metadata.add_table(metadata, 'x_table', fields_metadata=fields_metadata) - # metadata.add_primary_key.call_count == 0 - # metadata.add_relationship.assert_called_once_with('x_table', 'users', None) - - # def test_add_table_with_fields_dict(self): - # """Add table with fields(dict)""" - # # Setup - # table_names = ['a_table', 'b_table'] + # Asserts + expected_table_meta = { + 'fields': { + 'a_field': {'type': 'numerical', 'subtype': 'integer'} + } + } - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = table_names - # metadata._metadata = {'tables': dict()} + assert metadata._metadata['tables']['x_table'] == expected_table_meta - # fields = { - # 'a_field': {'type': 'numerical', 'subtype': 'integer'} - # } + metadata.set_primary_key.call_count == 0 + metadata.add_relationship.call_count == 0 - # Metadata.add_table(metadata, 'x_table', fields=fields) + def test_add_table_with_fields_no_data(self): + """Add table with fields and no data""" + # Setup + metadata = Mock(spec=Metadata) + 
metadata.get_tables.return_value = ['a_table', 'b_table'] + metadata._metadata = {'tables': dict()} - # # Asserts - # expected_table_meta = { - # 'name': 'x_table', - # 'fields': { - # 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'} - # } - # } + # Run + fields = ['a_field', 'b_field'] + + Metadata.add_table(metadata, 'x_table', fields=fields) + + # Asserts + expected_table_meta = { + 'fields': dict() + } + + assert metadata._metadata['tables']['x_table'] == expected_table_meta + + def test_add_table_with_fields_data(self): + """Add table with fields and data""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table', 'b_table'] + metadata._metadata = {'tables': dict()} + metadata._get_field_details.return_value = { + 'a_field': {'type': 'numerical', 'subtype': 'integer'}, + 'b_field': {'type': 'boolean'} + } + + # Run + fields = ['a_field', 'b_field'] + data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']}) + + Metadata.add_table(metadata, 'x_table', fields=fields, data=data) + + # Asserts + expected_table_meta = { + 'fields': { + 'a_field': {'type': 'numerical', 'subtype': 'integer'}, + 'b_field': {'type': 'boolean'} + } + } + + assert metadata._metadata['tables']['x_table'] == expected_table_meta + + metadata.set_primary_key.call_count == 0 + metadata.add_relationship.call_count == 0 + + def test_add_table_with_no_fields_data(self): + """Add table with data to analyze all""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table', 'b_table'] + metadata._metadata = {'tables': dict()} + metadata._get_field_details.return_value = { + 'a_field': {'type': 'numerical', 'subtype': 'integer'}, + 'b_field': {'type': 'boolean'}, + 'c_field': {'type': 'categorical'} + } - # assert metadata._metadata['tables']['x_table'] == expected_table_meta + # Run + data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']}) + + 
Metadata.add_table(metadata, 'x_table', data=data) + + # Asserts + expected_table_meta = { + 'fields': { + 'a_field': {'type': 'numerical', 'subtype': 'integer'}, + 'b_field': {'type': 'boolean'}, + 'c_field': {'type': 'categorical'} + } + } + + assert metadata._metadata['tables']['x_table'] == expected_table_meta + + metadata.set_primary_key.call_count == 0 + metadata.add_relationship.call_count == 0 + + def test_add_relationship_table_no_exist(self): + """Add relationship table no exist""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = list() + + # Run + with pytest.raises(ValueError): + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + def test_add_relationship_parent_no_exist(self): + """Add relationship table no exist""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table'] + + # Run + with pytest.raises(ValueError): + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + def test_add_relationship_already_exist(self): + """Add relationship already exist""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table', 'b_table'] + metadata.get_parents.return_value = set(['b_table']) + + # Run + with pytest.raises(ValueError): + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + def test_add_relationship_parent_no_primary_key(self): + """Add relationship parent no primary key""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = ['a_table', 'b_table'] + metadata.get_parents.return_value = set() + metadata.get_children.return_value = set() + metadata.get_primary_key.return_value = None + + # Run + with pytest.raises(ValueError): + Metadata.add_relationship(metadata, 'a_table', 'b_table') + + def test_set_primary_key(self): + """Set primary key table no exist""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = list() + metadata.get_fields.return_value = {'a_field': {'type': 
'id', 'subtype': 'integer'}} + metadata._metadata = { + 'tables': { + 'a_table': { + 'fields': {'a_field': {'type': 'id', 'subtype': 'integer'}} + } + } + } + + # Run + Metadata.set_primary_key(metadata, 'a_table', 'a_field') + + # Asserts + metadata._check_field.assert_called_once_with('a_table', 'a_field', exists=True) + metadata.get_fields.assert_called_once_with('a_table') + metadata._get_key_subtype.assert_called_once_with({'type': 'id', 'subtype': 'integer'}) + + def test_add_field(self): + """Add field table no exist""" + # Setup + metadata = Mock(spec=Metadata) + metadata.get_tables.return_value = list() + metadata._metadata = { + 'tables': { + 'a_table': {'fields': dict()} + } + } + + # Run + Metadata.add_field(metadata, 'a_table', 'a_field', 'id', 'string', None) + + # Asserts + expected_metadata = { + 'tables': { + 'a_table': { + 'fields': {'a_field': {'type': 'id', 'subtype': 'string'}} + } + } + } - # assert metadata._validate_field.call_args_list == [ - # call({'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}) - # ] - - # metadata.add_primary_key.call_count == 0 - # metadata.add_relationship.call_count == 0 - - # def test_add_table_with_field_list_no_data(self): - # """Add table with fields(list) no data""" - # # Setup - # table_names = ['a_table', 'b_table'] - - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = table_names - # metadata._metadata = {'tables': dict()} - - # fields = ['a_field', 'b_field'] - - # with pytest.raises(ValueError): - # Metadata.add_table(metadata, 'x_table', fields=fields) - - # def test_add_table_with_field_list_data(self): - # """Add table with fields(list) data""" - # # Setup - # table_names = ['a_table', 'b_table'] - - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = table_names - # metadata._metadata = {'tables': dict()} - - # fields = ['a_field', 'b_field'] - # data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 
'b']}) - - # Metadata.add_table(metadata, 'x_table', fields=fields, data=data) - - # # Asserts - # expected_table_meta = { - # 'name': 'x_table', - # 'fields': { - # 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}, - # 'b_field': {'name': 'b_field', 'type': 'boolean'} - # } - # } - - # assert metadata._metadata['tables']['x_table'] == expected_table_meta - - # metadata.add_primary_key.call_count == 0 - # metadata.add_relationship.call_count == 0 - - # def test_add_table_with_data_analyze(self): - # """Add table with data to analyze all""" - # # Setup - # table_names = ['a_table', 'b_table'] - - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = table_names - # metadata._metadata = {'tables': dict()} - - # data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']}) - - # Metadata.add_table(metadata, 'x_table', data=data) - - # # Asserts - # expected_table_meta = { - # 'name': 'x_table', - # 'fields': { - # 'a_field': {'name': 'a_field', 'type': 'numerical', 'subtype': 'integer'}, - # 'b_field': {'name': 'b_field', 'type': 'boolean'}, - # 'c_field': {'name': 'c_field', 'type': 'categorical'} - # } - # } - - # assert metadata._metadata['tables']['x_table'] == expected_table_meta - - # metadata.add_primary_key.call_count == 0 - # metadata.add_relationship.call_count == 0 - - # def test_add_relationship_table_no_exist(self): - # """Add relationship table no exist""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = list() - - # with pytest.raises(ValueError): - # Metadata.add_relationship(metadata, 'a_table', 'b_table') - - # def test_add_relationship_parent_no_exist(self): - # """Add relationship table no exist""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table'] - - # with pytest.raises(ValueError): - # Metadata.add_relationship(metadata, 'a_table', 'b_table') - - # def 
test_add_relationship_already_exist(self): - # """Add relationship already exist""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table', 'b_table'] - # metadata.get_parents.return_value = set(['b_table']) - - # with pytest.raises(ValueError): - # Metadata.add_relationship(metadata, 'a_table', 'b_table') - - # def test_add_relationship_parent_is_child_of_table(self): - # """Add relationship parent is child of table""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table', 'b_table'] - # metadata.get_parents.return_value = set() - # metadata.get_children.return_value = set(['b_table']) - - # with pytest.raises(ValueError): - # Metadata.add_relationship(metadata, 'a_table', 'b_table') - - # def test_add_relationship_parent_no_primary_key(self): - # """Add relationship parent no primary key""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table', 'b_table'] - # metadata.get_parents.return_value = set() - # metadata.get_children.return_value = set() - # metadata.get_primary_key.return_value = None - - # with pytest.raises(ValueError): - # Metadata.add_relationship(metadata, 'a_table', 'b_table') - - # def test_add_relationship_valid(self): - # """Add relationship valid""" - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table', 'b_table'] - # metadata.get_parents.return_value = set() - # metadata.get_children.return_value = set() - # metadata.get_primary_key.return_value = 'pk_field' - - # Metadata.add_relationship(metadata, 'a_table', 'b_table') - - # # Asserts - # metadata._validate_circular_relationships.assert_called_once_with('b_table', set()) - # metadata.add_field.assert_called_once_with( - # 'a_table', 'pk_field', 'id', None, {'ref': {'field': 'pk_field', 'table': 'b_table'}} - # ) - # metadata._analyze_relationships.assert_called_once_with() - - # def 
test_add_primary_key_table_no_exist(self): - # """Add primary key table no exist""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = list() - - # with pytest.raises(ValueError): - # Metadata.add_primary_key(metadata, 'a_table', 'a_field') - - # def test_add_primary_key_field_exist(self): - # """Add primary key field exist""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table'] - # metadata.get_fields.return_value = dict() - - # with pytest.raises(ValueError): - # Metadata.add_primary_key(metadata, 'a_table', 'a_field') - - # def test_add_primary_key_primary_key_exist(self): - # """Add primary key primary key exist""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table'] - # metadata.get_fields.return_value = {'a_field': dict()} - # metadata.get_primary_key.return_value = 'some_primary_key' - - # with pytest.raises(ValueError): - # Metadata.add_primary_key(metadata, 'a_table', 'a_field') - - # def test_add_primary_key_valid(self): - # """Add primary key valid""" - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table'] - # metadata.get_fields.return_value = dict() - # metadata.get_primary_key.return_value = None - # metadata.get_table_meta.return_value = dict() - - # Metadata.add_primary_key(metadata, 'a_table', 'a_field') - - # # Asserts - # metadata.get_tables.assert_called_once_with() - # metadata.get_fields.assert_called_once_with('a_table') - # metadata.get_primary_key.assert_called_once_with('a_table') - - # metadata.get_table_meta.assert_called_once_with('a_table') - # metadata.add_field.assert_called_once_with('a_table', 'a_field', 'id', None, None) - - # def test_add_field_table_no_exist(self): - # """Add field table no exist""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = list() - - # with pytest.raises(ValueError): 
- # Metadata.add_field(metadata, 'a_table', 'a_field', 'id', None, None) - - # def test_add_field_field_exist(self): - # """Add field already exist""" - # # Run and asserts - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table'] - # metadata.get_fields.return_value = {'a_field': dict()} - - # with pytest.raises(ValueError): - # Metadata.add_field(metadata, 'a_table', 'a_field', 'id', None, None) - - # def test_add_field_valid(self): - # """Add valid field""" - # # Run - # metadata = Mock(spec=Metadata) - # metadata.get_tables.return_value = ['a_table'] - # metadata.get_fields.return_value = dict() - - # Metadata.add_field(metadata, 'a_table', 'a_field', 'numerical', 'integer', {'min': 0}) - - # # Asserts - # metadata.get_tables.assert_called_once_with() - # metadata.get_fields.assert_called_once_with('a_table') + assert metadata._metadata == expected_metadata + metadata._check_field.assert_called_once_with('a_table', 'a_field', exists=False) From bc57a41cfc5890301b90ca36841b44c6ec0d3142 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Mon, 25 Nov 2019 15:58:47 +0100 Subject: [PATCH 12/16] fix metadata __init__ --- sdv/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdv/metadata.py b/sdv/metadata.py index 694376cff..f22cb2f9f 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -165,7 +165,7 @@ def __init__(self, metadata=None, root_path=None): self.root_path = root_path or '.' 
if isinstance(metadata, Metadata): - self._metadata = metadata + self._metadata = metadata._metadata elif metadata is not None: self._metadata = self._dict_metadata(metadata) else: From 627a127218fac57eaa4bc1078675614087acfe28 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Mon, 25 Nov 2019 16:19:59 +0100 Subject: [PATCH 13/16] fix metadata --- sdv/metadata.py | 4 +--- sdv/sdv.py | 6 +++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sdv/metadata.py b/sdv/metadata.py index f22cb2f9f..3937cdcff 100644 --- a/sdv/metadata.py +++ b/sdv/metadata.py @@ -164,9 +164,7 @@ def __init__(self, metadata=None, root_path=None): else: self.root_path = root_path or '.' - if isinstance(metadata, Metadata): - self._metadata = metadata._metadata - elif metadata is not None: + if metadata is not None: self._metadata = self._dict_metadata(metadata) else: self._metadata = {'tables': {}} diff --git a/sdv/sdv.py b/sdv/sdv.py index 2718ffbeb..2976b4986 100644 --- a/sdv/sdv.py +++ b/sdv/sdv.py @@ -63,7 +63,11 @@ def fit(self, metadata, tables=None, root_path=None): metadata is a dict, the current working directory is used. """ - self.metadata = Metadata(metadata, root_path) + if isinstance(metadata, Metadata): + self.metadata = metadata + else: + self.metadata = Metadata(metadata, root_path) + self._validate_dataset_structure() self.modeler = Modeler(self.metadata, self.model, self.model_kwargs) From 3260e3667b017a8d183f1aa7a8b38d651121e9c9 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Mon, 25 Nov 2019 16:23:53 +0100 Subject: [PATCH 14/16] fix docstrings --- sdv/sdv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdv/sdv.py b/sdv/sdv.py index 2976b4986..67a6ea470 100644 --- a/sdv/sdv.py +++ b/sdv/sdv.py @@ -51,8 +51,8 @@ def fit(self, metadata, tables=None, root_path=None): """Fit this SDV instance to the dataset data. Args: - metadata (dict or str): - Metadata dict or path to the metadata JSON file. 
+ metadata (dict, str or Metadata): + Metadata dict, path to the metadata JSON file or Metadata instance itself. tables (dict): Dictionary with the table names as key and ``pandas.DataFrame`` instances as values. If ``None`` is given, the tables will be loaded from the paths From 07c8eca9f51dc5dde419c8441c3e9086d3cf3335 Mon Sep 17 00:00:00 2001 From: JDTheRipperPC Date: Mon, 25 Nov 2019 16:27:37 +0100 Subject: [PATCH 15/16] Updated metadata docs --- docs/metadata.rst | 452 +++++++++++++++++++++++++++++++--------------- 1 file changed, 308 insertions(+), 144 deletions(-) diff --git a/docs/metadata.rst b/docs/metadata.rst index 0b47b2602..9edf30004 100644 --- a/docs/metadata.rst +++ b/docs/metadata.rst @@ -1,93 +1,249 @@ Metadata ======== -In order to have **SDV** process your dataset, you will need its **Metadata**: +In order to use **SDV** you will need a ``Metadata`` object alongside your data. -.. code-block:: python +This document explains how to create it, as well as how to represent it as a JSON file. 
- { - "tables": [ - { - "fields": [ - {"name": "user_id", "type": "id"}, - {"name": "country", "type": "categorical"}, - {"name": "gender", "type": "categorical"}, - {"name": "age", "type": "numerical", "subtype": "integer"} - ], - "headers": True, - "name": "users", - "path": "users.csv", - "primary_key": "user_id" - }, - { - "fields": [ - {"name": "session_id", "type": "id"}, - {"name": "user_id", "type": "id", "ref": { - "field": "user_id", "table": "users"}, - }, - {"name": "device", "type": "categorical"}, - {"name": "os", "type": "categorical"} - ], - "headers": True, - "name": "sessions", - "path": "sessions.csv", - "primary_key": "session_id" - }, - { - "fields": [ - {"name": "transaction_id", "type": "id"}, - {"name": "session_id", "type": "id", "ref": { - "field": "session_id", "table": "sessions"}, - }, - {"name": "timestamp", "format": "%Y-%m-%d", "type": "datetime"}, - {"name": "amount", "type": "numerical", "subtype": "float"}, - {"name": "approved", "type": "boolean"} - ], - "headers": True, - "name": "transactions", - "path": "transactions.csv", - "primary_key": "transaction_id" - } - ] - } +Generate Metadata from your data +-------------------------------- + +In this step by step guide you will show you how to create a ``Metadata`` object +by letting **SDV** analyze your tables and figure out the data types of your columns. + +Load your data +************** + +The first step is to load your data as ``pandas.DataFrame`` objects. +In this example, you will be loading the demo data using the ``load_demo`` function. + + .. code-block:: python + + from sdv.demo import load_demo + + tables = load_demo() + +Create a Metadata instance +************************** + +The next step is to create an empty instance of the ``Metadata`` class without +passing it any arguments. + + .. 
code-block:: python
+
+        from sdv import Metadata
+
+        metadata = Metadata()
+
+Add the first table
+*******************
+
+Once you have your ``Metadata`` instance ready you can start adding tables.
+
+In this example, you will add the table ``users``, which is the parent table of your
+dataset, indicating which is its Primary Key field, ``user_id``.
+
+Indicating the Primary Key is optional and can be skipped if your table has none, but
+then you will not be able to specify any child table.
+
+The ``Metadata`` instance will analyze all the columns in the passed table and identify
+the different data types and subtypes, and indicate that the ``user_id`` column is
+the table primary key.
+
+    .. code-block:: python
+
+        metadata.add_table('users', data=tables['users'], primary_key='user_id')
+
+Add a table specifying its parent
+*********************************
+
+In this second example, you will add the table ``sessions``, which is related to the
+``users`` table in a parent-child relationship, where each user can have multiple
+sessions, and each session belongs to one and only one user.
+
+In order to specify this, while creating the ``sessions`` table you have to indicate the
+name of the parent table, ``users``, and the field from the ``sessions`` table that
+acts as the foreign key, ``user_id``.
+
+With this, apart from analyzing all the columns and indicating the primary key like in
+the previous step, the ``Metadata`` instance will specify a relationship between the
+two tables by adding a property to the ``user_id`` field that indicates that it is related
+to the ``user_id`` field in the ``users`` table.
+
+    .. 
code-block:: python + metadata.add_table('sessions', data=tables['sessions'], primary_key='session_id', + parent='users', foreign_key='user_id') -This can either be provided as a python `dict` object or as a JSON file, and it -mush have the following schema: +The ``foreign_key`` field is optional, and can be skipped when the name of the child foreign +key field is exactly the same as the parent primary key field. + +Add a table specifying field properties +*************************************** + +There are situations where the ``Metadata`` analysis is not able to figure out +some data types or subtypes, or to deduce some properties of the field such as the +datetime format. + +In these situations, you can pass a dictionary with the exact metadata of those fields, +which will overwrite the deductions from the analysis process. + +In this next example, you will be adding a ``transactions`` table, which is related to +the previous ``sessions`` table, and contains a ``datetime`` field which needs to have +the datetime format specified. + + .. code-block:: python + + transactions_fields = { + 'timestamp': { + 'type': 'datetime', + 'format': '%Y-%m-%d' + } + } + metadata.add_table('transactions', data=tables['transactions'], + fields_metadata=transactions_fields, + primary_key='transaction_id', parent='sessions') + +.. note:: When analyzing an integer column that also has null values in it, the type will + be correct, ``numerical``, but the subtype will be mistakenly set as ``float``. + This can be fixed by passing the ``integer`` subtype. + + +Store your Metadata in a JSON file +********************************* + +Once you have finished configuring your ``Metadata`` instance, you can use it with ``SDV``. + +However, in some occasions you will want to store it as a JSON file, so you do not need to +configure it again the next time that you want to work on this dataset. 
+ +This can be esily done using the ``to_json`` method of your ``Metadata`` instance, passing +it the path and name of the file where you want your JSON metadata stored. + + .. code-block:: python + + metadata.to_json('paht/to/metadata.json') + +This will create a file with this contents: + + .. code-block:: json + + { + "tables": { + "users": { + "primary_key": "user_id", + "fields": { + "user_id": { + "type": "id", + "subtype": "integer" + }, + "country": { + "type": "categorical" + }, + "gender": { + "type": "categorical" + }, + "age": { + "type": "numerical", + "subtype": "integer" + } + } + }, + "sessions": { + "primary_key": "session_id", + "fields": { + "session_id": { + "type": "id", + "subtype": "integer" + }, + "user_id": { + "ref": { + "field": "user_id", + "table": "users" + }, + "type": "id", + "subtype": "integer" + }, + "device": { + "type": "categorical" + }, + "os": { + "type": "categorical" + } + } + }, + "transactions": { + "primary_key": "transaction_id", + "fields": { + "transaction_id": { + "type": "id", + "subtype": "integer" + }, + "session_id": { + "ref": { + "field": "session_id", + "table": "sessions" + }, + "type": "id", + "subtype": "integer" + }, + "timestamp": { + "type": "datetime", + "format": "%Y-%m-%d" + }, + "amount": { + "type": "numerical", + "subtype": "float" + }, + "approved": { + "type": "boolean" + } + } + } + } + } + +Later on, you can recover your ``Metadata`` by passing the path to your ``metadata.json`` file +as an argument when creating a new ``Metadata`` instance: + + .. code-block:: python + + metadata = Metadata('metadata.json') + + +Metadata Schema +--------------- + +This section explains the format of the metadata JSON file. Top Level -^^^^^^^^^ +--------- At the topmost level of the **Metadata** dictionary, there is only one element: :Tables: - List of tables in the dataset, each one represented as a subdocument. 
+ Mapping of tables in the dataset, each one represented as a subdocument, with + the table name as the corresponding key. Table -^^^^^ +----- -A node ``table`` should be made for each table in our dataset. It contains the configuration on +A node ``table`` should be made for each table in your dataset. It contains the configuration on how to handle this table. It has the following elements: .. code-block:: python - "tables": [ - { - "fields": [...], - "headers": true, - "name": "users", + "tables": { + "users": { + "fields": {...}, "path": "users.csv", "primary_key": "user_id" }, ... - ] + } :Fields: - List of fields of the table. - -:Headers: - Whether or not load the headers from the csv file. This can be skipped if the - data is being passed as ``pandas.DataFrames``. + Mapping of fields in the table. :Name: Name of the table. @@ -104,76 +260,75 @@ how to handle this table. It has the following elements: Field details -^^^^^^^^^^^^^ +------------- -Each field within a table needs to have its name, its type and sometimes its subtype -specified. +Each field within a table needs to have its type specified, +Additionally, some field types need additional details, such as the subtype or +other properties. 
The available types and subtypes are in this table: -+---------------+---------------+ -| Type | Subtype | -+===============+===============+ -| numerical | integer | -+---------------+---------------+ -| numerical | float | -+---------------+---------------+ -| datetime | | -+---------------+---------------+ -| categorical | | -+---------------+---------------+ -| boolean | | -+---------------+---------------+ -| id | integer | -+---------------+---------------+ -| id | string | -+---------------+---------------+ ++---------------+---------------+-----------------------+ +| Type | Subtype | Additional Properties | ++===============+===============+=======================+ +| numerical | integer | integer | ++---------------+---------------+-----------------------+ +| numerical | float | float | ++---------------+---------------+-----------------------+ +| datetime | | format | ++---------------+---------------+-----------------------+ +| categorical | | pii, pii_category | ++---------------+---------------+-----------------------+ +| boolean | | | ++---------------+---------------+-----------------------+ +| id | integer | ref | ++---------------+---------------+-----------------------+ +| id | string | ref, regex | ++---------------+---------------+-----------------------+ .. code-block:: python - "tables": [{ - "fields": [ - { - "name": "country", - "type": "categorical" + "tables": { + "users": { + "fields": { + "country": { + "type": "categorical" + }, + ... }, ... - ], + }, ... - }] - -:Name: - Name of the field. + } :Type: The type of the field. -:Subtype: - Optional. The subtype of the field. - Datetime fields -""""""""""""""" +*************** For ``datetime`` types, a ``format`` key should be included containing the date format using `strftime`_ format. .. 
code-block:: python - "tables": [{ - "fields": [ - { - "name": "timestamp", - "type": "datetime", - "format": "%Y-%m-%d" + "tables": { + "transactions": { + "fields": { + "timestamp": { + "type": "datetime", + "format": "%Y-%m-%d" + }, + ... }, ... - ], + }, ... - }] + } -Categorical fields ( Data anonymization) -"""""""""""""""""""""""""""""""""""""""" +Categorical fields (Data anonymization) +**************************************** For ``categorical`` types, there is an option to anonymize data labeled as Personally Identifiable Information, ``pii``, but keeping its statistical properties. To anonymize a field, you should use @@ -181,18 +336,20 @@ the following keys. .. code-block:: python - 'tables': [{ - 'fields': [ - { - 'name': 'social_scurity_number', - 'type': 'categorical', - 'pii': True, - 'pii_category': 'ssn' + "tables": { + "users": { + "fields": { + "social_security_number": { + "type": "categorical", + "pii": True, + "pii_category": "ssn" + }, + ... }, ... - ], + }, ... - }] + } The most common supported values of ``pii_category`` are in the following table, but any value supported by faker can be used: @@ -220,62 +377,69 @@ For a full list of available categories please check the `Faker documentation si the type instead of only the string: ``'pii_category': ['credict_card_number', 'visa']`` Primary key fields -"""""""""""""""""" +****************** If a field is specified as a ``primary_key`` of the table, then the field must be of type ``id``: .. code-block:: python - "tables": [{ - "fields": [ - { - "name": "user_id", - "type": "id" + "tables": { + "users": { + "fields": { + "user_id": { + "name": "user_id" + }, + ... }, ... - ], + }, ... - }] + } If the subtype of the primary key is integer, an optional regular expression can be passed to generate keys that match it: .. 
code-block:: python - "tables": [{ - "fields": [ - { - "name": "user_id", - "type": "id", - "subtype": "string", - "regex": "[a-zA-Z]{10}" + "tables": { + "users": { + "fields": { + "user_id": { + "name": "user_id", + "type": "id", + "subtype": "string", + "regex": "[a-zA-Z]{10}" + }, + ... }, ... - ], + }, ... - }] + } Foreign key fields -"""""""""""""""""" +****************** If a field is a foreign key to another table, then it has to also be of type ``id``, and define define a relationship using the ``ref`` field: .. code-block:: python - "tables": [{ - "fields": [ - { - "name": "user_id", - "ref": { - "field": "user_id", - "table": "users" + "tables": { + "sessions": { + "fields": { + "user_id": { + "type": "id" + "ref": { + "field": "user_id", + "table": "users" + }, }, - "type": "id" + ... }, ... - ], + }, ... }] From d2647cb5bf8776b1c423b3d8a7c72d95ea622844 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 25 Nov 2019 16:38:33 +0100 Subject: [PATCH 16/16] Fix README docs and return Metadata in load_demo --- README.md | 50 +-- docs/metadata.rst | 2 +- examples/0. Quickstart - README.ipynb | 318 ++++++++++++++++++ ...uickstart - Single Table - In Memory.ipynb | 84 ++--- .... Quickstart - Single Table - Census.ipynb | 160 ++++----- .../3. Quickstart - Multitable - Files.ipynb | 134 ++++---- ...ymization.ipynb => 4. Anonymization.ipynb} | 14 +- ...5. Generate Metadata from Dataframes.ipynb | 12 +- sdv/demo.py | 4 +- tests/integration/test_metadata.py | 8 +- 10 files changed, 532 insertions(+), 254 deletions(-) create mode 100644 examples/0. Quickstart - README.ipynb rename examples/{4. Quickstart - Anonymization.ipynb => 4. Anonymization.ipynb} (95%) diff --git a/README.md b/README.md index 64c304c01..83b820b4c 100644 --- a/README.md +++ b/README.md @@ -72,55 +72,13 @@ metadata, tables = load_demo(metadata=True) This will return two objects: -1. A `metadata` dictionary with all the information that **SDV** needs to know about the dataset: +1. 
A `Metadata` object with all the information that **SDV** needs to know about the dataset. -``` -{ - "tables": [ - { - "fields": [ - {"name": "user_id", "type": "id"}, - {"name": "country", "type": "categorical"}, - {"name": "gender", "type": "categorical"}, - {"name": "age", "type": "numerical", "subtype": "integer"} - ], - "name": "users", - "primary_key": "user_id" - }, - { - "fields": [ - {"name": "session_id", "type": "id"}, - {"name": "user_id", "type": "id", "ref": { - "field": "user_id", "table": "users"}, - }, - {"name": "device", "type": "categorical"}, - {"name": "os", "type": "categorical"} - ], - "name": "sessions", - "primary_key": "session_id" - }, - { - "fields": [ - {"name": "transaction_id", "type": "id"}, - {"name": "session_id", "type": "id", "ref": { - "field": "session_id", "table": "sessions"}, - }, - {"name": "timestamp", "format": "%Y-%m-%d", "type": "datetime"}, - {"name": "amount", "type": "numerical", "subtype": "float"}, - {"name": "approved", "type": "boolean"} - ], - "name": "transactions", - "primary_key": "transaction_id" - } - ] -} -``` - -For more details about the Metadata format, please refer to [the corresponding section -of the documentation](https://hdi-project.github.io/SDV/metadata.html) +For more details about how to build the `Metadata` for your own dataset, please refer to the +[Metadata](https://hdi-project.github.io/SDV/metadata.html) section of the documentation. 2. A dictionary containing three `pandas.DataFrames` with the tables described in the -metadata dictionary. +metadata object. The returned objects contain the following information: diff --git a/docs/metadata.rst b/docs/metadata.rst index 9edf30004..c9f4d8607 100644 --- a/docs/metadata.rst +++ b/docs/metadata.rst @@ -110,7 +110,7 @@ the datetime format specified. Store your Metadata in a JSON file -********************************* +********************************** Once you have finished configuring your ``Metadata`` instance, you can use it with ``SDV``. 
diff --git a/examples/0. Quickstart - README.ipynb b/examples/0. Quickstart - README.ipynb new file mode 100644 index 000000000..8aed26801 --- /dev/null +++ b/examples/0. Quickstart - README.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sdv import load_demo" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "metadata, tables = load_demo(metadata=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'tables': {'users': {'primary_key': 'user_id',\n", + " 'fields': {'user_id': {'type': 'id', 'subtype': 'integer'},\n", + " 'country': {'type': 'categorical'},\n", + " 'gender': {'type': 'categorical'},\n", + " 'age': {'type': 'numerical', 'subtype': 'integer'}}},\n", + " 'sessions': {'primary_key': 'session_id',\n", + " 'fields': {'session_id': {'type': 'id', 'subtype': 'integer'},\n", + " 'user_id': {'ref': {'field': 'user_id', 'table': 'users'},\n", + " 'type': 'id',\n", + " 'subtype': 'integer'},\n", + " 'device': {'type': 'categorical'},\n", + " 'os': {'type': 'categorical'}}},\n", + " 'transactions': {'primary_key': 'transaction_id',\n", + " 'fields': {'transaction_id': {'type': 'id', 'subtype': 'integer'},\n", + " 'session_id': {'ref': {'field': 'session_id', 'table': 'sessions'},\n", + " 'type': 'id',\n", + " 'subtype': 'integer'},\n", + " 'timestamp': {'type': 'datetime', 'format': '%Y-%m-%d'},\n", + " 'amount': {'type': 'numerical', 'subtype': 'float'},\n", + " 'approved': {'type': 'boolean'}}}}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metadata.to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'users': user_id country gender age\n", + " 0 0 USA M 34\n", + " 1 
1 UK F 23\n", + " 2 2 ES None 44\n", + " 3 3 UK M 22\n", + " 4 4 USA F 54\n", + " 5 5 DE M 57\n", + " 6 6 BG F 45\n", + " 7 7 ES None 41\n", + " 8 8 FR F 23\n", + " 9 9 UK None 30,\n", + " 'sessions': session_id user_id device os\n", + " 0 0 0 mobile android\n", + " 1 1 1 tablet ios\n", + " 2 2 1 tablet android\n", + " 3 3 2 mobile android\n", + " 4 4 4 mobile ios\n", + " 5 5 5 mobile android\n", + " 6 6 6 mobile ios\n", + " 7 7 6 tablet ios\n", + " 8 8 6 mobile ios\n", + " 9 9 8 tablet ios,\n", + " 'transactions': transaction_id session_id timestamp amount approved\n", + " 0 0 0 2019-01-01 12:34:32 100.0 True\n", + " 1 1 0 2019-01-01 12:42:21 55.3 True\n", + " 2 2 1 2019-01-07 17:23:11 79.5 True\n", + " 3 3 3 2019-01-10 11:08:57 112.1 False\n", + " 4 4 5 2019-01-10 21:54:08 110.0 False\n", + " 5 5 5 2019-01-11 11:21:20 76.3 True\n", + " 6 6 7 2019-01-22 14:44:10 89.5 True\n", + " 7 7 8 2019-01-23 10:14:09 132.1 False\n", + " 8 8 9 2019-01-27 16:09:17 68.0 True\n", + " 9 9 9 2019-01-29 12:10:48 99.9 True}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tables" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2019-11-25 16:35:27,509 - INFO - modeler - Modeling users\n", + "2019-11-25 16:35:27,510 - INFO - metadata - Loading transformer CategoricalTransformer for field country\n", + "2019-11-25 16:35:27,511 - INFO - metadata - Loading transformer CategoricalTransformer for field gender\n", + "2019-11-25 16:35:27,512 - INFO - metadata - Loading transformer NumericalTransformer for field age\n", + "2019-11-25 16:35:27,539 - INFO - modeler - Modeling sessions\n", + "2019-11-25 16:35:27,539 - INFO - metadata - Loading transformer CategoricalTransformer for field device\n", + "2019-11-25 16:35:27,540 - INFO - metadata - Loading transformer CategoricalTransformer for field os\n", + "2019-11-25 
16:35:27,551 - INFO - modeler - Modeling transactions\n", + "2019-11-25 16:35:27,552 - INFO - metadata - Loading transformer DatetimeTransformer for field timestamp\n", + "2019-11-25 16:35:27,552 - INFO - metadata - Loading transformer NumericalTransformer for field amount\n", + "2019-11-25 16:35:27,552 - INFO - metadata - Loading transformer BooleanTransformer for field approved\n", + "/home/xals/.virtualenvs/SDV/lib/python3.6/site-packages/pandas/core/frame.py:7143: RuntimeWarning: Degrees of freedom <= 0 for slice\n", + " baseCov = np.cov(mat.T)\n", + "/home/xals/.virtualenvs/SDV/lib/python3.6/site-packages/numpy/lib/function_base.py:2451: RuntimeWarning: divide by zero encountered in true_divide\n", + " c *= np.true_divide(1, fact)\n", + "/home/xals/.virtualenvs/SDV/lib/python3.6/site-packages/numpy/lib/function_base.py:2451: RuntimeWarning: invalid value encountered in multiply\n", + " c *= np.true_divide(1, fact)\n", + "2019-11-25 16:35:28,299 - INFO - modeler - Modeling Complete\n" + ] + } + ], + "source": [ + "from sdv import SDV\n", + "\n", + "sdv = SDV()\n", + "sdv.fit(metadata, tables)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'users': user_id country gender age\n", + " 0 0 FR M 41\n", + " 1 1 UK M 48\n", + " 2 2 FR F 61\n", + " 3 3 ES F 48\n", + " 4 4 USA NaN 14\n", + " 5 5 UK NaN 40\n", + " 6 6 ES NaN 37\n", + " 7 7 DE NaN 52\n", + " 8 8 ES F 45\n", + " 9 9 ES F 46,\n", + " 'sessions': session_id user_id device os\n", + " 0 0 0 mobile android\n", + " 1 1 0 mobile android\n", + " 2 2 2 mobile ios\n", + " 3 3 2 mobile ios\n", + " 4 4 4 mobile ios\n", + " 5 5 7 mobile android\n", + " 6 6 8 mobile android\n", + " 7 7 9 mobile ios\n", + " 8 8 9 mobile ios,\n", + " 'transactions': transaction_id session_id timestamp amount \\\n", + " 0 0 0 2019-01-13 09:04:43.841495296 104.653370 \n", + " 1 1 1 2019-01-13 09:05:10.478101248 110.258541 \n", + " 2 2 
1 2019-01-13 09:05:10.478101248 111.537307 \n", + " 3 3 0 2019-01-13 09:04:43.834843136 107.559958 \n", + " 4 4 1 2019-01-13 09:05:10.478102528 105.894175 \n", + " 5 5 1 2019-01-13 09:05:10.478103296 91.163287 \n", + " 6 6 0 2019-01-13 09:04:43.865285888 89.776084 \n", + " 7 7 1 2019-01-13 09:05:10.478100992 104.319781 \n", + " 8 8 1 2019-01-13 09:05:10.478101248 110.948264 \n", + " 9 9 0 2019-01-13 09:04:43.851251712 97.959864 \n", + " 10 10 1 2019-01-13 09:05:10.478102528 90.812841 \n", + " 11 11 1 2019-01-13 09:05:10.478102528 104.189625 \n", + " 12 12 0 2019-01-13 09:04:43.840671744 101.818085 \n", + " 13 13 1 2019-01-13 09:05:10.478102016 100.180457 \n", + " 14 14 1 2019-01-13 09:05:10.478101248 108.841044 \n", + " 15 15 4 2019-01-08 15:20:48.908520960 80.647928 \n", + " 16 16 0 2019-01-13 09:04:43.839424512 102.095410 \n", + " 17 17 1 2019-01-13 09:05:10.478103040 80.479839 \n", + " 18 18 1 2019-01-13 09:05:10.478102272 93.136450 \n", + " 19 19 4 2019-01-08 15:20:48.644995584 91.889224 \n", + " 20 20 0 2019-01-13 09:04:43.854621184 96.178556 \n", + " 21 21 1 2019-01-13 09:05:10.478101248 112.241266 \n", + " 22 22 1 2019-01-13 09:05:10.478102016 103.893851 \n", + " 23 23 4 2019-01-08 15:20:48.357267968 97.523135 \n", + " 24 24 0 2019-01-13 09:04:43.830457344 111.319295 \n", + " 25 25 1 2019-01-13 09:05:10.478102528 91.407533 \n", + " 26 26 1 2019-01-13 09:05:10.478102016 98.601993 \n", + " 27 27 4 2019-01-08 15:20:48.778796288 87.142150 \n", + " 28 28 5 2019-01-14 16:49:33.582022144 110.427982 \n", + " 29 29 5 2019-01-14 16:49:33.581722624 105.718311 \n", + " 30 30 5 2019-01-14 16:49:33.581889536 107.819611 \n", + " 31 31 0 2019-01-13 09:04:43.841137408 103.881774 \n", + " 32 32 1 2019-01-13 09:05:10.478102528 94.277283 \n", + " 33 33 1 2019-01-13 09:05:10.478102272 93.259689 \n", + " 34 34 4 2019-01-08 15:20:48.331752192 101.205109 \n", + " 35 35 5 2019-01-14 16:49:33.581950720 94.527853 \n", + " 36 36 5 2019-01-14 16:49:33.582152960 97.431130 \n", + " 37 37 
5 2019-01-14 16:49:33.581928448 94.727554 \n", + " 38 38 6 2019-01-11 20:59:52.497328640 108.141832 \n", + " 39 39 6 2019-01-11 20:59:52.497328640 113.729753 \n", + " 40 40 0 2019-01-13 09:04:43.829956096 109.721716 \n", + " 41 41 1 2019-01-13 09:05:10.478102016 95.369571 \n", + " 42 42 1 2019-01-13 09:05:10.478103040 88.588423 \n", + " 43 43 4 2019-01-08 15:20:48.729502208 87.429172 \n", + " 44 44 5 2019-01-14 16:49:33.581812992 85.816540 \n", + " 45 45 5 2019-01-14 16:49:33.581463552 106.088556 \n", + " 46 46 5 2019-01-14 16:49:33.581943552 96.960598 \n", + " 47 47 6 2019-01-11 20:59:52.497328640 113.868324 \n", + " 48 48 6 2019-01-11 20:59:52.497328640 111.495581 \n", + " 49 49 7 2019-01-12 13:08:16.572689152 89.807801 \n", + " 50 50 8 2019-01-12 13:08:14.085691392 95.836909 \n", + " \n", + " approved \n", + " 0 True \n", + " 1 True \n", + " 2 True \n", + " 3 True \n", + " 4 True \n", + " 5 True \n", + " 6 True \n", + " 7 True \n", + " 8 True \n", + " 9 False \n", + " 10 False \n", + " 11 True \n", + " 12 True \n", + " 13 True \n", + " 14 True \n", + " 15 False \n", + " 16 True \n", + " 17 True \n", + " 18 True \n", + " 19 True \n", + " 20 False \n", + " 21 True \n", + " 22 True \n", + " 23 False \n", + " 24 True \n", + " 25 False \n", + " 26 True \n", + " 27 True \n", + " 28 True \n", + " 29 True \n", + " 30 True \n", + " 31 True \n", + " 32 True \n", + " 33 False \n", + " 34 False \n", + " 35 True \n", + " 36 True \n", + " 37 True \n", + " 38 True \n", + " 39 True \n", + " 40 True \n", + " 41 True \n", + " 42 True \n", + " 43 True \n", + " 44 True \n", + " 45 True \n", + " 46 True \n", + " 47 True \n", + " 48 True \n", + " 49 True \n", + " 50 True }" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdv.sample_all(10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/1. Quickstart - Single Table - In Memory.ipynb b/examples/1. Quickstart - Single Table - In Memory.ipynb index 142241475..fc4f66c76 100644 --- a/examples/1. Quickstart - Single Table - In Memory.ipynb +++ b/examples/1. Quickstart - Single Table - In Memory.ipynb @@ -232,14 +232,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-22 22:10:48,396 - INFO - modeler - Modeling data\n", - "2019-11-22 22:10:48,397 - INFO - metadata - Loading transformer NumericalTransformer for field integer\n", - "2019-11-22 22:10:48,397 - INFO - metadata - Loading transformer NumericalTransformer for field float\n", - "2019-11-22 22:10:48,398 - INFO - metadata - Loading transformer CategoricalTransformer for field categorical\n", - "2019-11-22 22:10:48,398 - INFO - metadata - Loading transformer BooleanTransformer for field bool\n", - "2019-11-22 22:10:48,399 - INFO - metadata - Loading transformer NumericalTransformer for field nullable\n", - "2019-11-22 22:10:48,399 - INFO - metadata - Loading transformer DatetimeTransformer for field datetime\n", - "2019-11-22 22:10:48,450 - INFO - modeler - Modeling Complete\n" + "2019-11-25 16:35:47,751 - INFO - modeler - Modeling data\n", + "2019-11-25 16:35:47,751 - INFO - metadata - Loading transformer NumericalTransformer for field integer\n", + "2019-11-25 16:35:47,752 - INFO - metadata - Loading transformer NumericalTransformer for field float\n", + "2019-11-25 16:35:47,752 - INFO - metadata - Loading transformer CategoricalTransformer for field categorical\n", + "2019-11-25 16:35:47,753 - INFO - metadata - Loading transformer BooleanTransformer for field bool\n", + "2019-11-25 16:35:47,753 - INFO - metadata - Loading transformer NumericalTransformer for field 
nullable\n", + "2019-11-25 16:35:47,754 - INFO - metadata - Loading transformer DatetimeTransformer for field datetime\n", + "2019-11-25 16:35:47,804 - INFO - modeler - Modeling Complete\n" ] } ], @@ -291,52 +291,52 @@ " \n", " 0\n", " 0\n", - " 1\n", - " 0.155489\n", + " NaN\n", + " NaN\n", " a\n", - " False\n", - " 2.922163\n", - " 2010-01-16 23:55:00.530385152\n", + " True\n", + " 3.317395\n", + " 2009-12-22 15:49:05.094218752\n", " \n", " \n", " 1\n", " 1\n", - " 2\n", - " 0.190969\n", - " c\n", - " NaN\n", + " 3.0\n", + " 0.283517\n", + " a\n", + " False\n", " NaN\n", - " NaT\n", + " 2010-03-05 18:53:04.300039424\n", " \n", " \n", " 2\n", " 2\n", - " 1\n", - " 0.112750\n", + " 1.0\n", + " 0.065930\n", " a\n", - " True\n", - " NaN\n", - " 2010-01-13 13:30:29.267090688\n", + " False\n", + " 3.585714\n", + " 2009-12-25 23:25:25.932338176\n", " \n", " \n", " 3\n", " 3\n", - " 2\n", - " 0.217101\n", + " 2.0\n", + " 0.227167\n", " NaN\n", " False\n", - " NaN\n", - " 2010-02-09 05:40:33.603006208\n", + " 7.885740\n", + " 2010-02-08 02:28:02.741804800\n", " \n", " \n", " 4\n", " 4\n", - " 2\n", - " 0.120487\n", - " b\n", - " False\n", + " 2.0\n", + " 0.140742\n", " NaN\n", - " 2010-01-16 04:21:47.058566656\n", + " NaN\n", + " NaN\n", + " NaT\n", " \n", " \n", "\n", @@ -344,18 +344,18 @@ ], "text/plain": [ " index integer float categorical bool nullable \\\n", - "0 0 1 0.155489 a False 2.922163 \n", - "1 1 2 0.190969 c NaN NaN \n", - "2 2 1 0.112750 a True NaN \n", - "3 3 2 0.217101 NaN False NaN \n", - "4 4 2 0.120487 b False NaN \n", + "0 0 NaN NaN a True 3.317395 \n", + "1 1 3.0 0.283517 a False NaN \n", + "2 2 1.0 0.065930 a False 3.585714 \n", + "3 3 2.0 0.227167 NaN False 7.885740 \n", + "4 4 2.0 0.140742 NaN NaN NaN \n", "\n", " datetime \n", - "0 2010-01-16 23:55:00.530385152 \n", - "1 NaT \n", - "2 2010-01-13 13:30:29.267090688 \n", - "3 2010-02-09 05:40:33.603006208 \n", - "4 2010-01-16 04:21:47.058566656 " + "0 2009-12-22 15:49:05.094218752 \n", + "1 
2010-03-05 18:53:04.300039424 \n", + "2 2009-12-25 23:25:25.932338176 \n", + "3 2010-02-08 02:28:02.741804800 \n", + "4 NaT " ] }, "execution_count": 5, diff --git a/examples/2. Quickstart - Single Table - Census.ipynb b/examples/2. Quickstart - Single Table - Census.ipynb index 00c47050c..e4e1def23 100644 --- a/examples/2. Quickstart - Single Table - Census.ipynb +++ b/examples/2. Quickstart - Single Table - Census.ipynb @@ -309,23 +309,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-22 22:11:07,542 - INFO - modeler - Modeling census\n", - "2019-11-22 22:11:07,543 - INFO - metadata - Loading transformer NumericalTransformer for field age\n", - "2019-11-22 22:11:07,543 - INFO - metadata - Loading transformer CategoricalTransformer for field workclass\n", - "2019-11-22 22:11:07,544 - INFO - metadata - Loading transformer NumericalTransformer for field fnlwgt\n", - "2019-11-22 22:11:07,544 - INFO - metadata - Loading transformer CategoricalTransformer for field education\n", - "2019-11-22 22:11:07,544 - INFO - metadata - Loading transformer NumericalTransformer for field education-num\n", - "2019-11-22 22:11:07,545 - INFO - metadata - Loading transformer CategoricalTransformer for field marital-status\n", - "2019-11-22 22:11:07,545 - INFO - metadata - Loading transformer CategoricalTransformer for field occupation\n", - "2019-11-22 22:11:07,545 - INFO - metadata - Loading transformer CategoricalTransformer for field relationship\n", - "2019-11-22 22:11:07,546 - INFO - metadata - Loading transformer CategoricalTransformer for field race\n", - "2019-11-22 22:11:07,546 - INFO - metadata - Loading transformer CategoricalTransformer for field sex\n", - "2019-11-22 22:11:07,547 - INFO - metadata - Loading transformer NumericalTransformer for field capital-gain\n", - "2019-11-22 22:11:07,548 - INFO - metadata - Loading transformer NumericalTransformer for field capital-loss\n", - "2019-11-22 22:11:07,549 - INFO - metadata - Loading transformer 
NumericalTransformer for field hours-per-week\n", - "2019-11-22 22:11:07,549 - INFO - metadata - Loading transformer CategoricalTransformer for field native-country\n", - "2019-11-22 22:11:07,550 - INFO - metadata - Loading transformer CategoricalTransformer for field income\n", - "2019-11-22 22:11:07,799 - INFO - modeler - Modeling Complete\n" + "2019-11-25 16:36:01,497 - INFO - modeler - Modeling census\n", + "2019-11-25 16:36:01,498 - INFO - metadata - Loading transformer NumericalTransformer for field age\n", + "2019-11-25 16:36:01,499 - INFO - metadata - Loading transformer CategoricalTransformer for field workclass\n", + "2019-11-25 16:36:01,499 - INFO - metadata - Loading transformer NumericalTransformer for field fnlwgt\n", + "2019-11-25 16:36:01,499 - INFO - metadata - Loading transformer CategoricalTransformer for field education\n", + "2019-11-25 16:36:01,500 - INFO - metadata - Loading transformer NumericalTransformer for field education-num\n", + "2019-11-25 16:36:01,500 - INFO - metadata - Loading transformer CategoricalTransformer for field marital-status\n", + "2019-11-25 16:36:01,500 - INFO - metadata - Loading transformer CategoricalTransformer for field occupation\n", + "2019-11-25 16:36:01,501 - INFO - metadata - Loading transformer CategoricalTransformer for field relationship\n", + "2019-11-25 16:36:01,501 - INFO - metadata - Loading transformer CategoricalTransformer for field race\n", + "2019-11-25 16:36:01,501 - INFO - metadata - Loading transformer CategoricalTransformer for field sex\n", + "2019-11-25 16:36:01,502 - INFO - metadata - Loading transformer NumericalTransformer for field capital-gain\n", + "2019-11-25 16:36:01,502 - INFO - metadata - Loading transformer NumericalTransformer for field capital-loss\n", + "2019-11-25 16:36:01,502 - INFO - metadata - Loading transformer NumericalTransformer for field hours-per-week\n", + "2019-11-25 16:36:01,503 - INFO - metadata - Loading transformer CategoricalTransformer for field 
native-country\n", + "2019-11-25 16:36:01,503 - INFO - metadata - Loading transformer CategoricalTransformer for field income\n", + "2019-11-25 16:36:01,754 - INFO - modeler - Modeling Complete\n" ] } ], @@ -382,91 +382,91 @@ " \n", " \n", " 0\n", - " 54\n", + " 39\n", " Private\n", - " 4837\n", - " HS-grad\n", + " 354328\n", + " Bachelors\n", " 12\n", " Married-civ-spouse\n", - " Protective-serv\n", + " Exec-managerial\n", " Husband\n", " White\n", " Male\n", - " 1541\n", - " 20\n", - " 52\n", + " 5731\n", + " -416\n", + " 43\n", " United-States\n", - " <=50K\n", + " >50K\n", " \n", " \n", " 1\n", - " 41\n", + " 39\n", " Private\n", - " 260756\n", + " 97345\n", " Some-college\n", " 10\n", " Never-married\n", - " Transport-moving\n", - " Not-in-family\n", + " Other-service\n", + " Own-child\n", " White\n", - " Male\n", - " 1958\n", - " -656\n", - " 43\n", + " Female\n", + " 1512\n", + " -253\n", + " 40\n", " United-States\n", - " >50K\n", + " <=50K\n", " \n", " \n", " 2\n", - " 38\n", - " Private\n", - " 211042\n", - " Bachelors\n", - " 10\n", - " Never-married\n", + " 49\n", + " Self-emp-not-inc\n", + " 122831\n", + " Some-college\n", + " 9\n", + " Married-civ-spouse\n", " Sales\n", - " Own-child\n", + " Not-in-family\n", " White\n", - " Female\n", - " -2265\n", - " -107\n", - " 42\n", + " Male\n", + " -2067\n", + " 108\n", + " 44\n", " United-States\n", " <=50K\n", " \n", " \n", " 3\n", - " 45\n", - " Self-emp-not-inc\n", - " 393251\n", - " Bachelors\n", - " 10\n", + " 35\n", + " Private\n", + " 218362\n", + " Some-college\n", + " 7\n", " Never-married\n", - " Craft-repair\n", - " Husband\n", + " Adm-clerical\n", + " Own-child\n", " White\n", " Male\n", - " 5538\n", - " -128\n", - " 27\n", + " -1209\n", + " 99\n", + " 54\n", " United-States\n", " <=50K\n", " \n", " \n", " 4\n", - " 45\n", + " 13\n", " Private\n", - " 153962\n", + " 146365\n", " HS-grad\n", " 9\n", - " Never-married\n", - " Exec-managerial\n", - " Husband\n", + " Married-civ-spouse\n", + " 
Adm-clerical\n", + " Own-child\n", " White\n", - " Male\n", - " 2032\n", - " 780\n", - " 27\n", + " Female\n", + " 8987\n", + " 338\n", + " 42\n", " United-States\n", " <=50K\n", " \n", @@ -476,25 +476,25 @@ ], "text/plain": [ " age workclass fnlwgt education education-num \\\n", - "0 54 Private 4837 HS-grad 12 \n", - "1 41 Private 260756 Some-college 10 \n", - "2 38 Private 211042 Bachelors 10 \n", - "3 45 Self-emp-not-inc 393251 Bachelors 10 \n", - "4 45 Private 153962 HS-grad 9 \n", + "0 39 Private 354328 Bachelors 12 \n", + "1 39 Private 97345 Some-college 10 \n", + "2 49 Self-emp-not-inc 122831 Some-college 9 \n", + "3 35 Private 218362 Some-college 7 \n", + "4 13 Private 146365 HS-grad 9 \n", "\n", - " marital-status occupation relationship race sex \\\n", - "0 Married-civ-spouse Protective-serv Husband White Male \n", - "1 Never-married Transport-moving Not-in-family White Male \n", - "2 Never-married Sales Own-child White Female \n", - "3 Never-married Craft-repair Husband White Male \n", - "4 Never-married Exec-managerial Husband White Male \n", + " marital-status occupation relationship race sex \\\n", + "0 Married-civ-spouse Exec-managerial Husband White Male \n", + "1 Never-married Other-service Own-child White Female \n", + "2 Married-civ-spouse Sales Not-in-family White Male \n", + "3 Never-married Adm-clerical Own-child White Male \n", + "4 Married-civ-spouse Adm-clerical Own-child White Female \n", "\n", " capital-gain capital-loss hours-per-week native-country income \n", - "0 1541 20 52 United-States <=50K \n", - "1 1958 -656 43 United-States >50K \n", - "2 -2265 -107 42 United-States <=50K \n", - "3 5538 -128 27 United-States <=50K \n", - "4 2032 780 27 United-States <=50K " + "0 5731 -416 43 United-States >50K \n", + "1 1512 -253 40 United-States <=50K \n", + "2 -2067 108 44 United-States <=50K \n", + "3 -1209 99 54 United-States <=50K \n", + "4 8987 338 42 United-States <=50K " ] }, "execution_count": 7, diff --git a/examples/3. 
Quickstart - Multitable - Files.ipynb b/examples/3. Quickstart - Multitable - Files.ipynb index cad162e9b..351063e61 100644 --- a/examples/3. Quickstart - Multitable - Files.ipynb +++ b/examples/3. Quickstart - Multitable - Files.ipynb @@ -11,21 +11,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-22 22:11:44,765 - INFO - modeler - Modeling customers\n", - "2019-11-22 22:11:44,765 - INFO - metadata - Loading table customers\n", - "2019-11-22 22:11:44,772 - INFO - metadata - Loading transformer CategoricalTransformer for field cust_postal_code\n", - "2019-11-22 22:11:44,772 - INFO - metadata - Loading transformer NumericalTransformer for field phone_number1\n", - "2019-11-22 22:11:44,773 - INFO - metadata - Loading transformer NumericalTransformer for field credit_limit\n", - "2019-11-22 22:11:44,773 - INFO - metadata - Loading transformer CategoricalTransformer for field country\n", - "2019-11-22 22:11:44,790 - INFO - modeler - Modeling orders\n", - "2019-11-22 22:11:44,791 - INFO - metadata - Loading table orders\n", - "2019-11-22 22:11:44,795 - INFO - metadata - Loading transformer NumericalTransformer for field order_total\n", - "2019-11-22 22:11:44,799 - INFO - modeler - Modeling order_items\n", - "2019-11-22 22:11:44,799 - INFO - metadata - Loading table order_items\n", - "2019-11-22 22:11:44,804 - INFO - metadata - Loading transformer CategoricalTransformer for field product_id\n", - "2019-11-22 22:11:44,805 - INFO - metadata - Loading transformer NumericalTransformer for field unit_price\n", - "2019-11-22 22:11:44,805 - INFO - metadata - Loading transformer NumericalTransformer for field quantity\n", - "2019-11-22 22:11:45,470 - INFO - modeler - Modeling Complete\n" + "2019-11-25 16:36:09,119 - INFO - modeler - Modeling customers\n", + "2019-11-25 16:36:09,120 - INFO - metadata - Loading table customers\n", + "2019-11-25 16:36:09,126 - INFO - metadata - Loading transformer CategoricalTransformer for field cust_postal_code\n", + 
"2019-11-25 16:36:09,127 - INFO - metadata - Loading transformer NumericalTransformer for field phone_number1\n", + "2019-11-25 16:36:09,127 - INFO - metadata - Loading transformer NumericalTransformer for field credit_limit\n", + "2019-11-25 16:36:09,128 - INFO - metadata - Loading transformer CategoricalTransformer for field country\n", + "2019-11-25 16:36:09,143 - INFO - modeler - Modeling orders\n", + "2019-11-25 16:36:09,143 - INFO - metadata - Loading table orders\n", + "2019-11-25 16:36:09,146 - INFO - metadata - Loading transformer NumericalTransformer for field order_total\n", + "2019-11-25 16:36:09,150 - INFO - modeler - Modeling order_items\n", + "2019-11-25 16:36:09,150 - INFO - metadata - Loading table order_items\n", + "2019-11-25 16:36:09,155 - INFO - metadata - Loading transformer CategoricalTransformer for field product_id\n", + "2019-11-25 16:36:09,156 - INFO - metadata - Loading transformer NumericalTransformer for field unit_price\n", + "2019-11-25 16:36:09,156 - INFO - metadata - Loading transformer NumericalTransformer for field quantity\n", + "2019-11-25 16:36:09,802 - INFO - modeler - Modeling Complete\n" ] } ], @@ -45,9 +45,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-22 22:11:45,521 - INFO - metadata - Loading table customers\n", - "2019-11-22 22:11:45,524 - INFO - metadata - Loading table orders\n", - "2019-11-22 22:11:45,526 - INFO - metadata - Loading table order_items\n" + "2019-11-25 16:36:09,896 - INFO - metadata - Loading table customers\n", + "2019-11-25 16:36:09,899 - INFO - metadata - Loading table orders\n", + "2019-11-25 16:36:09,903 - INFO - metadata - Loading table order_items\n" ] } ], @@ -96,41 +96,41 @@ " \n", " 0\n", " 0\n", - " 6096\n", - " 5887096836\n", - " 822\n", - " FRANCE\n", + " 63145\n", + " 7731947011\n", + " 1468\n", + " UK\n", " \n", " \n", " 1\n", " 1\n", - " 11371\n", - " 6798799259\n", - " 901\n", + " 63145\n", + " 9151838659\n", + " 662\n", " UK\n", " \n", " \n", " 2\n", " 2\n", " 
11371\n", - " 5947789567\n", - " 535\n", + " 6317378496\n", + " 1260\n", " SPAIN\n", " \n", " \n", " 3\n", " 3\n", - " 20166\n", - " 6100829678\n", - " 1016\n", - " CANADA\n", + " 11371\n", + " 5747702712\n", + " 1001\n", + " UK\n", " \n", " \n", " 4\n", " 4\n", " 63145\n", - " 8061087361\n", - " 917\n", + " 7374880338\n", + " 1446\n", " US\n", " \n", " \n", @@ -139,11 +139,11 @@ ], "text/plain": [ " customer_id cust_postal_code phone_number1 credit_limit country\n", - "0 0 6096 5887096836 822 FRANCE\n", - "1 1 11371 6798799259 901 UK\n", - "2 2 11371 5947789567 535 SPAIN\n", - "3 3 20166 6100829678 1016 CANADA\n", - "4 4 63145 8061087361 917 US" + "0 0 63145 7731947011 1468 UK\n", + "1 1 63145 9151838659 662 UK\n", + "2 2 11371 6317378496 1260 SPAIN\n", + "3 3 11371 5747702712 1001 UK\n", + "4 4 63145 7374880338 1446 US" ] }, "execution_count": 3, @@ -287,31 +287,31 @@ " 0\n", " 0\n", " 0\n", - " 1201\n", + " 1316\n", " \n", " \n", " 1\n", " 1\n", - " 0\n", - " 1963\n", + " 1\n", + " 1092\n", " \n", " \n", " 2\n", " 2\n", " 1\n", - " 740\n", + " 1562\n", " \n", " \n", " 3\n", " 3\n", - " 1\n", - " 978\n", + " 2\n", + " 2676\n", " \n", " \n", " 4\n", " 4\n", " 3\n", - " 897\n", + " 587\n", " \n", " \n", "\n", @@ -319,11 +319,11 @@ ], "text/plain": [ " order_id customer_id order_total\n", - "0 0 0 1201\n", - "1 1 0 1963\n", - "2 2 1 740\n", - "3 3 1 978\n", - "4 4 3 897" + "0 0 0 1316\n", + "1 1 1 1092\n", + "2 2 1 1562\n", + "3 3 2 2676\n", + "4 4 3 587" ] }, "execution_count": 5, @@ -457,41 +457,41 @@ " 0\n", " 0\n", " 0\n", - " 9\n", - " 58\n", - " 1\n", + " 10\n", + " 104\n", + " 4\n", " \n", " \n", " 1\n", " 1\n", " 0\n", " 10\n", - " 79\n", - " 4\n", + " 65\n", + " 2\n", " \n", " \n", " 2\n", " 2\n", " 0\n", - " 6\n", - " -4\n", - " 2\n", + " 9\n", + " 46\n", + " 0\n", " \n", " \n", " 3\n", " 3\n", " 0\n", " 10\n", - " 108\n", - " 5\n", + " 61\n", + " 2\n", " \n", " \n", " 4\n", " 4\n", " 0\n", " 10\n", - " 4\n", - " 2\n", + " 97\n", + " 3\n", " \n", " \n", 
"\n", @@ -499,11 +499,11 @@ ], "text/plain": [ " order_item_id order_id product_id unit_price quantity\n", - "0 0 0 9 58 1\n", - "1 1 0 10 79 4\n", - "2 2 0 6 -4 2\n", - "3 3 0 10 108 5\n", - "4 4 0 10 4 2" + "0 0 0 10 104 4\n", + "1 1 0 10 65 2\n", + "2 2 0 9 46 0\n", + "3 3 0 10 61 2\n", + "4 4 0 10 97 3" ] }, "execution_count": 7, diff --git a/examples/4. Quickstart - Anonymization.ipynb b/examples/4. Anonymization.ipynb similarity index 95% rename from examples/4. Quickstart - Anonymization.ipynb rename to examples/4. Anonymization.ipynb index a397d7e03..b5404eddd 100644 --- a/examples/4. Quickstart - Anonymization.ipynb +++ b/examples/4. Anonymization.ipynb @@ -185,13 +185,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "2019-11-22 22:11:58,851 - INFO - modeler - Modeling anonymized\n", - "2019-11-22 22:11:58,852 - INFO - metadata - Loading transformer CategoricalTransformer for field name\n", - "2019-11-22 22:11:58,852 - INFO - metadata - Loading transformer CategoricalTransformer for field credit_card_number\n", - "2019-11-22 22:11:58,901 - INFO - modeler - Modeling normal\n", - "2019-11-22 22:11:58,901 - INFO - metadata - Loading transformer CategoricalTransformer for field name\n", - "2019-11-22 22:11:58,902 - INFO - metadata - Loading transformer CategoricalTransformer for field credit_card_number\n", - "2019-11-22 22:11:58,922 - INFO - modeler - Modeling Complete\n" + "2019-11-25 16:36:26,188 - INFO - modeler - Modeling anonymized\n", + "2019-11-25 16:36:26,189 - INFO - metadata - Loading transformer CategoricalTransformer for field name\n", + "2019-11-25 16:36:26,189 - INFO - metadata - Loading transformer CategoricalTransformer for field credit_card_number\n", + "2019-11-25 16:36:26,238 - INFO - modeler - Modeling normal\n", + "2019-11-25 16:36:26,238 - INFO - metadata - Loading transformer CategoricalTransformer for field name\n", + "2019-11-25 16:36:26,239 - INFO - metadata - Loading transformer CategoricalTransformer for field 
credit_card_number\n", + "2019-11-25 16:36:26,260 - INFO - modeler - Modeling Complete\n" ] } ], diff --git a/examples/5. Generate Metadata from Dataframes.ipynb b/examples/5. Generate Metadata from Dataframes.ipynb index 8285cf460..ed8621da1 100644 --- a/examples/5. Generate Metadata from Dataframes.ipynb +++ b/examples/5. Generate Metadata from Dataframes.ipynb @@ -15,7 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "metadata, tables = load_demo()" + "metadata, tables = load_demo(metadata=True)" ] }, { @@ -54,7 +54,7 @@ } ], "source": [ - "metadata" + "metadata.to_dict()" ] }, { @@ -177,12 +177,12 @@ " 'primary_key': 'session_id'},\n", " 'transactions': {'fields': {'timestamp': {'type': 'datetime',\n", " 'format': '%Y-%m-%d'},\n", - " 'amount': {'type': 'numerical', 'subtype': 'float'},\n", - " 'transaction_id': {'type': 'id', 'subtype': 'integer'},\n", " 'session_id': {'type': 'id',\n", " 'subtype': 'integer',\n", " 'ref': {'table': 'sessions', 'field': 'session_id'}},\n", - " 'approved': {'type': 'boolean'}},\n", + " 'approved': {'type': 'boolean'},\n", + " 'transaction_id': {'type': 'id', 'subtype': 'integer'},\n", + " 'amount': {'type': 'numerical', 'subtype': 'float'}},\n", " 'primary_key': 'transaction_id'}}}" ] }, @@ -212,7 +212,7 @@ } ], "source": [ - "new_meta.to_dict() == metadata" + "new_meta.to_dict() == metadata.to_dict()" ] }, { diff --git a/sdv/demo.py b/sdv/demo.py index 9bdcc0f79..41bf9f865 100644 --- a/sdv/demo.py +++ b/sdv/demo.py @@ -1,5 +1,7 @@ import pandas as pd +from sdv.metadata import Metadata + DEMO_METADATA = { 'tables': { 'users': { @@ -123,6 +125,6 @@ def load_demo(metadata=False): } if metadata: - return DEMO_METADATA.copy(), tables + return Metadata(DEMO_METADATA), tables return tables diff --git a/tests/integration/test_metadata.py b/tests/integration/test_metadata.py index 289788028..242337aaf 100644 --- a/tests/integration/test_metadata.py +++ b/tests/integration/test_metadata.py @@ -1,4 +1,5 @@ from sdv import Metadata, 
load_demo +from sdv.demo import DEMO_METADATA def test_build_demo_metadata_from_tables(): @@ -7,7 +8,7 @@ def test_build_demo_metadata_from_tables(): Then compare the built metadata with the demo one to make sure that they are the same. """ - metadata, tables = load_demo(metadata=True) + tables = load_demo(metadata=False) new_meta = Metadata() new_meta.add_table('users', data=tables['users'], primary_key='user_id') @@ -23,7 +24,7 @@ def test_build_demo_metadata_from_tables(): fields_metadata=transactions_fields, primary_key='transaction_id', parent='sessions') - assert metadata == new_meta.to_dict() + assert DEMO_METADATA == new_meta.to_dict() def test_build_demo_metadata_without_tables(): @@ -53,5 +54,4 @@ def test_build_demo_metadata_without_tables(): metadata.set_primary_key('transactions', 'transaction_id') metadata.add_relationship('sessions', 'transactions') - demo_metadata = load_demo(metadata=True)[0] - assert demo_metadata == metadata.to_dict() + assert DEMO_METADATA == metadata.to_dict()