Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Variable types #10 #12

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ pip uninstall autonormalize

### `auto_entityset`
```shell
auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None)
auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None, variable_types=None)
```
Creates a normalized entityset from a dataframe.

Expand All @@ -49,8 +49,10 @@ Creates a normalized entityset from a dataframe.

* `name` (str, optional) : the name of created EntitySet

* `time_index` (str, optional) : name of time column in the dataframe.
* `time_index` (str, optional) : name of time column in the dataframe

* `variable_types` (dict[str -> Variable], optional) : Keys are variable ids and values are variable types. Used to initialize an entity's store.

**Returns:**

* `entityset` (ft.EntitySet) : created entity set
Expand Down Expand Up @@ -85,7 +87,7 @@ Normalizes dataframe based on the dependencies given. Keys for the newly created
### `make_entityset`

```shell
make_entityset(df, dependencies, name=None, time_index=None)
make_entityset(df, dependencies, name=None, time_index=None, variable_types=None)
```
Creates a normalized EntitySet from dataframe based on the dependencies given. Keys are chosen in the same fashion as for `normalize_dataframe` and a new index will be created if any key has more than a single attribute.

Expand Down
27 changes: 20 additions & 7 deletions autonormalize/autonormalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def normalize_dataframe(df, dependencies):
return depdf.return_dfs()


def make_entityset(df, dependencies, name=None, time_index=None):
def make_entityset(df, dependencies, name=None, time_index=None, variable_types=None):
"""
Creates a normalized EntitySet from df based on the dependencies given.
Keys for the newly created DataFrames can only be columns that are strings,
Expand All @@ -82,6 +82,10 @@ def make_entityset(df, dependencies, name=None, time_index=None):
df (pd.DataFrame) : dataframe to normalize and make entity set from
dependencies (Dependenies) : the dependencies discovered in df
name (str, optional) : the name of created EntitySet
time_index (str, optional) : name of time column in the dataframe
variable_types (dict[str -> Variable], optional):
Keys are of variable ids and values are variable types. Used to
initialize an entity's store.

Returns:
entityset (ft.EntitySet) : created entity set
Expand All @@ -97,10 +101,14 @@ def make_entityset(df, dependencies, name=None, time_index=None):

while stack != []:
current = stack.pop()
if variable_types is not None:
entity_variable_types = {col: variable_types[col] for col in current.df.columns if col in variable_types}
else:
entity_variable_types = None
if time_index in current.df.columns:
entities[current.index[0]] = (current.df, current.index[0], time_index)
entities[current.index[0]] = (current.df, current.index[0], time_index, entity_variable_types)
else:
entities[current.index[0]] = (current.df, current.index[0])
entities[current.index[0]] = (current.df, current.index[0], None, entity_variable_types)
for child in current.children:
# add to stack
# add relationship
Expand All @@ -110,7 +118,7 @@ def make_entityset(df, dependencies, name=None, time_index=None):
return ft.EntitySet(name, entities, relationships)


def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None, variable_types=None):
"""
Creates a normalized entityset from a dataframe.

Expand All @@ -126,13 +134,17 @@ def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):

name (str, optional) : the name of created EntitySet

time_index (str, optional) : name of time column in the dataframe.
time_index (str, optional) : name of time column in the dataframe

variable_types (dict[str -> Variable], optional):
Keys are of variable ids and values are variable types. Used to
initialize an entity's store

Returns:

entityset (ft.EntitySet) : created entity set
"""
return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)
return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index, variable_types)


def auto_normalize(df):
Expand Down Expand Up @@ -169,5 +181,6 @@ def normalize_entity(es, accuracy=0.98):
if len(es.entities) == 0:
raise ValueError('This EntitySet is empty')
entity = es.entities[0]
new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index)
new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index,
variable_types=entity.variable_types)
return new_es
238 changes: 218 additions & 20 deletions autonormalize/tests/test_normalize.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,47 @@
import pandas as pd
from pandas.testing import assert_frame_equal

from autonormalize import classes, normalize
import pytest
from pandas.util.testing import assert_frame_equal

import featuretools as ft
from featuretools.variable_types import (
Categorical,
Datetime,
DatetimeTimeIndex,
Id,
Index,
Numeric,
Text,
ZIPCode
)

from autonormalize import autonormalize, classes, normalize

# from classes import Dependencies

# from normalize import normalize, find_most_comm, split_on_dep

@pytest.fixture
def teams_input():
    """Provide a small sports-teams dataset together with its known
    functional dependencies, exposed through ``get_df()`` / ``get_deps()``."""
    team_data = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                          'Yellow', 'Green', 'Green', 'Blue'],
                 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
                 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
                 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
                          'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
                 'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']}
    dependency_map = {'team': [['player_name', 'jersey_num']],
                      'jersey_num': [['player_name', 'team']],
                      'player_name': [['team', 'jersey_num']],
                      'city': [['team'], ['state'], ['player_name', 'jersey_num']],
                      'state': [['team'], ['player_name', 'jersey_num'],
                                ['city']]}

    class Teams:
        # Fresh objects on every call so tests cannot interfere with each other.
        def get_df(self):
            return pd.DataFrame(team_data)

        def get_deps(self):
            return classes.Dependencies(dependency_map, ['team', 'jersey_num'])

    return Teams()


def test_normalize():
# how to test that relations remain the same???
Expand Down Expand Up @@ -100,23 +135,8 @@ def test_choose_index():
assert normalize.choose_index(keys, df) == ['A', 'B']


def test_normalize_dataframe():

dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
'Yellow', 'Green', 'Green', 'Blue'],
'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']}
df = pd.DataFrame(dic)
deps = classes.Dependencies({'team': [['player_name', 'jersey_num']],
'jersey_num': [['player_name', 'team']],
'player_name': [['team', 'jersey_num']],
'city': [['team'], ['state'], ['player_name', 'jersey_num']],
'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num'])

depdf = normalize.DepDF(deps, df, deps.get_prim_key())
def test_normalize_dataframe(teams_input):
depdf = normalize.DepDF(teams_input.get_deps(), teams_input.get_df(), teams_input.get_deps().get_prim_key())
normalize.normalize_dataframe(depdf)
new_dfs = depdf.return_dfs()

Expand Down Expand Up @@ -178,3 +198,181 @@ def test_make_indexes():
assert new_dfs[0][new_dfs[1].columns[0]][5] == val
assert new_dfs[0][new_dfs[1].columns[0]][6] == val
assert new_dfs[0][new_dfs[1].columns[0]][7] == val


def test_variable_types():
    """normalize_entity should carry user-supplied variable types (ZIPCode)
    through normalization and infer sensible types for every other column."""
    df = ft.demo.load_mock_customer(n_customers=20, n_products=12, n_sessions=50,
                                    n_transactions=100, return_single_table=True)
    es = ft.EntitySet()
    es.entity_from_dataframe(entity_id='Customer Transactions',
                             dataframe=df,
                             time_index='transaction_time',
                             variable_types={'zip_code': ZIPCode})

    normalized = autonormalize.normalize_entity(es)

    # Expected variable type for every column of every normalized entity.
    expected = {
        'transaction_id': {'transaction_id': Index,
                           'session_id': Id,
                           'transaction_time': DatetimeTimeIndex,
                           'product_id': Id,
                           'amount': Numeric},
        'product_id': {'product_id': Index,
                       'brand': Categorical},
        'session_id': {'session_id': Index,
                       'customer_id': Id,
                       'device': Categorical,
                       'session_start': Datetime},
        'customer_id': {'customer_id': Index,
                        'join_date': Datetime,
                        'date_of_birth': Datetime,
                        'zip_code': ZIPCode},
    }
    for entity_id, columns in expected.items():
        for column, variable_type in columns.items():
            assert normalized[entity_id].variable_types[column] == variable_type


def test_make_entityset_default_args(teams_input):
    """With only df and dependencies, make_entityset should produce three
    normalized entities with inferred variable types."""
    es = autonormalize.make_entityset(teams_input.get_df(), teams_input.get_deps())

    expected_frames = [
        pd.DataFrame({'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                      'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                               'Yellow', 'Green', 'Green', 'Blue'],
                      'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
                      'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}),
        pd.DataFrame({'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
                      'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']},
                     index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']),
        pd.DataFrame({'city': ['austin', 'boston', 'chicago', 'honolulu'],
                      'state': ['TX', 'MA', 'IL', 'HI']},
                     index=['austin', 'boston', 'chicago', 'honolulu']),
    ]
    expected_types = [
        {'jersey_num_team': Index, 'team': Id,
         'jersey_num': Numeric, 'player_name': Categorical},
        {'team': Index, 'city': Id},
        {'city': Index, 'state': Categorical},
    ]

    assert len(es.entities) == 3
    for entity, frame, types in zip(es.entities, expected_frames, expected_types):
        assert entity.df.equals(frame)
        for column, variable_type in types.items():
            assert entity.variable_types[column] == variable_type


def test_make_entityset_custom_args(teams_input):
    """make_entityset should honor a custom EntitySet name and apply a
    user-supplied variable type ('state' -> Text) to the owning entity."""
    es = autonormalize.make_entityset(df=teams_input.get_df(),
                                      dependencies=teams_input.get_deps(),
                                      name='Teams',
                                      variable_types={'state': Text})

    expected_frames = [
        pd.DataFrame({'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                      'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                               'Yellow', 'Green', 'Green', 'Blue'],
                      'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
                      'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}),
        pd.DataFrame({'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
                      'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']},
                     index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']),
        pd.DataFrame({'city': ['austin', 'boston', 'chicago', 'honolulu'],
                      'state': ['TX', 'MA', 'IL', 'HI']},
                     index=['austin', 'boston', 'chicago', 'honolulu']),
    ]
    expected_types = [
        {'jersey_num_team': Index, 'team': Id,
         'jersey_num': Numeric, 'player_name': Categorical},
        {'team': Index, 'city': Id},
        # 'state' keeps the override instead of the inferred Categorical.
        {'city': Index, 'state': Text},
    ]

    assert len(es.entities) == 3
    assert es.id == 'Teams'
    for entity, frame, types in zip(es.entities, expected_frames, expected_types):
        assert entity.df.equals(frame)
        for column, variable_type in types.items():
            assert entity.variable_types[column] == variable_type


def test_auto_entityset_default_args(teams_input):
    """auto_entityset with only a dataframe should discover the dependencies
    itself and produce the same three entities as make_entityset."""
    es = autonormalize.auto_entityset(teams_input.get_df())

    expected_frames = [
        pd.DataFrame({'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                      'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                               'Yellow', 'Green', 'Green', 'Blue'],
                      'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
                      'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}),
        pd.DataFrame({'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
                      'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']},
                     index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']),
        pd.DataFrame({'city': ['austin', 'boston', 'chicago', 'honolulu'],
                      'state': ['TX', 'MA', 'IL', 'HI']},
                     index=['austin', 'boston', 'chicago', 'honolulu']),
    ]
    expected_types = [
        {'jersey_num_team': Index, 'team': Id,
         'jersey_num': Numeric, 'player_name': Categorical},
        {'team': Index, 'city': Id},
        {'city': Index, 'state': Categorical},
    ]

    assert len(es.entities) == 3
    for entity, frame, types in zip(es.entities, expected_frames, expected_types):
        assert entity.df.equals(frame)
        for column, variable_type in types.items():
            assert entity.variable_types[column] == variable_type


def test_auto_entityset_custom_args(teams_input):
    """auto_entityset should honor a custom name and forward the
    variable_types override ('state' -> Text) to the normalized entities."""
    es = autonormalize.auto_entityset(df=teams_input.get_df(),
                                      name='Teams',
                                      variable_types={'state': Text})

    expected_frames = [
        pd.DataFrame({'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                      'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                               'Yellow', 'Green', 'Green', 'Blue'],
                      'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
                      'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}),
        pd.DataFrame({'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
                      'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']},
                     index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']),
        pd.DataFrame({'city': ['austin', 'boston', 'chicago', 'honolulu'],
                      'state': ['TX', 'MA', 'IL', 'HI']},
                     index=['austin', 'boston', 'chicago', 'honolulu']),
    ]
    expected_types = [
        {'jersey_num_team': Index, 'team': Id,
         'jersey_num': Numeric, 'player_name': Categorical},
        {'team': Index, 'city': Id},
        # 'state' keeps the override instead of the inferred Categorical.
        {'city': Index, 'state': Text},
    ]

    assert len(es.entities) == 3
    assert es.id == 'Teams'
    for entity, frame, types in zip(es.entities, expected_frames, expected_types):
        assert entity.df.equals(frame)
        for column, variable_type in types.items():
            assert entity.variable_types[column] == variable_type