diff --git a/README.md b/README.md
index 22f669a..d54d5a6 100755
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ pip uninstall autonormalize
 ### `auto_entityset`
 
 ```shell
-auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None)
+auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None, variable_types=None)
 ```
 
 Creates a normalized entityset from a dataframe.
@@ -49,8 +49,10 @@ Creates a normalized entityset from a dataframe.
 
 * `name` (str, optional) : the name of created EntitySet
 
-* `time_index` (str, optional) : name of time column in the dataframe.
+* `time_index` (str, optional) : name of time column in the dataframe
+
+* `variable_types` (dict[str -> Variable], optional) : Keys are variable ids and values are variable types. Used to initialize an entity's store.
 
 **Returns:**
 
 * `entityset` (ft.EntitySet) : created entity set
@@ -85,7 +87,7 @@ Normalizes dataframe based on the dependencies given. Keys for the newly created
 ### `make_entityset`
 
 ```shell
-make_entityset(df, dependencies, name=None, time_index=None)
+make_entityset(df, dependencies, name=None, time_index=None, variable_types=None)
 ```
 
 Creates a normalized EntitySet from dataframe based on the dependencies given. Keys are chosen in the same fashion as for `normalize_dataframe`and a new index will be created if any key has more than a single attribute.
diff --git a/autonormalize/autonormalize.py b/autonormalize/autonormalize.py
index 945f959..8ee290c 100644
--- a/autonormalize/autonormalize.py
+++ b/autonormalize/autonormalize.py
@@ -70,7 +70,7 @@ def normalize_dataframe(df, dependencies):
     return depdf.return_dfs()
 
 
-def make_entityset(df, dependencies, name=None, time_index=None):
+def make_entityset(df, dependencies, name=None, time_index=None, variable_types=None):
     """
     Creates a normalized EntitySet from df based on the dependencies given.
     Keys for the newly created DataFrames can only be columns that are strings,
@@ -82,6 +82,10 @@ def make_entityset(df, dependencies, name=None, time_index=None):
         df (pd.DataFrame) : dataframe to normalize and make entity set from
         dependencies (Dependenies) : the dependencies discovered in df
         name (str, optional) : the name of created EntitySet
+        time_index (str, optional) : name of time column in the dataframe
+        variable_types (dict[str -> Variable], optional):
+            Keys are variable ids and values are variable types. Used to
+            initialize an entity's store.
 
     Returns:
         entityset (ft.EntitySet) : created entity set
@@ -97,10 +101,14 @@ def make_entityset(df, dependencies, name=None, time_index=None):
 
     while stack != []:
         current = stack.pop()
+        if variable_types is not None:
+            entity_variable_types = {col: variable_types[col] for col in current.df.columns if col in variable_types}
+        else:
+            entity_variable_types = None
         if time_index in current.df.columns:
-            entities[current.index[0]] = (current.df, current.index[0], time_index)
+            entities[current.index[0]] = (current.df, current.index[0], time_index, entity_variable_types)
         else:
-            entities[current.index[0]] = (current.df, current.index[0])
+            entities[current.index[0]] = (current.df, current.index[0], None, entity_variable_types)
         for child in current.children:
             # add to stack
             # add relationship
@@ -110,7 +118,7 @@ def make_entityset(df, dependencies, name=None, time_index=None):
     return ft.EntitySet(name, entities, relationships)
 
 
-def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
+def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None, variable_types=None):
     """
     Creates a normalized entityset from a dataframe.
 
@@ -126,13 +134,17 @@ def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
 
     name (str, optional) : the name of created EntitySet
 
-    time_index (str, optional) : name of time column in the dataframe.
+    time_index (str, optional) : name of time column in the dataframe
+
+    variable_types (dict[str -> Variable], optional):
+        Keys are variable ids and values are variable types. Used to
+        initialize an entity's store.
 
     Returns:
 
         entityset (ft.EntitySet) : created entity set
     """
-    return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)
+    return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index, variable_types)
 
 
 def auto_normalize(df):
@@ -169,5 +181,6 @@ def normalize_entity(es, accuracy=0.98):
     if len(es.entities) == 0:
         raise ValueError('This EntitySet is empty')
     entity = es.entities[0]
-    new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index)
+    new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index,
+                            variable_types=entity.variable_types)
     return new_es
diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py
index 0024680..efbeb1e 100644
--- a/autonormalize/tests/test_normalize.py
+++ b/autonormalize/tests/test_normalize.py
@@ -1,12 +1,47 @@
 import pandas as pd
-from pandas.testing import assert_frame_equal
-
-from autonormalize import classes, normalize
+import pytest
+from pandas.util.testing import assert_frame_equal
+
+import featuretools as ft
+from featuretools.variable_types import (
+    Categorical,
+    Datetime,
+    DatetimeTimeIndex,
+    Id,
+    Index,
+    Numeric,
+    Text,
+    ZIPCode
+)
+
+from autonormalize import autonormalize, classes, normalize
 
 # from classes import Dependencies
 # from normalize import normalize, find_most_comm, split_on_dep
 
 
+@pytest.fixture
+def teams_input():
+    class Teams:
+        def get_df(self):
+            dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                            'Yellow', 'Green', 'Green', 'Blue'],
+                   'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+                   'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
+                   'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
+                            'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
+                   'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']}
+            return pd.DataFrame(dic)
+
+        def get_deps(self):
+            return classes.Dependencies({'team': [['player_name', 'jersey_num']],
+                                         'jersey_num': [['player_name', 'team']],
+                                         'player_name': [['team', 'jersey_num']],
+                                         'city': [['team'], ['state'], ['player_name', 'jersey_num']],
+                                         'state': [['team'], ['player_name', 'jersey_num'],
+                                                   ['city']]}, ['team', 'jersey_num'])
+    return Teams()
+
+
 def test_normalize():
     # how to test that relations remain the same???
@@ -100,23 +135,8 @@ def test_choose_index():
     assert normalize.choose_index(keys, df) == ['A', 'B']
 
 
-def test_normalize_dataframe():
-
-    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
-                    'Yellow', 'Green', 'Green', 'Blue'],
-           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
-           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
-           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
-                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
-           'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']}
-    df = pd.DataFrame(dic)
-    deps = classes.Dependencies({'team': [['player_name', 'jersey_num']],
-                                 'jersey_num': [['player_name', 'team']],
-                                 'player_name': [['team', 'jersey_num']],
-                                 'city': [['team'], ['state'], ['player_name', 'jersey_num']],
-                                 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num'])
-
-    depdf = normalize.DepDF(deps, df, deps.get_prim_key())
+def test_normalize_dataframe(teams_input):
+    depdf = normalize.DepDF(teams_input.get_deps(), teams_input.get_df(), teams_input.get_deps().get_prim_key())
     normalize.normalize_dataframe(depdf)
     new_dfs = depdf.return_dfs()
@@ -178,3 +198,181 @@ def test_make_indexes():
     assert new_dfs[0][new_dfs[1].columns[0]][5] == val
     assert new_dfs[0][new_dfs[1].columns[0]][6] == val
     assert new_dfs[0][new_dfs[1].columns[0]][7] == val
+
+
+def test_variable_types():
+    df = ft.demo.load_mock_customer(n_customers=20, n_products=12, n_sessions=50,
+                                    n_transactions=100, return_single_table=True)
+    entityset = ft.EntitySet()
+    entityset.entity_from_dataframe(entity_id='Customer Transactions',
+                                    dataframe=df,
+                                    time_index='transaction_time',
+                                    variable_types={'zip_code': ZIPCode})
+
+    normalized_entityset = autonormalize.normalize_entity(entityset)
+
+    assert normalized_entityset['transaction_id'].variable_types['transaction_id'] == Index
+    assert normalized_entityset['transaction_id'].variable_types['session_id'] == Id
+    assert normalized_entityset['transaction_id'].variable_types['transaction_time'] == DatetimeTimeIndex
+    assert normalized_entityset['transaction_id'].variable_types['product_id'] == Id
+    assert normalized_entityset['transaction_id'].variable_types['amount'] == Numeric
+
+    assert normalized_entityset['product_id'].variable_types['product_id'] == Index
+    assert normalized_entityset['product_id'].variable_types['brand'] == Categorical
+
+    assert normalized_entityset['session_id'].variable_types['session_id'] == Index
+    assert normalized_entityset['session_id'].variable_types['customer_id'] == Id
+    assert normalized_entityset['session_id'].variable_types['device'] == Categorical
+    assert normalized_entityset['session_id'].variable_types['session_start'] == Datetime
+
+    assert normalized_entityset['customer_id'].variable_types['customer_id'] == Index
+    assert normalized_entityset['customer_id'].variable_types['join_date'] == Datetime
+    assert normalized_entityset['customer_id'].variable_types['date_of_birth'] == Datetime
+    assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode
+
+
+def test_make_entityset_default_args(teams_input):
+    normalized_entityset = autonormalize.make_entityset(teams_input.get_df(), teams_input.get_deps())
+
+    dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                        'Yellow', 'Green', 'Green', 'Blue'],
+               'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+               'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}
+
+    dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
+               'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
+
+    dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
+                 'state': ['TX', 'MA', 'IL', 'HI']}
+
+    assert len(normalized_entityset.entities) == 3
+
+    assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
+    assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
+        dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']))
+    assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
+        dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
+
+    assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index
+    assert normalized_entityset.entities[0].variable_types['team'] == Id
+    assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
+    assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical
+
+    assert normalized_entityset.entities[1].variable_types['team'] == Index
+    assert normalized_entityset.entities[1].variable_types['city'] == Id
+
+    assert normalized_entityset.entities[2].variable_types['city'] == Index
+    assert normalized_entityset.entities[2].variable_types['state'] == Categorical
+
+
+def test_make_entityset_custom_args(teams_input):
+    normalized_entityset = autonormalize.make_entityset(df=teams_input.get_df(),
+                                                        dependencies=teams_input.get_deps(),
+                                                        name='Teams',
+                                                        variable_types={'state': Text})
+
+    dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                        'Yellow', 'Green', 'Green', 'Blue'],
+               'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+               'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}
+
+    dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
+               'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
+
+    dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
+                 'state': ['TX', 'MA', 'IL', 'HI']}
+
+    assert len(normalized_entityset.entities) == 3
+    assert normalized_entityset.id == 'Teams'
+
+    assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
+    assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
+        dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']))
+    assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
+        dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
+
+    assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index
+    assert normalized_entityset.entities[0].variable_types['team'] == Id
+    assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
+    assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical
+
+    assert normalized_entityset.entities[1].variable_types['team'] == Index
+    assert normalized_entityset.entities[1].variable_types['city'] == Id
+
+    assert normalized_entityset.entities[2].variable_types['city'] == Index
+    assert normalized_entityset.entities[2].variable_types['state'] == Text
+
+
+def test_auto_entityset_default_args(teams_input):
+    normalized_entityset = autonormalize.auto_entityset(teams_input.get_df())
+
+    dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                        'Yellow', 'Green', 'Green', 'Blue'],
+               'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+               'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}
+
+    dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
+               'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
+
+    dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
+                 'state': ['TX', 'MA', 'IL', 'HI']}
+
+    assert len(normalized_entityset.entities) == 3
+
+    assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
+    assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
+        dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']))
+    assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
+        dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
+
+    assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index
+    assert normalized_entityset.entities[0].variable_types['team'] == Id
+    assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
+    assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical
+
+    assert normalized_entityset.entities[1].variable_types['team'] == Index
+    assert normalized_entityset.entities[1].variable_types['city'] == Id
+
+    assert normalized_entityset.entities[2].variable_types['city'] == Index
+    assert normalized_entityset.entities[2].variable_types['state'] == Categorical
+
+
+def test_auto_entityset_custom_args(teams_input):
+    normalized_entityset = autonormalize.auto_entityset(df=teams_input.get_df(),
+                                                        name='Teams',
+                                                        variable_types={'state': Text})
+
+    dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                        'Yellow', 'Green', 'Green', 'Blue'],
+               'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+               'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}
+
+    dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
+               'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
+
+    dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
+                 'state': ['TX', 'MA', 'IL', 'HI']}
+
+    assert len(normalized_entityset.entities) == 3
+    assert normalized_entityset.id == 'Teams'
+
+    assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
+    assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
+        dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']))
+    assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
+        dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
+
+    assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index
+    assert normalized_entityset.entities[0].variable_types['team'] == Id
+    assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
+    assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical
+
+    assert normalized_entityset.entities[1].variable_types['team'] == Index
+    assert normalized_entityset.entities[1].variable_types['city'] == Id
+
+    assert normalized_entityset.entities[2].variable_types['city'] == Index
+    assert normalized_entityset.entities[2].variable_types['state'] == Text
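A minimal usage sketch of the `variable_types` argument this patch threads through `auto_entityset` and `make_entityset` (not part of the diff itself): it mirrors the call pattern exercised in `test_variable_types`, forcing `zip_code` to the featuretools `ZIPCode` type so the normalized entities keep it. The EntitySet name `'customer_transactions'` and the printed expectation are illustrative assumptions, not asserted behavior from the patch.

```python
# Usage sketch, assuming featuretools and autonormalize (with this patch) are installed.
import featuretools as ft
from featuretools.variable_types import ZIPCode

from autonormalize import autonormalize

# Single denormalized table from the featuretools demo data,
# the same input used by test_variable_types in the patch above.
df = ft.demo.load_mock_customer(n_customers=20, n_products=12, n_sessions=50,
                                n_transactions=100, return_single_table=True)

# variable_types is passed straight through auto_entityset -> make_entityset;
# each normalized entity only receives the types for columns it actually contains.
es = autonormalize.auto_entityset(df,
                                  accuracy=0.98,
                                  name='customer_transactions',  # illustrative name
                                  time_index='transaction_time',
                                  variable_types={'zip_code': ZIPCode})

print(es)
# Expected, per the new tests: the customer entity keeps ZIPCode for zip_code.
print(es['customer_id'].variable_types['zip_code'])
```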