Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] Core Numeric Type and Null Protections #181

Merged
merged 7 commits into from
Apr 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion hierarchicalforecast/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,18 @@ def _prepare_fit(self,
# Declare output names
drop_cols = ['ds', 'y'] if 'y' in Y_hat_df.columns else ['ds']
model_names = Y_hat_df.drop(columns=drop_cols, axis=1).columns.to_list()

# Ensure numeric columns
if not len(Y_hat_df[model_names].select_dtypes(include='number').columns) == len(Y_hat_df[model_names].columns):
raise Exception('`Y_hat_df`s columns contain non numeric types')

#Ensure no null values
if Y_hat_df[model_names].isnull().values.any():
raise Exception('`Y_hat_df` contains null values')

pi_model_names = [name for name in model_names if ('-lo' in name or '-hi' in name)]
model_names = [name for name in model_names if name not in pi_model_names]

# TODO: Complete y_hat_insample protection
if intervals_method in ['bootstrap', 'permbu']:
if not (set(model_names) <= set(Y_df.columns)):
Expand Down
7 changes: 6 additions & 1 deletion hierarchicalforecast/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,11 @@ def aggregate(df: pd.DataFrame,
`Y_df, S_df, tags`: tuple with hierarchically structured series `Y_df` ($\mathbf{y}_{[a,b]}$),
summing dataframe `S_df`, and hierarchical aggregation indexes `tags`.
"""

#Ensure no null values
if df.isnull().values.any():
raise Exception('`df` contains null values')

#-------------------------------- Wrangling --------------------------------#
# constraints S_df and collapsed Y_bottom_df with 'unique_id'
Y_bottom_df, S_df, tags = _to_summing_dataframe(df=df, spec=spec)
Expand Down Expand Up @@ -242,7 +247,7 @@ def aggregate(df: pd.DataFrame,
Y_df = Y_df.set_index('unique_id')
return Y_df, S_df, tags

# %% ../nbs/utils.ipynb 15
# %% ../nbs/utils.ipynb 16
class HierarchicalPlot:
""" Hierarchical Plot

Expand Down
51 changes: 43 additions & 8 deletions nbs/core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,18 @@
" # Declare output names\n",
" drop_cols = ['ds', 'y'] if 'y' in Y_hat_df.columns else ['ds']\n",
" model_names = Y_hat_df.drop(columns=drop_cols, axis=1).columns.to_list()\n",
"\n",
" # Ensure numeric columns\n",
" if not len(Y_hat_df[model_names].select_dtypes(include='number').columns) == len(Y_hat_df[model_names].columns):\n",
" raise Exception('`Y_hat_df`s columns contain non numeric types')\n",
" \n",
" #Ensure no null values\n",
" if Y_hat_df[model_names].isnull().values.any():\n",
" raise Exception('`Y_hat_df` contains null values')\n",
" \n",
" pi_model_names = [name for name in model_names if ('-lo' in name or '-hi' in name)]\n",
" model_names = [name for name in model_names if name not in pi_model_names]\n",
"\n",
" \n",
" # TODO: Complete y_hat_insample protection\n",
" if intervals_method in ['bootstrap', 'permbu']:\n",
" if not (set(model_names) <= set(Y_df.columns)):\n",
Expand Down Expand Up @@ -585,6 +594,39 @@
" test_close(reconciled['y'], reconciled[model], eps=eps)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# test incorrect Y_hat_df datatypes\n",
"hier_grouped_hat_df_nan = hier_grouped_hat_df.copy()\n",
"hier_grouped_hat_df_nan.loc['Australia', 'y_model'] = float('nan')\n",
"test_fail(\n",
" hrec.reconcile,\n",
" contains='null values',\n",
" args=(hier_grouped_hat_df_nan, S_grouped_df, tags_grouped, hier_grouped_df),\n",
")\n",
"\n",
"hier_grouped_hat_df_none = hier_grouped_hat_df.copy()\n",
"hier_grouped_hat_df_none.loc['Australia', 'y_model'] = None\n",
"test_fail(\n",
" hrec.reconcile,\n",
" contains='null values',\n",
" args=(hier_grouped_hat_df_none, S_grouped_df, tags_grouped, hier_grouped_df),\n",
")\n",
"\n",
"hier_grouped_hat_df_str = hier_grouped_hat_df.copy()\n",
"hier_grouped_hat_df_str['y_model'] = hier_grouped_hat_df_str['y_model'].astype(str)\n",
"test_fail(\n",
" hrec.reconcile,\n",
" contains='numeric types',\n",
" args=(hier_grouped_hat_df_str, S_grouped_df, tags_grouped, hier_grouped_df),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -1006,13 +1048,6 @@
" S=S_df, tags=tags)\n",
"Y_rec_df.groupby('unique_id').head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
38 changes: 37 additions & 1 deletion nbs/utils.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
"source": [
"#| hide\n",
"from nbdev.showdoc import add_docs, show_doc\n",
"from fastcore.test import test_eq, test_close"
"from fastcore.test import test_eq, test_close, test_fail"
]
},
{
Expand Down Expand Up @@ -297,6 +297,11 @@
" `Y_df, S_df, tags`: tuple with hierarchically structured series `Y_df` ($\\mathbf{y}_{[a,b]}$),\n",
" summing dataframe `S_df`, and hierarchical aggregation indexes `tags`.\n",
" \"\"\"\n",
" \n",
" #Ensure no null values\n",
" if df.isnull().values.any():\n",
" raise Exception('`df` contains null values')\n",
" \n",
" #-------------------------------- Wrangling --------------------------------#\n",
" # constraints S_df and collapsed Y_bottom_df with 'unique_id'\n",
" Y_bottom_df, S_df, tags = _to_summing_dataframe(df=df, spec=spec)\n",
Expand Down Expand Up @@ -390,6 +395,37 @@
"test_eq(len(tags), len(hiers_grouped))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e1cb923",
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/tourism.csv')\n",
"df = df.rename({'Trips': 'y', 'Quarter': 'ds'}, axis=1)\n",
"df.insert(0, 'Country', 'Australia')\n",
"\n",
"#Unit Test NaN Values\n",
"df_nan = df.copy()\n",
"df_nan.loc[0, 'Region'] = float('nan')\n",
"test_fail(\n",
" aggregate,\n",
" contains='null values',\n",
" args=(df_nan, hiers_strictly),\n",
")\n",
"\n",
"#Unit Test None Values\n",
"df_none = df.copy()\n",
"df_none.loc[0, 'Region'] = None\n",
"test_fail(\n",
" aggregate,\n",
" contains='null values',\n",
" args=(df_none, hiers_strictly),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down