Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add hist_exog_list argument to cross_validation #534

Merged
merged 3 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 96 additions & 9 deletions nbs/src/nixtla_client.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@
"from dotenv import load_dotenv\n",
"from fastcore.test import test_eq, test_fail\n",
"from utilsforecast.data import generate_series\n",
"from utilsforecast.feature_engineering import fourier\n",
"\n",
"from nixtla.date_features import SpecialDates"
]
Expand Down Expand Up @@ -420,7 +421,8 @@
" target_col: str,\n",
" hist_exog: Optional[list[str]],\n",
") -> tuple[DFType, Optional[DFType]]:\n",
" exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n",
" base_cols = {id_col, time_col, target_col}\n",
" exogs = [c for c in df.columns if c not in base_cols]\n",
" if hist_exog is None:\n",
" hist_exog = []\n",
" if X_df is None:\n",
Expand All @@ -437,7 +439,7 @@
" return df, None\n",
"\n",
" # exogs in df that weren't declared as historic nor future\n",
" futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)]\n",
" futr_exog = [c for c in X_df.columns if c not in base_cols]\n",
" declared_exogs = {*hist_exog, *futr_exog}\n",
" ignored_exogs = [c for c in exogs if c not in declared_exogs]\n",
" if ignored_exogs:\n",
Expand All @@ -455,6 +457,15 @@
" f\"but not in `df`: {missing_futr}.\"\n",
" )\n",
"\n",
" # features are provided through X_df but declared as historic\n",
" futr_and_hist = set(futr_exog) & set(hist_exog)\n",
" if futr_and_hist:\n",
" warnings.warn(\n",
" \"The following features were declared as historic but found in `X_df`: \"\n",
" f\"{futr_and_hist}, they will be considered as historic.\"\n",
" )\n",
" futr_exog = [f for f in futr_exog if f not in hist_exog]\n",
"\n",
" # Make sure df and X_df are in right order\n",
" df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]]\n",
" X_df = X_df[[id_col, time_col, *futr_exog]]\n",
Expand Down Expand Up @@ -537,7 +548,7 @@
" processed = ufp.process_df(\n",
" df=df, id_col=id_col, time_col=time_col, target_col=target_col\n",
" )\n",
" if X_df is not None:\n",
" if X_df is not None and X_df.shape[1] > 2:\n",
" X_df = ensure_time_dtype(X_df, time_col=time_col)\n",
" processed_X = ufp.process_df(\n",
" df=X_df, id_col=id_col, time_col=time_col, target_col=None,\n",
Expand Down Expand Up @@ -718,7 +729,8 @@
" if isinstance(v, np.ndarray):\n",
" if np.issubdtype(v.dtype, np.floating):\n",
" v_cont = np.ascontiguousarray(v, dtype=np.float32)\n",
" d[k] = np.nan_to_num(v_cont, \n",
" d[k] = np.nan_to_num(\n",
" v_cont, \n",
" nan=np.nan, \n",
" posinf=np.finfo(np.float32).max, \n",
" neginf=np.finfo(np.float32).min,\n",
Expand Down Expand Up @@ -967,6 +979,7 @@
" finetune_depth: _Finetune_Depth,\n",
" finetune_loss: _Loss,\n",
" clean_ex_first: bool,\n",
" hist_exog_list: Optional[list[str]],\n",
" validate_api_key: bool,\n",
" add_history: bool,\n",
" date_features: Union[bool, list[Union[str, Callable]]],\n",
Expand Down Expand Up @@ -1023,6 +1036,7 @@
" finetune_depth=finetune_depth,\n",
" finetune_loss=finetune_loss,\n",
" clean_ex_first=clean_ex_first,\n",
" hist_exog_list=hist_exog_list,\n",
" validate_api_key=validate_api_key,\n",
" add_history=add_history,\n",
" date_features=date_features,\n",
Expand Down Expand Up @@ -1158,6 +1172,7 @@
" finetune_depth=finetune_depth,\n",
" finetune_loss=finetune_loss,\n",
" clean_ex_first=clean_ex_first,\n",
" hist_exog_list=hist_exog_list,\n",
" validate_api_key=validate_api_key,\n",
" add_history=add_history,\n",
" date_features=date_features,\n",
Expand Down Expand Up @@ -1226,7 +1241,7 @@
" X = processed.data[:, 1:].T\n",
" if futr_cols is not None:\n",
" logger.info(f'Using future exogenous features: {futr_cols}')\n",
" if hist_exog_list is not None:\n",
" if hist_exog_list:\n",
" logger.info(f'Using historical exogenous features: {hist_exog_list}')\n",
" else:\n",
" X = None\n",
Expand Down Expand Up @@ -1541,6 +1556,7 @@
" finetune_depth: _Finetune_Depth,\n",
" finetune_loss: _Loss,\n",
" clean_ex_first: bool,\n",
" hist_exog_list: Optional[list[str]],\n",
" date_features: Union[bool, Sequence[Union[str, Callable]]],\n",
" date_features_to_one_hot: Union[bool, list[str]],\n",
" model: _Model,\n",
Expand All @@ -1550,7 +1566,7 @@
" \n",
" schema, partition_config = _distributed_setup(\n",
" df=df,\n",
" method='forecast',\n",
" method='cross_validation',\n",
" id_col=id_col,\n",
" time_col=time_col,\n",
" target_col=target_col,\n",
Expand Down Expand Up @@ -1578,6 +1594,7 @@
" finetune_depth=finetune_depth,\n",
" finetune_loss=finetune_loss,\n",
" clean_ex_first=clean_ex_first,\n",
" hist_exog_list=hist_exog_list,\n",
" date_features=date_features,\n",
" date_features_to_one_hot=date_features_to_one_hot,\n",
" model=model,\n",
Expand Down Expand Up @@ -1605,6 +1622,7 @@
" finetune_depth: _Finetune_Depth = 1,\n",
" finetune_loss: _Loss = 'default',\n",
" clean_ex_first: bool = True,\n",
" hist_exog_list: Optional[list[str]] = None,\n",
" date_features: Union[bool, list[str]] = False,\n",
" date_features_to_one_hot: Union[bool, list[str]] = False,\n",
" model: _Model = 'timegpt-1',\n",
Expand Down Expand Up @@ -1661,8 +1679,9 @@
" finetune_loss : str (default='default')\n",
" Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`.\n",
" clean_ex_first : bool (default=True)\n",
" Clean exogenous signal before making forecasts\n",
" using TimeGPT.\n",
" Clean exogenous signal before making forecasts using TimeGPT.\n",
" hist_exog_list : list of str, optional (default=None)\n",
" Column names of the historical exogenous features.\n",
" date_features : bool or list of str or callable, optional (default=False)\n",
" Features computed from the dates.\n",
" Can be pandas date attributes or functions that will take the dates as input.\n",
Expand Down Expand Up @@ -1704,6 +1723,7 @@
" finetune_depth=finetune_depth,\n",
" finetune_loss=finetune_loss,\n",
" clean_ex_first=clean_ex_first,\n",
" hist_exog_list=hist_exog_list,\n",
" date_features=date_features,\n",
" date_features_to_one_hot=date_features_to_one_hot,\n",
" model=model,\n",
Expand Down Expand Up @@ -1767,9 +1787,29 @@
" targets = _array_tails(targets, orig_indptr, np.diff(processed.indptr))\n",
" if processed.data.shape[1] > 1:\n",
" X = processed.data[:, 1:].T\n",
" logger.info(f'Using the following exogenous features: {x_cols}')\n",
" if hist_exog_list is None:\n",
" hist_exog = None\n",
" futr_exog = x_cols\n",
" else:\n",
" missing_hist = set(hist_exog_list) - set(x_cols)\n",
" if missing_hist:\n",
" raise ValueError(\n",
" \"The following exogenous features were declared as historic \"\n",
" f\"but were not found in `df`: {missing_hist}.\"\n",
" )\n",
" futr_exog = [c for c in x_cols if c not in hist_exog_list]\n",
" # match the forecast method order [future, historic]\n",
" fcst_features_order = futr_exog + hist_exog_list\n",
" x_idxs = [x_cols.index(c) for c in fcst_features_order]\n",
" X = X[x_idxs]\n",
" hist_exog = [fcst_features_order.index(c) for c in hist_exog_list]\n",
" if futr_exog:\n",
" logger.info(f'Using future exogenous features: {futr_exog}')\n",
" if hist_exog_list:\n",
" logger.info(f'Using historical exogenous features: {hist_exog_list}')\n",
" else:\n",
" X = None\n",
" hist_exog = None\n",
"\n",
" logger.info('Calling Cross Validation Endpoint...')\n",
" payload = {\n",
Expand All @@ -1784,6 +1824,7 @@
" 'step_size': step_size,\n",
" 'freq': standard_freq,\n",
" 'clean_ex_first': clean_ex_first,\n",
" 'hist_exog': hist_exog,\n",
" 'level': level,\n",
" 'finetune_steps': finetune_steps,\n",
" 'finetune_depth': finetune_depth,\n",
Expand Down Expand Up @@ -1953,6 +1994,7 @@
" finetune_depth: _Finetune_Depth,\n",
" finetune_loss: _Loss,\n",
" clean_ex_first: bool,\n",
" hist_exog_list: Optional[list[str]],\n",
" validate_api_key: bool,\n",
" add_history: bool,\n",
" date_features: Union[bool, list[Union[str, Callable]]],\n",
Expand Down Expand Up @@ -1981,6 +2023,7 @@
" finetune_depth=finetune_depth,\n",
" finetune_loss=finetune_loss,\n",
" clean_ex_first=clean_ex_first,\n",
" hist_exog_list=hist_exog_list,\n",
" validate_api_key=validate_api_key,\n",
" add_history=add_history,\n",
" date_features=date_features,\n",
Expand Down Expand Up @@ -2037,6 +2080,7 @@
" finetune_depth: _Finetune_Depth,\n",
" finetune_loss: _Loss,\n",
" clean_ex_first: bool,\n",
" hist_exog_list: Optional[list[str]],\n",
" date_features: Union[bool, list[str]],\n",
" date_features_to_one_hot: Union[bool, list[str]],\n",
" model: _Model,\n",
Expand All @@ -2058,6 +2102,7 @@
" finetune_depth=finetune_depth,\n",
" finetune_loss=finetune_loss,\n",
" clean_ex_first=clean_ex_first,\n",
" hist_exog_list=hist_exog_list,\n",
" date_features=date_features,\n",
" date_features_to_one_hot=date_features_to_one_hot,\n",
" model=model,\n",
Expand Down Expand Up @@ -2205,6 +2250,48 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# historic exog in cv\n",
"freq = 'D'\n",
"h = 5\n",
"series = generate_series(2, freq=freq)\n",
"series_with_features, _ = fourier(series, freq=freq, season_length=7, k=2)\n",
"splits = ufp.backtest_splits(\n",
" df=series_with_features,\n",
" n_windows=1,\n",
" h=h,\n",
" id_col='unique_id',\n",
" time_col='ds',\n",
" freq=freq,\n",
")\n",
"_, train, valid = next(splits)\n",
"x_cols = train.columns.drop(['unique_id', 'ds', 'y']).tolist()\n",
"for hist_exog_list in [None, [], [x_cols[2], x_cols[1]], x_cols]:\n",
" cv_res = nixtla_client.cross_validation(\n",
" series_with_features,\n",
" n_windows=1,\n",
" h=h,\n",
" freq=freq,\n",
" hist_exog_list=hist_exog_list,\n",
" )\n",
" fcst_res = nixtla_client.forecast(\n",
" train,\n",
" h=h,\n",
" freq=freq,\n",
" hist_exog_list=hist_exog_list,\n",
" X_df=valid,\n",
" )\n",
" np.testing.assert_allclose(\n",
" cv_res['TimeGPT'], fcst_res['TimeGPT'], atol=1e-4, rtol=1e-3\n",
" )"
]
Comment on lines +2275 to +2293
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cool

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add a test to verify that different historical exogenous variables result in different forecasts? What do you think?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added in 91b27ec

},
{
"cell_type": "code",
"execution_count": null,
Expand Down
Loading