Skip to content

Commit

Permalink
feat: add hist_exog_list argument to cross_validation
Browse files Browse the repository at this point in the history
  • Loading branch information
jmoralez committed Nov 7, 2024
1 parent b420490 commit b40c2d0
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 17 deletions.
98 changes: 89 additions & 9 deletions nbs/src/nixtla_client.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@
"from dotenv import load_dotenv\n",
"from fastcore.test import test_eq, test_fail\n",
"from utilsforecast.data import generate_series\n",
"from utilsforecast.feature_engineering import fourier\n",
"\n",
"from nixtla.date_features import SpecialDates"
]
Expand Down Expand Up @@ -420,7 +421,8 @@
" target_col: str,\n",
" hist_exog: Optional[list[str]],\n",
") -> tuple[DFType, Optional[DFType]]:\n",
" exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n",
" base_cols = {id_col, time_col, target_col}\n",
" exogs = [c for c in df.columns if c not in base_cols]\n",
" if hist_exog is None:\n",
" hist_exog = []\n",
" if X_df is None:\n",
Expand All @@ -437,7 +439,7 @@
" return df, None\n",
"\n",
" # exogs in df that weren't declared as historic nor future\n",
" futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)]\n",
" futr_exog = [c for c in X_df.columns if c not in base_cols]\n",
" declared_exogs = {*hist_exog, *futr_exog}\n",
" ignored_exogs = [c for c in exogs if c not in declared_exogs]\n",
" if ignored_exogs:\n",
Expand All @@ -455,6 +457,15 @@
" f\"but not in `df`: {missing_futr}.\"\n",
" )\n",
"\n",
" # features are provided through X_df but declared as historic\n",
" futr_and_hist = set(futr_exog) & set(hist_exog)\n",
" if futr_and_hist:\n",
" warnings.warn(\n",
" \"The following features were declared as historic but found in `X_df`: \"\n",
" f\"{futr_and_hist}, they will be considered as historic.\"\n",
" )\n",
" futr_exog = [f for f in futr_exog if f not in hist_exog]\n",
"\n",
" # Make sure df and X_df are in right order\n",
" df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]]\n",
" X_df = X_df[[id_col, time_col, *futr_exog]]\n",
Expand Down Expand Up @@ -537,7 +548,7 @@
" processed = ufp.process_df(\n",
" df=df, id_col=id_col, time_col=time_col, target_col=target_col\n",
" )\n",
" if X_df is not None:\n",
" if X_df is not None and X_df.shape[1] > 2:\n",
" X_df = ensure_time_dtype(X_df, time_col=time_col)\n",
" processed_X = ufp.process_df(\n",
" df=X_df, id_col=id_col, time_col=time_col, target_col=None,\n",
Expand Down Expand Up @@ -718,7 +729,8 @@
" if isinstance(v, np.ndarray):\n",
" if np.issubdtype(v.dtype, np.floating):\n",
" v_cont = np.ascontiguousarray(v, dtype=np.float32)\n",
" d[k] = np.nan_to_num(v_cont, \n",
" d[k] = np.nan_to_num(\n",
" v_cont, \n",
" nan=np.nan, \n",
" posinf=np.finfo(np.float32).max, \n",
" neginf=np.finfo(np.float32).min,\n",
Expand Down Expand Up @@ -1226,7 +1238,7 @@
" X = processed.data[:, 1:].T\n",
" if futr_cols is not None:\n",
" logger.info(f'Using future exogenous features: {futr_cols}')\n",
" if hist_exog_list is not None:\n",
" if hist_exog_list:\n",
" logger.info(f'Using historical exogenous features: {hist_exog_list}')\n",
" else:\n",
" X = None\n",
Expand Down Expand Up @@ -1541,6 +1553,7 @@
" finetune_depth: _Finetune_Depth,\n",
" finetune_loss: _Loss,\n",
" clean_ex_first: bool,\n",
" hist_exog_list: Optional[list[str]],\n",
" date_features: Union[bool, Sequence[Union[str, Callable]]],\n",
" date_features_to_one_hot: Union[bool, list[str]],\n",
" model: _Model,\n",
Expand All @@ -1550,7 +1563,7 @@
" \n",
" schema, partition_config = _distributed_setup(\n",
" df=df,\n",
" method='forecast',\n",
" method='cross_validation',\n",
" id_col=id_col,\n",
" time_col=time_col,\n",
" target_col=target_col,\n",
Expand Down Expand Up @@ -1578,6 +1591,7 @@
" finetune_depth=finetune_depth,\n",
" finetune_loss=finetune_loss,\n",
" clean_ex_first=clean_ex_first,\n",
" hist_exog_list=hist_exog_list,\n",
" date_features=date_features,\n",
" date_features_to_one_hot=date_features_to_one_hot,\n",
" model=model,\n",
Expand Down Expand Up @@ -1605,6 +1619,7 @@
" finetune_depth: _Finetune_Depth = 1,\n",
" finetune_loss: _Loss = 'default',\n",
" clean_ex_first: bool = True,\n",
" hist_exog_list: Optional[list[str]] = None,\n",
" date_features: Union[bool, list[str]] = False,\n",
" date_features_to_one_hot: Union[bool, list[str]] = False,\n",
" model: _Model = 'timegpt-1',\n",
Expand Down Expand Up @@ -1661,8 +1676,9 @@
" finetune_loss : str (default='default')\n",
" Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`.\n",
" clean_ex_first : bool (default=True)\n",
" Clean exogenous signal before making forecasts\n",
" using TimeGPT.\n",
" Clean exogenous signal before making forecasts using TimeGPT.\n",
" hist_exog_list : list of str, optional (default=None)\n",
" Column names of the historical exogenous features.\n",
" date_features : bool or list of str or callable, optional (default=False)\n",
" Features computed from the dates.\n",
" Can be pandas date attributes or functions that will take the dates as input.\n",
Expand Down Expand Up @@ -1704,6 +1720,7 @@
" finetune_depth=finetune_depth,\n",
" finetune_loss=finetune_loss,\n",
" clean_ex_first=clean_ex_first,\n",
" hist_exog_list=hist_exog_list,\n",
" date_features=date_features,\n",
" date_features_to_one_hot=date_features_to_one_hot,\n",
" model=model,\n",
Expand Down Expand Up @@ -1767,9 +1784,29 @@
" targets = _array_tails(targets, orig_indptr, np.diff(processed.indptr))\n",
" if processed.data.shape[1] > 1:\n",
" X = processed.data[:, 1:].T\n",
" logger.info(f'Using the following exogenous features: {x_cols}')\n",
" if hist_exog_list is None:\n",
" hist_exog = None\n",
" futr_exog = x_cols\n",
" else:\n",
" missing_hist = set(hist_exog_list) - set(x_cols)\n",
" if missing_hist:\n",
" raise ValueError(\n",
" \"The following exogenous features were declared as historic \"\n",
" f\"but were not found in `df`: {missing_hist}.\"\n",
" )\n",
" futr_exog = [c for c in x_cols if c not in hist_exog_list]\n",
" # match the forecast method order [future, historic]\n",
" fcst_features_order = futr_exog + hist_exog_list\n",
" x_idxs = [x_cols.index(c) for c in fcst_features_order]\n",
" X = X[x_idxs]\n",
" hist_exog = [fcst_features_order.index(c) for c in hist_exog_list]\n",
" if futr_exog:\n",
" logger.info(f'Using future exogenous features: {futr_exog}')\n",
" if hist_exog_list:\n",
" logger.info(f'Using historical exogenous features: {hist_exog_list}')\n",
" else:\n",
" X = None\n",
" hist_exog = None\n",
"\n",
" logger.info('Calling Cross Validation Endpoint...')\n",
" payload = {\n",
Expand All @@ -1784,6 +1821,7 @@
" 'step_size': step_size,\n",
" 'freq': standard_freq,\n",
" 'clean_ex_first': clean_ex_first,\n",
" 'hist_exog': hist_exog,\n",
" 'level': level,\n",
" 'finetune_steps': finetune_steps,\n",
" 'finetune_depth': finetune_depth,\n",
Expand Down Expand Up @@ -2205,6 +2243,48 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# historic exog in cv\n",
"freq = 'D'\n",
"h = 5\n",
"series = generate_series(2, freq=freq)\n",
"series_with_features, _ = fourier(series, freq=freq, season_length=7, k=2)\n",
"splits = ufp.backtest_splits(\n",
" df=series_with_features,\n",
" n_windows=1,\n",
" h=h,\n",
" id_col='unique_id',\n",
" time_col='ds',\n",
" freq=freq,\n",
")\n",
"_, train, valid = next(splits)\n",
"x_cols = train.columns.drop(['unique_id', 'ds', 'y']).tolist()\n",
"for hist_exog_list in [None, [], [x_cols[2], x_cols[1]], x_cols]:\n",
" cv_res = nixtla_client.cross_validation(\n",
" series_with_features,\n",
" n_windows=1,\n",
" h=h,\n",
" freq=freq,\n",
" hist_exog_list=hist_exog_list,\n",
" )\n",
" fcst_res = nixtla_client.forecast(\n",
" train,\n",
" h=h,\n",
" freq=freq,\n",
" hist_exog_list=hist_exog_list,\n",
" X_df=valid,\n",
" )\n",
" np.testing.assert_allclose(\n",
" cv_res['TimeGPT'], fcst_res['TimeGPT'], atol=1e-4, rtol=1e-3\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
52 changes: 44 additions & 8 deletions nixtla/nixtla_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,8 @@ def _validate_exog(
target_col: str,
hist_exog: Optional[list[str]],
) -> tuple[DFType, Optional[DFType]]:
exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)]
base_cols = {id_col, time_col, target_col}
exogs = [c for c in df.columns if c not in base_cols]
if hist_exog is None:
hist_exog = []
if X_df is None:
Expand All @@ -367,7 +368,7 @@ def _validate_exog(
return df, None

# exogs in df that were declared as neither historic nor future
futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)]
futr_exog = [c for c in X_df.columns if c not in base_cols]
declared_exogs = {*hist_exog, *futr_exog}
ignored_exogs = [c for c in exogs if c not in declared_exogs]
if ignored_exogs:
Expand All @@ -385,6 +386,15 @@ def _validate_exog(
f"but not in `df`: {missing_futr}."
)

# features are provided through X_df but declared as historic
futr_and_hist = set(futr_exog) & set(hist_exog)
if futr_and_hist:
warnings.warn(
"The following features were declared as historic but found in `X_df`: "
f"{futr_and_hist}, they will be considered as historic."
)
futr_exog = [f for f in futr_exog if f not in hist_exog]

# Make sure df and X_df are in right order
df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]]
X_df = X_df[[id_col, time_col, *futr_exog]]
Expand Down Expand Up @@ -469,7 +479,7 @@ def _preprocess(
processed = ufp.process_df(
df=df, id_col=id_col, time_col=time_col, target_col=target_col
)
if X_df is not None:
if X_df is not None and X_df.shape[1] > 2:
X_df = ensure_time_dtype(X_df, time_col=time_col)
processed_X = ufp.process_df(
df=X_df,
Expand Down Expand Up @@ -1158,7 +1168,7 @@ def forecast(
X = processed.data[:, 1:].T
if futr_cols is not None:
logger.info(f"Using future exogenous features: {futr_cols}")
if hist_exog_list is not None:
if hist_exog_list:
logger.info(f"Using historical exogenous features: {hist_exog_list}")
else:
X = None
Expand Down Expand Up @@ -1481,6 +1491,7 @@ def _distributed_cross_validation(
finetune_depth: _Finetune_Depth,
finetune_loss: _Loss,
clean_ex_first: bool,
hist_exog_list: Optional[list[str]],
date_features: Union[bool, Sequence[Union[str, Callable]]],
date_features_to_one_hot: Union[bool, list[str]],
model: _Model,
Expand All @@ -1490,7 +1501,7 @@ def _distributed_cross_validation(

schema, partition_config = _distributed_setup(
df=df,
method="forecast",
method="cross_validation",
id_col=id_col,
time_col=time_col,
target_col=target_col,
Expand Down Expand Up @@ -1518,6 +1529,7 @@ def _distributed_cross_validation(
finetune_depth=finetune_depth,
finetune_loss=finetune_loss,
clean_ex_first=clean_ex_first,
hist_exog_list=hist_exog_list,
date_features=date_features,
date_features_to_one_hot=date_features_to_one_hot,
model=model,
Expand Down Expand Up @@ -1545,6 +1557,7 @@ def cross_validation(
finetune_depth: _Finetune_Depth = 1,
finetune_loss: _Loss = "default",
clean_ex_first: bool = True,
hist_exog_list: Optional[list[str]] = None,
date_features: Union[bool, list[str]] = False,
date_features_to_one_hot: Union[bool, list[str]] = False,
model: _Model = "timegpt-1",
Expand Down Expand Up @@ -1601,8 +1614,9 @@ def cross_validation(
finetune_loss : str (default='default')
Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`.
clean_ex_first : bool (default=True)
Clean exogenous signal before making forecasts
using TimeGPT.
Clean exogenous signal before making forecasts using TimeGPT.
hist_exog_list : list of str, optional (default=None)
Column names of the historical exogenous features.
date_features : bool or list of str or callable, optional (default=False)
Features computed from the dates.
Can be pandas date attributes or functions that will take the dates as input.
Expand Down Expand Up @@ -1644,6 +1658,7 @@ def cross_validation(
finetune_depth=finetune_depth,
finetune_loss=finetune_loss,
clean_ex_first=clean_ex_first,
hist_exog_list=hist_exog_list,
date_features=date_features,
date_features_to_one_hot=date_features_to_one_hot,
model=model,
Expand Down Expand Up @@ -1707,9 +1722,29 @@ def cross_validation(
targets = _array_tails(targets, orig_indptr, np.diff(processed.indptr))
if processed.data.shape[1] > 1:
X = processed.data[:, 1:].T
logger.info(f"Using the following exogenous features: {x_cols}")
if hist_exog_list is None:
hist_exog = None
futr_exog = x_cols
else:
missing_hist = set(hist_exog_list) - set(x_cols)
if missing_hist:
raise ValueError(
"The following exogenous features were declared as historic "
f"but were not found in `df`: {missing_hist}."
)
futr_exog = [c for c in x_cols if c not in hist_exog_list]
# match the forecast method order [future, historic]
fcst_features_order = futr_exog + hist_exog_list
x_idxs = [x_cols.index(c) for c in fcst_features_order]
X = X[x_idxs]
hist_exog = [fcst_features_order.index(c) for c in hist_exog_list]
if futr_exog:
logger.info(f"Using future exogenous features: {futr_exog}")
if hist_exog_list:
logger.info(f"Using historical exogenous features: {hist_exog_list}")
else:
X = None
hist_exog = None

logger.info("Calling Cross Validation Endpoint...")
payload = {
Expand All @@ -1724,6 +1759,7 @@ def cross_validation(
"step_size": step_size,
"freq": standard_freq,
"clean_ex_first": clean_ex_first,
"hist_exog": hist_exog,
"level": level,
"finetune_steps": finetune_steps,
"finetune_depth": finetune_depth,
Expand Down

0 comments on commit b40c2d0

Please sign in to comment.