diff --git a/nbs/src/nixtla_client.ipynb b/nbs/src/nixtla_client.ipynb index 5bf9b8cf..4c80949e 100644 --- a/nbs/src/nixtla_client.ipynb +++ b/nbs/src/nixtla_client.ipynb @@ -145,6 +145,7 @@ "from dotenv import load_dotenv\n", "from fastcore.test import test_eq, test_fail\n", "from utilsforecast.data import generate_series\n", + "from utilsforecast.feature_engineering import fourier\n", "\n", "from nixtla.date_features import SpecialDates" ] @@ -420,7 +421,8 @@ " target_col: str,\n", " hist_exog: Optional[list[str]],\n", ") -> tuple[DFType, Optional[DFType]]:\n", - " exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n", + " base_cols = {id_col, time_col, target_col}\n", + " exogs = [c for c in df.columns if c not in base_cols]\n", " if hist_exog is None:\n", " hist_exog = []\n", " if X_df is None:\n", @@ -437,7 +439,7 @@ " return df, None\n", "\n", " # exogs in df that weren't declared as historic nor future\n", - " futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)]\n", + " futr_exog = [c for c in X_df.columns if c not in base_cols]\n", " declared_exogs = {*hist_exog, *futr_exog}\n", " ignored_exogs = [c for c in exogs if c not in declared_exogs]\n", " if ignored_exogs:\n", @@ -455,6 +457,15 @@ " f\"but not in `df`: {missing_futr}.\"\n", " )\n", "\n", + " # features are provided through X_df but declared as historic\n", + " futr_and_hist = set(futr_exog) & set(hist_exog)\n", + " if futr_and_hist:\n", + " warnings.warn(\n", + " \"The following features were declared as historic but found in `X_df`: \"\n", + " f\"{futr_and_hist}, they will be considered as historic.\"\n", + " )\n", + " futr_exog = [f for f in futr_exog if f not in hist_exog]\n", + "\n", " # Make sure df and X_df are in right order\n", " df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]]\n", " X_df = X_df[[id_col, time_col, *futr_exog]]\n", @@ -537,7 +548,7 @@ " processed = ufp.process_df(\n", " df=df, id_col=id_col, time_col=time_col, target_col=target_col\n", " )\n", - " if X_df is not None:\n", + " if X_df is not None and X_df.shape[1] > 2:\n", " X_df = ensure_time_dtype(X_df, time_col=time_col)\n", " processed_X = ufp.process_df(\n", " df=X_df, id_col=id_col, time_col=time_col, target_col=None,\n", @@ -718,7 +729,8 @@ " if isinstance(v, np.ndarray):\n", " if np.issubdtype(v.dtype, np.floating):\n", " v_cont = np.ascontiguousarray(v, dtype=np.float32)\n", - " d[k] = np.nan_to_num(v_cont, \n", + " d[k] = np.nan_to_num(\n", + " v_cont, \n", " nan=np.nan, \n", " posinf=np.finfo(np.float32).max, \n", " neginf=np.finfo(np.float32).min,\n", @@ -1226,7 +1238,7 @@ " X = processed.data[:, 1:].T\n", " if futr_cols is not None:\n", " logger.info(f'Using future exogenous features: {futr_cols}')\n", - " if hist_exog_list is not None:\n", + " if hist_exog_list:\n", " logger.info(f'Using historical exogenous features: {hist_exog_list}')\n", " else:\n", " X = None\n", @@ -1541,6 +1553,7 @@ " finetune_depth: _Finetune_Depth,\n", " finetune_loss: _Loss,\n", " clean_ex_first: bool,\n", + " hist_exog_list: Optional[list[str]],\n", " date_features: Union[bool, Sequence[Union[str, Callable]]],\n", " date_features_to_one_hot: Union[bool, list[str]],\n", " model: _Model,\n", @@ -1550,7 +1563,7 @@ " \n", " schema, partition_config = _distributed_setup(\n", " df=df,\n", - " method='forecast',\n", + " method='cross_validation',\n", " id_col=id_col,\n", " time_col=time_col,\n", " target_col=target_col,\n", @@ -1578,6 +1591,7 @@ " finetune_depth=finetune_depth,\n", " finetune_loss=finetune_loss,\n", " clean_ex_first=clean_ex_first,\n", + " hist_exog_list=hist_exog_list,\n", " date_features=date_features,\n", " date_features_to_one_hot=date_features_to_one_hot,\n", " model=model,\n", @@ -1605,6 +1619,7 @@ " finetune_depth: _Finetune_Depth = 1,\n", " finetune_loss: _Loss = 'default',\n", " clean_ex_first: bool = True,\n", + " hist_exog_list: Optional[list[str]] = None,\n", " date_features: Union[bool, list[str]] = False,\n", " date_features_to_one_hot: Union[bool, list[str]] = False,\n", " model: _Model = 'timegpt-1',\n", @@ -1661,8 +1676,9 @@ " finetune_loss : str (default='default')\n", " Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`.\n", " clean_ex_first : bool (default=True)\n", - " Clean exogenous signal before making forecasts\n", - " using TimeGPT.\n", + " Clean exogenous signal before making forecasts using TimeGPT.\n", + " hist_exog_list : list of str, optional (default=None)\n", + " Column names of the historical exogenous features.\n", " date_features : bool or list of str or callable, optional (default=False)\n", " Features computed from the dates.\n", " Can be pandas date attributes or functions that will take the dates as input.\n", @@ -1704,6 +1720,7 @@ " finetune_depth=finetune_depth,\n", " finetune_loss=finetune_loss,\n", " clean_ex_first=clean_ex_first,\n", + " hist_exog_list=hist_exog_list,\n", " date_features=date_features,\n", " date_features_to_one_hot=date_features_to_one_hot,\n", " model=model,\n", @@ -1767,9 +1784,29 @@ " targets = _array_tails(targets, orig_indptr, np.diff(processed.indptr))\n", " if processed.data.shape[1] > 1:\n", " X = processed.data[:, 1:].T\n", - " logger.info(f'Using the following exogenous features: {x_cols}')\n", + " if hist_exog_list is None:\n", + " hist_exog = None\n", + " futr_exog = x_cols\n", + " else:\n", + " missing_hist = set(hist_exog_list) - set(x_cols)\n", + " if missing_hist:\n", + " raise ValueError(\n", + " \"The following exogenous features were declared as historic \"\n", + " f\"but were not found in `df`: {missing_hist}.\"\n", + " )\n", + " futr_exog = [c for c in x_cols if c not in hist_exog_list]\n", + " # match the forecast method order [future, historic]\n", + " fcst_features_order = futr_exog + hist_exog_list\n", + " x_idxs = [x_cols.index(c) for c in fcst_features_order]\n", + " X = X[x_idxs]\n", + " hist_exog = [fcst_features_order.index(c) for c in hist_exog_list]\n", + " if futr_exog:\n", + " logger.info(f'Using future exogenous features: {futr_exog}')\n", + " if hist_exog_list:\n", + " logger.info(f'Using historical exogenous features: {hist_exog_list}')\n", " else:\n", " X = None\n", + " hist_exog = None\n", "\n", " logger.info('Calling Cross Validation Endpoint...')\n", " payload = {\n", @@ -1784,6 +1821,7 @@ " 'step_size': step_size,\n", " 'freq': standard_freq,\n", " 'clean_ex_first': clean_ex_first,\n", + " 'hist_exog': hist_exog,\n", " 'level': level,\n", " 'finetune_steps': finetune_steps,\n", " 'finetune_depth': finetune_depth,\n", @@ -2205,6 +2243,48 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "# historic exog in cv\n", + "freq = 'D'\n", + "h = 5\n", + "series = generate_series(2, freq=freq)\n", + "series_with_features, _ = fourier(series, freq=freq, season_length=7, k=2)\n", + "splits = ufp.backtest_splits(\n", + " df=series_with_features,\n", + " n_windows=1,\n", + " h=h,\n", + " id_col='unique_id',\n", + " time_col='ds',\n", + " freq=freq,\n", + ")\n", + "_, train, valid = next(splits)\n", + "x_cols = train.columns.drop(['unique_id', 'ds', 'y']).tolist()\n", + "for hist_exog_list in [None, [], [x_cols[2], x_cols[1]], x_cols]:\n", + " cv_res = nixtla_client.cross_validation(\n", + " series_with_features,\n", + " n_windows=1,\n", + " h=h,\n", + " freq=freq,\n", + " hist_exog_list=hist_exog_list,\n", + " )\n", + " fcst_res = nixtla_client.forecast(\n", + " train,\n", + " h=h,\n", + " freq=freq,\n", + " hist_exog_list=hist_exog_list,\n", + " X_df=valid,\n", + " )\n", + " np.testing.assert_allclose(\n", + " cv_res['TimeGPT'], fcst_res['TimeGPT'], atol=1e-4, rtol=1e-3\n", + " )" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nixtla/nixtla_client.py b/nixtla/nixtla_client.py index 514323f1..5e8b4bbf 100644 --- a/nixtla/nixtla_client.py +++ b/nixtla/nixtla_client.py @@ -350,7 +350,8 @@ def _validate_exog( target_col: str, hist_exog: Optional[list[str]], ) -> tuple[DFType, Optional[DFType]]: - exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)] + base_cols = {id_col, time_col, target_col} + exogs = [c for c in df.columns if c not in base_cols] if hist_exog is None: hist_exog = [] if X_df is None: @@ -367,7 +368,7 @@ def _validate_exog( return df, None # exogs in df that weren't declared as historic nor future - futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)] + futr_exog = [c for c in X_df.columns if c not in base_cols] declared_exogs = {*hist_exog, *futr_exog} ignored_exogs = [c for c in exogs if c not in declared_exogs] if ignored_exogs: @@ -385,6 +386,15 @@ def _validate_exog( f"but not in `df`: {missing_futr}." ) + # features are provided through X_df but declared as historic + futr_and_hist = set(futr_exog) & set(hist_exog) + if futr_and_hist: + warnings.warn( + "The following features were declared as historic but found in `X_df`: " + f"{futr_and_hist}, they will be considered as historic." + ) + futr_exog = [f for f in futr_exog if f not in hist_exog] + # Make sure df and X_df are in right order df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]] X_df = X_df[[id_col, time_col, *futr_exog]] @@ -469,7 +479,7 @@ def _preprocess( processed = ufp.process_df( df=df, id_col=id_col, time_col=time_col, target_col=target_col ) - if X_df is not None: + if X_df is not None and X_df.shape[1] > 2: X_df = ensure_time_dtype(X_df, time_col=time_col) processed_X = ufp.process_df( df=X_df, @@ -1158,7 +1168,7 @@ def forecast( X = processed.data[:, 1:].T if futr_cols is not None: logger.info(f"Using future exogenous features: {futr_cols}") - if hist_exog_list is not None: + if hist_exog_list: logger.info(f"Using historical exogenous features: {hist_exog_list}") else: X = None @@ -1481,6 +1491,7 @@ def _distributed_cross_validation( finetune_depth: _Finetune_Depth, finetune_loss: _Loss, clean_ex_first: bool, + hist_exog_list: Optional[list[str]], date_features: Union[bool, Sequence[Union[str, Callable]]], date_features_to_one_hot: Union[bool, list[str]], model: _Model, @@ -1490,7 +1501,7 @@ def _distributed_cross_validation( schema, partition_config = _distributed_setup( df=df, - method="forecast", + method="cross_validation", id_col=id_col, time_col=time_col, target_col=target_col, @@ -1518,6 +1529,7 @@ def _distributed_cross_validation( finetune_depth=finetune_depth, finetune_loss=finetune_loss, clean_ex_first=clean_ex_first, + hist_exog_list=hist_exog_list, date_features=date_features, date_features_to_one_hot=date_features_to_one_hot, model=model, @@ -1545,6 +1557,7 @@ def cross_validation( finetune_depth: _Finetune_Depth = 1, finetune_loss: _Loss = "default", clean_ex_first: bool = True, + hist_exog_list: Optional[list[str]] = None, date_features: Union[bool, list[str]] = False, date_features_to_one_hot: Union[bool, list[str]] = False, model: _Model = "timegpt-1", @@ -1601,8 +1614,9 @@ def cross_validation( finetune_loss : str (default='default') Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`. clean_ex_first : bool (default=True) - Clean exogenous signal before making forecasts - using TimeGPT. + Clean exogenous signal before making forecasts using TimeGPT. + hist_exog_list : list of str, optional (default=None) + Column names of the historical exogenous features. date_features : bool or list of str or callable, optional (default=False) Features computed from the dates. Can be pandas date attributes or functions that will take the dates as input. @@ -1644,6 +1658,7 @@ def cross_validation( finetune_depth=finetune_depth, finetune_loss=finetune_loss, clean_ex_first=clean_ex_first, + hist_exog_list=hist_exog_list, date_features=date_features, date_features_to_one_hot=date_features_to_one_hot, model=model, @@ -1707,9 +1722,29 @@ def cross_validation( targets = _array_tails(targets, orig_indptr, np.diff(processed.indptr)) if processed.data.shape[1] > 1: X = processed.data[:, 1:].T - logger.info(f"Using the following exogenous features: {x_cols}") + if hist_exog_list is None: + hist_exog = None + futr_exog = x_cols + else: + missing_hist = set(hist_exog_list) - set(x_cols) + if missing_hist: + raise ValueError( + "The following exogenous features were declared as historic " + f"but were not found in `df`: {missing_hist}." + ) + futr_exog = [c for c in x_cols if c not in hist_exog_list] + # match the forecast method order [future, historic] + fcst_features_order = futr_exog + hist_exog_list + x_idxs = [x_cols.index(c) for c in fcst_features_order] + X = X[x_idxs] + hist_exog = [fcst_features_order.index(c) for c in hist_exog_list] + if futr_exog: + logger.info(f"Using future exogenous features: {futr_exog}") + if hist_exog_list: + logger.info(f"Using historical exogenous features: {hist_exog_list}") else: X = None + hist_exog = None logger.info("Calling Cross Validation Endpoint...") payload = { @@ -1724,6 +1759,7 @@ def cross_validation( "step_size": step_size, "freq": standard_freq, "clean_ex_first": clean_ex_first, + "hist_exog": hist_exog, "level": level, "finetune_steps": finetune_steps, "finetune_depth": finetune_depth,