feat: add hist_exog_list argument to cross_validation

Nixtla · Nov 7, 2024 · b40c2d0 · b40c2d0
1 parent b420490
commit b40c2d0
Show file tree

Hide file tree

Showing 2 changed files with 133 additions and 17 deletions.
diff --git a/nbs/src/nixtla_client.ipynb b/nbs/src/nixtla_client.ipynb
@@ -145,6 +145,7 @@
     "from dotenv import load_dotenv\n",
     "from fastcore.test import test_eq, test_fail\n",
     "from utilsforecast.data import generate_series\n",
+    "from utilsforecast.feature_engineering import fourier\n",
     "\n",
     "from nixtla.date_features import SpecialDates"
    ]
@@ -420,7 +421,8 @@
     "    target_col: str,\n",
     "    hist_exog: Optional[list[str]],\n",
     ") -> tuple[DFType, Optional[DFType]]:\n",
-    "    exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n",
+    "    base_cols = {id_col, time_col, target_col}\n",
+    "    exogs = [c for c in df.columns if c not in base_cols]\n",
     "    if hist_exog is None:\n",
     "        hist_exog = []\n",
     "    if X_df is None:\n",
@@ -437,7 +439,7 @@
     "        return df, None\n",
     "\n",
     "    # exogs in df that weren't declared as historic nor future\n",
-    "    futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)]\n",
+    "    futr_exog = [c for c in X_df.columns if c not in base_cols]\n",
     "    declared_exogs = {*hist_exog, *futr_exog}\n",
     "    ignored_exogs = [c for c in exogs if c not in declared_exogs]\n",
     "    if ignored_exogs:\n",
@@ -455,6 +457,15 @@
     "            f\"but not in `df`: {missing_futr}.\"\n",
     "        )\n",
     "\n",
+    "    # features are provided through X_df but declared as historic\n",
+    "    futr_and_hist = set(futr_exog) & set(hist_exog)\n",
+    "    if futr_and_hist:\n",
+    "        warnings.warn(\n",
+    "            \"The following features were declared as historic but found in `X_df`: \"\n",
+    "            f\"{futr_and_hist}, they will be considered as historic.\"\n",
+    "        )\n",
+    "        futr_exog = [f for f in futr_exog if f not in hist_exog]\n",
+    "\n",
     "    # Make sure df and X_df are in right order\n",
     "    df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]]\n",
     "    X_df = X_df[[id_col, time_col, *futr_exog]]\n",
@@ -537,7 +548,7 @@
     "    processed = ufp.process_df(\n",
     "        df=df, id_col=id_col, time_col=time_col, target_col=target_col\n",
     "    )\n",
-    "    if X_df is not None:\n",
+    "    if X_df is not None and X_df.shape[1] > 2:\n",
     "        X_df = ensure_time_dtype(X_df, time_col=time_col)\n",
     "        processed_X = ufp.process_df(\n",
     "            df=X_df, id_col=id_col, time_col=time_col, target_col=None,\n",
@@ -718,7 +729,8 @@
     "                if isinstance(v, np.ndarray):\n",
     "                    if np.issubdtype(v.dtype, np.floating):\n",
     "                        v_cont = np.ascontiguousarray(v, dtype=np.float32)\n",
-    "                        d[k] = np.nan_to_num(v_cont, \n",
+    "                        d[k] = np.nan_to_num(\n",
+    "                            v_cont, \n",
     "                            nan=np.nan, \n",
     "                            posinf=np.finfo(np.float32).max, \n",
     "                            neginf=np.finfo(np.float32).min,\n",
@@ -1226,7 +1238,7 @@
     "            X = processed.data[:, 1:].T\n",
     "            if futr_cols is not None:\n",
     "                logger.info(f'Using future exogenous features: {futr_cols}')\n",
-    "            if hist_exog_list is not None:\n",
+    "            if hist_exog_list:\n",
     "                logger.info(f'Using historical exogenous features: {hist_exog_list}')\n",
     "        else:\n",
     "            X = None\n",
@@ -1541,6 +1553,7 @@
     "        finetune_depth: _Finetune_Depth,\n",
     "        finetune_loss: _Loss,\n",
     "        clean_ex_first: bool,\n",
+    "        hist_exog_list: Optional[list[str]],\n",
     "        date_features: Union[bool, Sequence[Union[str, Callable]]],\n",
     "        date_features_to_one_hot: Union[bool, list[str]],\n",
     "        model: _Model,\n",
@@ -1550,7 +1563,7 @@
     "    \n",
     "        schema, partition_config = _distributed_setup(\n",
     "            df=df,\n",
-    "            method='forecast',\n",
+    "            method='cross_validation',\n",
     "            id_col=id_col,\n",
     "            time_col=time_col,\n",
     "            target_col=target_col,\n",
@@ -1578,6 +1591,7 @@
     "                finetune_depth=finetune_depth,\n",
     "                finetune_loss=finetune_loss,\n",
     "                clean_ex_first=clean_ex_first,\n",
+    "                hist_exog_list=hist_exog_list,\n",
     "                date_features=date_features,\n",
     "                date_features_to_one_hot=date_features_to_one_hot,\n",
     "                model=model,\n",
@@ -1605,6 +1619,7 @@
     "        finetune_depth: _Finetune_Depth = 1,\n",
     "        finetune_loss: _Loss = 'default',\n",
     "        clean_ex_first: bool = True,\n",
+    "        hist_exog_list: Optional[list[str]] = None,\n",
     "        date_features: Union[bool, list[str]] = False,\n",
     "        date_features_to_one_hot: Union[bool, list[str]] = False,\n",
     "        model: _Model = 'timegpt-1',\n",
@@ -1661,8 +1676,9 @@
     "        finetune_loss : str (default='default')\n",
     "            Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`.\n",
     "        clean_ex_first : bool (default=True)\n",
-    "            Clean exogenous signal before making forecasts\n",
-    "            using TimeGPT.\n",
+    "            Clean exogenous signal before making forecasts using TimeGPT.\n",
+    "        hist_exog_list : list of str, optional (default=None)\n",
+    "            Column names of the historical exogenous features.\n",
     "        date_features : bool or list of str or callable, optional (default=False)\n",
     "            Features computed from the dates.\n",
     "            Can be pandas date attributes or functions that will take the dates as input.\n",
@@ -1704,6 +1720,7 @@
     "                finetune_depth=finetune_depth,\n",
     "                finetune_loss=finetune_loss,\n",
     "                clean_ex_first=clean_ex_first,\n",
+    "                hist_exog_list=hist_exog_list,\n",
     "                date_features=date_features,\n",
     "                date_features_to_one_hot=date_features_to_one_hot,\n",
     "                model=model,\n",
@@ -1767,9 +1784,29 @@
     "            targets = _array_tails(targets, orig_indptr, np.diff(processed.indptr))\n",
     "        if processed.data.shape[1] > 1:\n",
     "            X = processed.data[:, 1:].T\n",
-    "            logger.info(f'Using the following exogenous features: {x_cols}')\n",
+    "            if hist_exog_list is None:\n",
+    "                hist_exog = None\n",
+    "                futr_exog = x_cols\n",
+    "            else:\n",
+    "                missing_hist = set(hist_exog_list) - set(x_cols)\n",
+    "                if missing_hist:\n",
+    "                    raise ValueError(\n",
+    "                        \"The following exogenous features were declared as historic \"\n",
+    "                        f\"but were not found in `df`: {missing_hist}.\"\n",
+    "                    )\n",
+    "                futr_exog = [c for c in x_cols if c not in hist_exog_list]\n",
+    "                # match the forecast method order [future, historic]\n",
+    "                fcst_features_order = futr_exog + hist_exog_list\n",
+    "                x_idxs = [x_cols.index(c) for c in fcst_features_order]\n",
+    "                X = X[x_idxs]\n",
+    "                hist_exog = [fcst_features_order.index(c) for c in hist_exog_list]\n",
+    "            if futr_exog:\n",
+    "                logger.info(f'Using future exogenous features: {futr_exog}')\n",
+    "            if hist_exog_list:\n",
+    "                logger.info(f'Using historical exogenous features: {hist_exog_list}')\n",
     "        else:\n",
     "            X = None\n",
+    "            hist_exog = None\n",
     "\n",
     "        logger.info('Calling Cross Validation Endpoint...')\n",
     "        payload = {\n",
@@ -1784,6 +1821,7 @@
     "            'step_size': step_size,\n",
     "            'freq': standard_freq,\n",
     "            'clean_ex_first': clean_ex_first,\n",
+    "            'hist_exog': hist_exog,\n",
     "            'level': level,\n",
     "            'finetune_steps': finetune_steps,\n",
     "            'finetune_depth': finetune_depth,\n",
@@ -2205,6 +2243,48 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "# historic exog in cv\n",
+    "freq = 'D'\n",
+    "h = 5\n",
+    "series = generate_series(2, freq=freq)\n",
+    "series_with_features, _ = fourier(series, freq=freq, season_length=7, k=2)\n",
+    "splits = ufp.backtest_splits(\n",
+    "    df=series_with_features,\n",
+    "    n_windows=1,\n",
+    "    h=h,\n",
+    "    id_col='unique_id',\n",
+    "    time_col='ds',\n",
+    "    freq=freq,\n",
+    ")\n",
+    "_, train, valid = next(splits)\n",
+    "x_cols = train.columns.drop(['unique_id', 'ds', 'y']).tolist()\n",
+    "for hist_exog_list in [None, [], [x_cols[2], x_cols[1]], x_cols]:\n",
+    "    cv_res = nixtla_client.cross_validation(\n",
+    "        series_with_features,\n",
+    "        n_windows=1,\n",
+    "        h=h,\n",
+    "        freq=freq,\n",
+    "        hist_exog_list=hist_exog_list,\n",
+    "    )\n",
+    "    fcst_res = nixtla_client.forecast(\n",
+    "        train,\n",
+    "        h=h,\n",
+    "        freq=freq,\n",
+    "        hist_exog_list=hist_exog_list,\n",
+    "        X_df=valid,\n",
+    "    )\n",
+    "    np.testing.assert_allclose(\n",
+    "        cv_res['TimeGPT'], fcst_res['TimeGPT'], atol=1e-4, rtol=1e-3\n",
+    "    )"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/nixtla/nixtla_client.py b/nixtla/nixtla_client.py
@@ -350,7 +350,8 @@ def _validate_exog(
     target_col: str,
     hist_exog: Optional[list[str]],
 ) -> tuple[DFType, Optional[DFType]]:
-    exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)]
+    base_cols = {id_col, time_col, target_col}
+    exogs = [c for c in df.columns if c not in base_cols]
     if hist_exog is None:
         hist_exog = []
     if X_df is None:
@@ -367,7 +368,7 @@ def _validate_exog(
         return df, None
 
     # exogs in df that weren't declared as historic nor future
-    futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)]
+    futr_exog = [c for c in X_df.columns if c not in base_cols]
     declared_exogs = {*hist_exog, *futr_exog}
     ignored_exogs = [c for c in exogs if c not in declared_exogs]
     if ignored_exogs:
@@ -385,6 +386,15 @@ def _validate_exog(
             f"but not in `df`: {missing_futr}."
         )
 
+    # features are provided through X_df but declared as historic
+    futr_and_hist = set(futr_exog) & set(hist_exog)
+    if futr_and_hist:
+        warnings.warn(
+            "The following features were declared as historic but found in `X_df`: "
+            f"{futr_and_hist}, they will be considered as historic."
+        )
+        futr_exog = [f for f in futr_exog if f not in hist_exog]
+
     # Make sure df and X_df are in right order
     df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]]
     X_df = X_df[[id_col, time_col, *futr_exog]]
@@ -469,7 +479,7 @@ def _preprocess(
     processed = ufp.process_df(
         df=df, id_col=id_col, time_col=time_col, target_col=target_col
     )
-    if X_df is not None:
+    if X_df is not None and X_df.shape[1] > 2:
         X_df = ensure_time_dtype(X_df, time_col=time_col)
         processed_X = ufp.process_df(
             df=X_df,
@@ -1158,7 +1168,7 @@ def forecast(
             X = processed.data[:, 1:].T
             if futr_cols is not None:
                 logger.info(f"Using future exogenous features: {futr_cols}")
-            if hist_exog_list is not None:
+            if hist_exog_list:
                 logger.info(f"Using historical exogenous features: {hist_exog_list}")
         else:
             X = None
@@ -1481,6 +1491,7 @@ def _distributed_cross_validation(
         finetune_depth: _Finetune_Depth,
         finetune_loss: _Loss,
         clean_ex_first: bool,
+        hist_exog_list: Optional[list[str]],
         date_features: Union[bool, Sequence[Union[str, Callable]]],
         date_features_to_one_hot: Union[bool, list[str]],
         model: _Model,
@@ -1490,7 +1501,7 @@ def _distributed_cross_validation(
 
         schema, partition_config = _distributed_setup(
             df=df,
-            method="forecast",
+            method="cross_validation",
             id_col=id_col,
             time_col=time_col,
             target_col=target_col,
@@ -1518,6 +1529,7 @@ def _distributed_cross_validation(
                 finetune_depth=finetune_depth,
                 finetune_loss=finetune_loss,
                 clean_ex_first=clean_ex_first,
+                hist_exog_list=hist_exog_list,
                 date_features=date_features,
                 date_features_to_one_hot=date_features_to_one_hot,
                 model=model,
@@ -1545,6 +1557,7 @@ def cross_validation(
         finetune_depth: _Finetune_Depth = 1,
         finetune_loss: _Loss = "default",
         clean_ex_first: bool = True,
+        hist_exog_list: Optional[list[str]] = None,
         date_features: Union[bool, list[str]] = False,
         date_features_to_one_hot: Union[bool, list[str]] = False,
         model: _Model = "timegpt-1",
@@ -1601,8 +1614,9 @@ def cross_validation(
         finetune_loss : str (default='default')
             Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`.
         clean_ex_first : bool (default=True)
-            Clean exogenous signal before making forecasts
-            using TimeGPT.
+            Clean exogenous signal before making forecasts using TimeGPT.
+        hist_exog_list : list of str, optional (default=None)
+            Column names of the historical exogenous features.
         date_features : bool or list of str or callable, optional (default=False)
             Features computed from the dates.
             Can be pandas date attributes or functions that will take the dates as input.
@@ -1644,6 +1658,7 @@ def cross_validation(
                 finetune_depth=finetune_depth,
                 finetune_loss=finetune_loss,
                 clean_ex_first=clean_ex_first,
+                hist_exog_list=hist_exog_list,
                 date_features=date_features,
                 date_features_to_one_hot=date_features_to_one_hot,
                 model=model,
@@ -1707,9 +1722,29 @@ def cross_validation(
             targets = _array_tails(targets, orig_indptr, np.diff(processed.indptr))
         if processed.data.shape[1] > 1:
             X = processed.data[:, 1:].T
-            logger.info(f"Using the following exogenous features: {x_cols}")
+            if hist_exog_list is None:
+                hist_exog = None
+                futr_exog = x_cols
+            else:
+                missing_hist = set(hist_exog_list) - set(x_cols)
+                if missing_hist:
+                    raise ValueError(
+                        "The following exogenous features were declared as historic "
+                        f"but were not found in `df`: {missing_hist}."
+                    )
+                futr_exog = [c for c in x_cols if c not in hist_exog_list]
+                # match the forecast method order [future, historic]
+                fcst_features_order = futr_exog + hist_exog_list
+                x_idxs = [x_cols.index(c) for c in fcst_features_order]
+                X = X[x_idxs]
+                hist_exog = [fcst_features_order.index(c) for c in hist_exog_list]
+            if futr_exog:
+                logger.info(f"Using future exogenous features: {futr_exog}")
+            if hist_exog_list:
+                logger.info(f"Using historical exogenous features: {hist_exog_list}")
         else:
             X = None
+            hist_exog = None
 
         logger.info("Calling Cross Validation Endpoint...")
         payload = {
@@ -1724,6 +1759,7 @@ def cross_validation(
             "step_size": step_size,
             "freq": standard_freq,
             "clean_ex_first": clean_ex_first,
+            "hist_exog": hist_exog,
             "level": level,
             "finetune_steps": finetune_steps,
             "finetune_depth": finetune_depth,