Nixtla · jmoralez · Nov 25, 2024 · Nov 7, 2024 · Nov 12, 2024 · Nov 13, 2024
diff --git a/nbs/src/nixtla_client.ipynb b/nbs/src/nixtla_client.ipynb
@@ -145,6 +145,7 @@
     "from dotenv import load_dotenv\n",
     "from fastcore.test import test_eq, test_fail\n",
     "from utilsforecast.data import generate_series\n",
+    "from utilsforecast.feature_engineering import fourier\n",
     "\n",
     "from nixtla.date_features import SpecialDates"
    ]
@@ -420,7 +421,8 @@
     "    target_col: str,\n",
     "    hist_exog: Optional[list[str]],\n",
     ") -> tuple[DFType, Optional[DFType]]:\n",
-    "    exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n",
+    "    base_cols = {id_col, time_col, target_col}\n",
+    "    exogs = [c for c in df.columns if c not in base_cols]\n",
     "    if hist_exog is None:\n",
     "        hist_exog = []\n",
     "    if X_df is None:\n",
@@ -437,7 +439,7 @@
     "        return df, None\n",
     "\n",
     "    # exogs in df that weren't declared as historic nor future\n",
-    "    futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)]\n",
+    "    futr_exog = [c for c in X_df.columns if c not in base_cols]\n",
     "    declared_exogs = {*hist_exog, *futr_exog}\n",
     "    ignored_exogs = [c for c in exogs if c not in declared_exogs]\n",
     "    if ignored_exogs:\n",
@@ -455,6 +457,15 @@
     "            f\"but not in `df`: {missing_futr}.\"\n",
     "        )\n",
     "\n",
+    "    # features present in X_df but declared as historic are treated as historic only\n",
+    "    futr_and_hist = set(futr_exog) & set(hist_exog)\n",
+    "    if futr_and_hist:\n",
+    "        warnings.warn(\n",
+    "            \"The following features were declared as historic but found in `X_df`: \"\n",
+    "            f\"{futr_and_hist}, they will be considered as historic.\"\n",
+    "        )\n",
+    "        futr_exog = [f for f in futr_exog if f not in hist_exog]\n",
+    "\n",
     "    # Make sure df and X_df are in right order\n",
     "    df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]]\n",
     "    X_df = X_df[[id_col, time_col, *futr_exog]]\n",
@@ -537,7 +548,7 @@
     "    processed = ufp.process_df(\n",
     "        df=df, id_col=id_col, time_col=time_col, target_col=target_col\n",
     "    )\n",
-    "    if X_df is not None:\n",
+    "    if X_df is not None and X_df.shape[1] > 2:\n",
     "        X_df = ensure_time_dtype(X_df, time_col=time_col)\n",
     "        processed_X = ufp.process_df(\n",
     "            df=X_df, id_col=id_col, time_col=time_col, target_col=None,\n",
@@ -718,7 +729,8 @@
     "                if isinstance(v, np.ndarray):\n",
     "                    if np.issubdtype(v.dtype, np.floating):\n",
     "                        v_cont = np.ascontiguousarray(v, dtype=np.float32)\n",
-    "                        d[k] = np.nan_to_num(v_cont, \n",
+    "                        d[k] = np.nan_to_num(\n",
+    "                            v_cont, \n",
     "                            nan=np.nan, \n",
     "                            posinf=np.finfo(np.float32).max, \n",
     "                            neginf=np.finfo(np.float32).min,\n",
@@ -967,6 +979,7 @@
     "        finetune_depth: _Finetune_Depth,\n",
     "        finetune_loss: _Loss,\n",
     "        clean_ex_first: bool,\n",
+    "        hist_exog_list: Optional[list[str]],\n",
     "        validate_api_key: bool,\n",
     "        add_history: bool,\n",
     "        date_features: Union[bool, list[Union[str, Callable]]],\n",
@@ -1023,6 +1036,7 @@
     "                finetune_depth=finetune_depth,\n",
     "                finetune_loss=finetune_loss,\n",
     "                clean_ex_first=clean_ex_first,\n",
+    "                hist_exog_list=hist_exog_list,\n",
     "                validate_api_key=validate_api_key,\n",
     "                add_history=add_history,\n",
     "                date_features=date_features,\n",
@@ -1158,6 +1172,7 @@
     "                finetune_depth=finetune_depth,\n",
     "                finetune_loss=finetune_loss,\n",
     "                clean_ex_first=clean_ex_first,\n",
+    "                hist_exog_list=hist_exog_list,\n",
     "                validate_api_key=validate_api_key,\n",
     "                add_history=add_history,\n",
     "                date_features=date_features,\n",
@@ -1226,7 +1241,7 @@
     "            X = processed.data[:, 1:].T\n",
     "            if futr_cols is not None:\n",
     "                logger.info(f'Using future exogenous features: {futr_cols}')\n",
-    "            if hist_exog_list is not None:\n",
+    "            if hist_exog_list:\n",
     "                logger.info(f'Using historical exogenous features: {hist_exog_list}')\n",
     "        else:\n",
     "            X = None\n",
@@ -1541,6 +1556,7 @@
     "        finetune_depth: _Finetune_Depth,\n",
     "        finetune_loss: _Loss,\n",
     "        clean_ex_first: bool,\n",
+    "        hist_exog_list: Optional[list[str]],\n",
     "        date_features: Union[bool, Sequence[Union[str, Callable]]],\n",
     "        date_features_to_one_hot: Union[bool, list[str]],\n",
     "        model: _Model,\n",
@@ -1550,7 +1566,7 @@
     "    \n",
     "        schema, partition_config = _distributed_setup(\n",
     "            df=df,\n",
-    "            method='forecast',\n",
+    "            method='cross_validation',\n",
     "            id_col=id_col,\n",
     "            time_col=time_col,\n",
     "            target_col=target_col,\n",
@@ -1578,6 +1594,7 @@
     "                finetune_depth=finetune_depth,\n",
     "                finetune_loss=finetune_loss,\n",
     "                clean_ex_first=clean_ex_first,\n",
+    "                hist_exog_list=hist_exog_list,\n",
     "                date_features=date_features,\n",
     "                date_features_to_one_hot=date_features_to_one_hot,\n",
     "                model=model,\n",
@@ -1605,6 +1622,7 @@
     "        finetune_depth: _Finetune_Depth = 1,\n",
     "        finetune_loss: _Loss = 'default',\n",
     "        clean_ex_first: bool = True,\n",
+    "        hist_exog_list: Optional[list[str]] = None,\n",
     "        date_features: Union[bool, list[str]] = False,\n",
     "        date_features_to_one_hot: Union[bool, list[str]] = False,\n",
     "        model: _Model = 'timegpt-1',\n",
@@ -1661,8 +1679,9 @@
     "        finetune_loss : str (default='default')\n",
     "            Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`.\n",
     "        clean_ex_first : bool (default=True)\n",
-    "            Clean exogenous signal before making forecasts\n",
-    "            using TimeGPT.\n",
+    "            Clean exogenous signal before making forecasts using TimeGPT.\n",
+    "        hist_exog_list : list of str, optional (default=None)\n",
+    "            Column names of the historical exogenous features.\n",
     "        date_features : bool or list of str or callable, optional (default=False)\n",
     "            Features computed from the dates.\n",
     "            Can be pandas date attributes or functions that will take the dates as input.\n",
@@ -1704,6 +1723,7 @@
     "                finetune_depth=finetune_depth,\n",
     "                finetune_loss=finetune_loss,\n",
     "                clean_ex_first=clean_ex_first,\n",
+    "                hist_exog_list=hist_exog_list,\n",
     "                date_features=date_features,\n",
     "                date_features_to_one_hot=date_features_to_one_hot,\n",
     "                model=model,\n",
@@ -1767,9 +1787,29 @@
     "            targets = _array_tails(targets, orig_indptr, np.diff(processed.indptr))\n",
     "        if processed.data.shape[1] > 1:\n",
     "            X = processed.data[:, 1:].T\n",
-    "            logger.info(f'Using the following exogenous features: {x_cols}')\n",
+    "            if hist_exog_list is None:\n",
+    "                hist_exog = None\n",
+    "                futr_exog = x_cols\n",
+    "            else:\n",
+    "                missing_hist = set(hist_exog_list) - set(x_cols)\n",
+    "                if missing_hist:\n",
+    "                    raise ValueError(\n",
+    "                        \"The following exogenous features were declared as historic \"\n",
+    "                        f\"but were not found in `df`: {missing_hist}.\"\n",
+    "                    )\n",
+    "                futr_exog = [c for c in x_cols if c not in hist_exog_list]\n",
+    "                # match the forecast method order [future, historic]\n",
+    "                fcst_features_order = futr_exog + hist_exog_list\n",
+    "                x_idxs = [x_cols.index(c) for c in fcst_features_order]\n",
+    "                X = X[x_idxs]\n",
+    "                hist_exog = [fcst_features_order.index(c) for c in hist_exog_list]\n",
+    "            if futr_exog:\n",
+    "                logger.info(f'Using future exogenous features: {futr_exog}')\n",
+    "            if hist_exog_list:\n",
+    "                logger.info(f'Using historical exogenous features: {hist_exog_list}')\n",
     "        else:\n",
     "            X = None\n",
+    "            hist_exog = None\n",
     "\n",
     "        logger.info('Calling Cross Validation Endpoint...')\n",
     "        payload = {\n",
@@ -1784,6 +1824,7 @@
     "            'step_size': step_size,\n",
     "            'freq': standard_freq,\n",
     "            'clean_ex_first': clean_ex_first,\n",
+    "            'hist_exog': hist_exog,\n",
     "            'level': level,\n",
     "            'finetune_steps': finetune_steps,\n",
     "            'finetune_depth': finetune_depth,\n",
@@ -1953,6 +1994,7 @@
     "    finetune_depth: _Finetune_Depth,\n",
     "    finetune_loss: _Loss,\n",
     "    clean_ex_first: bool,\n",
+    "    hist_exog_list: Optional[list[str]],\n",
     "    validate_api_key: bool,\n",
     "    add_history: bool,\n",
     "    date_features: Union[bool, list[Union[str, Callable]]],\n",
@@ -1981,6 +2023,7 @@
     "        finetune_depth=finetune_depth,\n",
     "        finetune_loss=finetune_loss,\n",
     "        clean_ex_first=clean_ex_first,\n",
+    "        hist_exog_list=hist_exog_list,\n",
     "        validate_api_key=validate_api_key,\n",
     "        add_history=add_history,\n",
     "        date_features=date_features,\n",
@@ -2037,6 +2080,7 @@
     "    finetune_depth: _Finetune_Depth,\n",
     "    finetune_loss: _Loss,\n",
     "    clean_ex_first: bool,\n",
+    "    hist_exog_list: Optional[list[str]],\n",
     "    date_features: Union[bool, list[str]],\n",
     "    date_features_to_one_hot: Union[bool, list[str]],\n",
     "    model: _Model,\n",
@@ -2058,6 +2102,7 @@
     "        finetune_depth=finetune_depth,\n",
     "        finetune_loss=finetune_loss,\n",
     "        clean_ex_first=clean_ex_first,\n",
+    "        hist_exog_list=hist_exog_list,\n",
     "        date_features=date_features,\n",
     "        date_features_to_one_hot=date_features_to_one_hot,\n",
     "        model=model,\n",
@@ -2205,6 +2250,48 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "# historic exogenous features in cross validation must match forecast on the same split\n",
+    "freq = 'D'\n",
+    "h = 5\n",
+    "series = generate_series(2, freq=freq)\n",
+    "series_with_features, _ = fourier(series, freq=freq, season_length=7, k=2)\n",
+    "splits = ufp.backtest_splits(\n",
+    "    df=series_with_features,\n",
+    "    n_windows=1,\n",
+    "    h=h,\n",
+    "    id_col='unique_id',\n",
+    "    time_col='ds',\n",
+    "    freq=freq,\n",
+    ")\n",
+    "_, train, valid = next(splits)\n",
+    "x_cols = train.columns.drop(['unique_id', 'ds', 'y']).tolist()\n",
+    "for hist_exog_list in [None, [], [x_cols[2], x_cols[1]], x_cols]:\n",
+    "    cv_res = nixtla_client.cross_validation(\n",
+    "        series_with_features,\n",
+    "        n_windows=1,\n",
+    "        h=h,\n",
+    "        freq=freq,\n",
+    "        hist_exog_list=hist_exog_list,\n",
+    "    )\n",
+    "    fcst_res = nixtla_client.forecast(\n",
+    "        train,\n",
+    "        h=h,\n",
+    "        freq=freq,\n",
+    "        hist_exog_list=hist_exog_list,\n",
+    "        X_df=valid,\n",
+    "    )\n",
+    "    np.testing.assert_allclose(\n",
+    "        cv_res['TimeGPT'], fcst_res['TimeGPT'], atol=1e-4, rtol=1e-3\n",
+    "    )"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,