Nixtla · AzulGarza · Nov 9, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -37,42 +37,19 @@ jobs:
           git status -s # display the status to see which nbs need cleaning up
           if [ -n "$(git status -uno -s)" ]; then echo -e "!!! Detected unstripped out notebooks\n!!!Remember to run nbdev_install_git_hooks"; false; fi
 
-  run-all-tests:
-    runs-on: ubuntu-latest 
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, '3.10']
-    steps:
-      - name: Clone repo
-        uses: actions/checkout@v2
-
-      - name: Set up environment
-        uses: mamba-org/setup-micromamba@v1 
-        with:
-          environment-file: environment.yml
-          create-args: python=${{ matrix.python-version }}
-          cache-environment: true
-
-      - name: Install pip requirements
-        run: pip install ./ 
-
-      - name: Run tests 
-        env:
-          TIMEGPT_TOKEN: ${{ secrets.TIMEGPT_TOKEN }}
-          TIMEGPT_API_URL: ${{ secrets.TIMEGPT_API_URL }}
-          TIMEGPT_CUSTOM_URL_TOKEN: ${{ secrets.TIMEGPT_CUSTOM_URL_TOKEN }}
-          TIMEGPT_CUSTOM_URL: ${{ secrets.TIMEGPT_CUSTOM_URL }}
-          API_KEY_FRED: ${{ secrets.API_KEY_FRED }}
-        run: nbdev_test
-
-  run-local-tests:
+  run-tests:
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: [3.9, '3.10']
+    env:
+      TIMEGPT_TOKEN: ${{ secrets.TIMEGPT_TOKEN }}
+      TIMEGPT_API_URL: ${{ secrets.TIMEGPT_API_URL }}
+      TIMEGPT_CUSTOM_URL_TOKEN: ${{ secrets.TIMEGPT_CUSTOM_URL_TOKEN }}
+      TIMEGPT_CUSTOM_URL: ${{ secrets.TIMEGPT_CUSTOM_URL }}
+      API_KEY_FRED: ${{ secrets.API_KEY_FRED }}
     steps:
       - name: Clone repo
         uses: actions/checkout@v2
@@ -88,10 +65,8 @@ jobs:
         run: pip install ./ 
 
       - name: Run tests 
-        env:
-          TIMEGPT_TOKEN: ${{ secrets.TIMEGPT_TOKEN }}
-          TIMEGPT_API_URL: ${{ secrets.TIMEGPT_API_URL }}
-          TIMEGPT_CUSTOM_URL_TOKEN: ${{ secrets.TIMEGPT_CUSTOM_URL_TOKEN }}
-          TIMEGPT_CUSTOM_URL: ${{ secrets.TIMEGPT_CUSTOM_URL }}
-          API_KEY_FRED: ${{ secrets.API_KEY_FRED }}
         run: nbdev_test --skip_file_glob "*distributed*"
+
+      - name: Run tests with distributed (ubuntu)
+        if: matrix.os == 'ubuntu-latest'
+        run: nbdev_test --file_glob "*distributed*"
diff --git a/environment.yml b/environment.yml
@@ -15,7 +15,7 @@ dependencies:
     - flake8
     - python-dotenv
     - statsforecast
-    - utilsforecast
+    - utilsforecast>=0.0.13
     - requests
     - duckdb<0.8
     - fugue[ray]

diff --git a/nbs/distributed.timegpt.ipynb b/nbs/distributed.timegpt.ipynb
@@ -234,6 +234,52 @@
     "            X_df=None,\n",
     "        )\n",
     "        return anomalies_df\n",
+    "\n",
+    "    def cross_validation(\n",
+    "            self,\n",
+    "            df: fugue.AnyDataFrame,\n",
+    "            h: int,\n",
+    "            freq: Optional[str] = None,    \n",
+    "            id_col: str = 'unique_id',\n",
+    "            time_col: str = 'ds',\n",
+    "            target_col: str = 'y',\n",
+    "            level: Optional[List[Union[int, float]]] = None,\n",
+    "            finetune_steps: int = 0,\n",
+    "            clean_ex_first: bool = True,\n",
+    "            validate_token: bool = False,\n",
+    "            date_features: Union[bool, List[str]] = False,\n",
+    "            date_features_to_one_hot: Union[bool, List[str]] = True,\n",
+    "            model: str = 'timegpt-1',\n",
+    "            n_windows: int = 1,\n",
+    "            step_size: Optional[int] = None,\n",
+    "            num_partitions: Optional[int] = None,\n",
+    "        ) -> fugue.AnyDataFrame:\n",
+    "        kwargs = dict(\n",
+    "            h=h,\n",
+    "            freq=freq,\n",
+    "            id_col=id_col,\n",
+    "            time_col=time_col,\n",
+    "            target_col=target_col,\n",
+    "            level=level,\n",
+    "            finetune_steps=finetune_steps,\n",
+    "            clean_ex_first=clean_ex_first,\n",
+    "            validate_token=validate_token,\n",
+    "            date_features=date_features,\n",
+    "            date_features_to_one_hot=date_features_to_one_hot,\n",
+    "            model=model,\n",
+    "            n_windows=n_windows,\n",
+    "            step_size=step_size,\n",
+    "        )\n",
+    "        schema = self._get_forecast_schema(id_col=id_col, time_col=time_col, level=level, cv=True)\n",
+    "        fcst_df = self._distribute_method(\n",
+    "            method=self._cross_validation,\n",
+    "            df=df,\n",
+    "            kwargs=kwargs,\n",
+    "            schema=schema,\n",
+    "            num_partitions=num_partitions,\n",
+    "            id_col=id_col,\n",
+    "        )\n",
+    "        return fcst_df\n",
     "    \n",
     "    def _instantiate_timegpt(self):\n",
     "        from nixtlats.timegpt import _TimeGPT\n",
@@ -270,10 +316,21 @@
     "        ) -> pd.DataFrame:\n",
     "        timegpt = self._instantiate_timegpt()\n",
     "        return timegpt._detect_anomalies(df=df, **kwargs)\n",
+    "\n",
+    "    def _cross_validation(\n",
+    "            self, \n",
+    "            df: pd.DataFrame, \n",
+    "            kwargs,\n",
+    "        ) -> pd.DataFrame:\n",
+    "        timegpt = self._instantiate_timegpt()\n",
+    "        return timegpt._cross_validation(df=df, **kwargs)\n",
     "    \n",
     "    @staticmethod\n",
-    "    def _get_forecast_schema(id_col, time_col, level):\n",
-    "        schema = f'{id_col}:string,{time_col}:datetime,TimeGPT:double'\n",
+    "    def _get_forecast_schema(id_col, time_col, level, cv=False):\n",
+    "        schema = f'{id_col}:string,{time_col}:datetime'\n",
+    "        if cv:\n",
+    "            schema = f'{schema},cutoff:datetime'\n",
+    "        schema = f'{schema},TimeGPT:double'\n",
     "        if level is not None:\n",
     "            level = sorted(level)\n",
     "            schema = f'{schema},{\",\".join([f\"TimeGPT-lo-{lv}:double\" for lv in reversed(level)])}'\n",
@@ -425,6 +482,44 @@
     "    )"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "def test_cv_same_results_num_partitions(\n",
+    "        df: fugue.AnyDataFrame, \n",
+    "        horizon: int = 12, \n",
+    "        id_col: str = 'unique_id',\n",
+    "        time_col: str = 'ds',\n",
+    "        **fcst_kwargs,\n",
+    "    ):\n",
+    "    fcst_df = distributed_timegpt.cross_validation(\n",
+    "        df=df, \n",
+    "        h=horizon, \n",
+    "        num_partitions=1,\n",
+    "        id_col=id_col,\n",
+    "        time_col=time_col,\n",
+    "        **fcst_kwargs\n",
+    "    )\n",
+    "    fcst_df = fa.as_pandas(fcst_df)\n",
+    "    fcst_df_2 = distributed_timegpt.cross_validation(\n",
+    "        df=df, \n",
+    "        h=horizon, \n",
+    "        num_partitions=2,\n",
+    "        id_col=id_col,\n",
+    "        time_col=time_col,\n",
+    "        **fcst_kwargs\n",
+    "    )\n",
+    "    fcst_df_2 = fa.as_pandas(fcst_df_2)\n",
+    "    pd.testing.assert_frame_equal(\n",
+    "        fcst_df.sort_values([id_col, time_col]).reset_index(drop=True),\n",
+    "        fcst_df_2.sort_values([id_col, time_col]).reset_index(drop=True),\n",
+    "    )"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -433,6 +528,9 @@
    "source": [
     "#| hide\n",
     "def test_forecast_dataframe(df: fugue.AnyDataFrame):\n",
+    "    test_cv_same_results_num_partitions(df, n_windows=2, step_size=1)\n",
+    "    test_cv_same_results_num_partitions(df, n_windows=3, step_size=None, horizon=1)\n",
+    "    test_cv_same_results_num_partitions(df, model='timegpt-1-long-horizon', horizon=1)\n",
     "    test_forecast_diff_results_diff_models(df)\n",
     "    test_forecast(df, num_partitions=1)\n",
     "    test_forecast(df, level=[90, 80], num_partitions=1)\n",
@@ -807,6 +905,16 @@
     "test_forecast_x_dataframe_diff_cols(spark_df_x_diff_cols, spark_future_ex_vars_df_diff_cols)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "spark.stop()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/...ocs/how-to-guides/distributed.spark.ipynb → ...-to-guides/0_distributed_fcst_spark.ipynb b/...ocs/how-to-guides/distributed.spark.ipynb → ...-to-guides/0_distributed_fcst_spark.ipynb
@@ -5,7 +5,7 @@
    "id": "5ff81b5a-514d-4d8b-953e-c8f7cb4ba215",
    "metadata": {},
    "source": [
-    "# How to use TimeGPT on Spark\n",
+    "# How to use TimeGPT on Spark: Forecasting\n",
     "> Run TimeGPT distributedly on top of Spark.\n",
     "\n",
     "`TimeGPT` works on top of Spark, Dask, and Ray through Fugue. `TimeGPT` will read the input DataFrame and use the corresponding engine. For example, if the input is a Spark DataFrame, StatsForecast will use the existing Spark session to run the forecast.\n"
@@ -383,6 +383,16 @@
     "timegpt_fcst_ex_vars_df = timegpt.forecast(df=spark_df, X_df=spark_future_ex_vars_df, h=24, level=[80, 90])\n",
     "timegpt_fcst_ex_vars_df.show(5)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "620ef1e3-da4f-4949-bf12-6fd3727dfec6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.stop()"
+   ]
   }
  ],
  "metadata": {