Forecast Scenario Notebook for Local and Remote Inferencing (#3429)

* Initial commit with the codes * Working batch inference with gap * Cleanup code and delete outputs * Fix format issues * Working batch inf e2e * Fix formatting issues --------- Co-authored-by: Rahul Kumar <74648335+iamrk04@users.noreply.github.com>
Azure · Oct 29, 2024 · bc167ff · bc167ff
1 parent b0b3ff7
commit bc167ff
Show file tree

Hide file tree

Showing 8 changed files with 2,036 additions and 0 deletions.
diff --git a/...toml-forecasting-forecast-function/auto-ml-forecasting-function-gap-batch-inference.ipynb b/...toml-forecasting-forecast-function/auto-ml-forecasting-function-gap-batch-inference.ipynb
diff --git a/...toml-forecasting-forecast-function/auto-ml-forecasting-function-gap-local-inference.ipynb b/...toml-forecasting-forecast-function/auto-ml-forecasting-function-gap-local-inference.ipynb
diff --git a/...jobs/automl-forecasting-forecast-function/auto-ml-forecasting-function-gap-training.ipynb b/...jobs/automl-forecasting-forecast-function/auto-ml-forecasting-function-gap-training.ipynb
diff --git a/...dalone-jobs/automl-forecasting-forecast-function/forecasting_script/forecasting_script.py b/...dalone-jobs/automl-forecasting-forecast-function/forecasting_script/forecasting_script.py
@@ -0,0 +1,67 @@
+"""
+This is the script that is executed on the compute instance. It relies
+on the model.pkl file which is uploaded along with this script to the
+compute instance.
+"""
+
+import os
+
+import pandas as pd
+
+from azureml.core import Dataset, Run
+import joblib
+from pandas.tseries.frequencies import to_offset
+
+
+def init():
+    """
+    Load the scoring configuration and the fitted model into module globals.
+
+    The target column name is read from the ``TARGET_COLUMN_NAME`` environment
+    variable and the model is loaded with joblib from ``model.pkl`` inside the
+    directory named by ``AZUREML_MODEL_DIR``. Both are stored as the globals
+    ``target_column_name`` and ``fitted_model``, which ``run`` reads.
+
+    :raises KeyError: if either environment variable is not set.
+    """
+    global target_column_name, fitted_model
+
+    env = os.environ
+    target_column_name = env["TARGET_COLUMN_NAME"]
+
+    # AZUREML_MODEL_DIR is an environment variable created during deployment.
+    # It is the path to the model folder (./azureml-models).
+    # Please provide your model's folder name if there's one.
+    model_dir = env["AZUREML_MODEL_DIR"]
+    fitted_model = joblib.load(os.path.join(model_dir, "model.pkl"))
+
+
+def run(mini_batch):
+    """
+    Forecast every supported file of a mini batch and return the combined rows.
+
+    Each ``.parquet`` or ``.csv`` path in ``mini_batch`` is read into a data
+    frame, the target column is split off, and quantile forecasts are requested
+    from the module-level ``fitted_model``. The result keeps the input columns
+    and adds the actuals, the median forecast (``"predicted"``) and a
+    ``"prediction_interval"`` string built from the lowest and highest
+    quantile. Rows whose actual or predicted value is missing are dropped.
+    Files with any other extension are skipped.
+
+    :param mini_batch: Iterable of input file paths.
+    :returns: The cleaned predictions of all processed files, concatenated;
+              an empty data frame when no file was processed.
+    :rtype: pandas.DataFrame
+    """
+    print(f"run method start: {__file__}, run({mini_batch})")
+
+    # Default quantiles: 0.025 and 0.975 bound a 95% prediction interval and
+    # the 0.5 quantile (median) is used as the point forecast.
+    quantiles = [0.025, 0.5, 0.975]
+    predicted_column_name = "predicted"
+    PI = "prediction_interval"
+
+    resultList = []
+    for test in mini_batch:
+        extension = os.path.splitext(test)[-1]
+        if extension == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif extension == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            continue  # Skip if it's neither a Parquet nor CSV file
+
+        y_test = X_test.pop(target_column_name).values
+
+        fitted_model.quantiles = quantiles
+        pred_quantiles = fitted_model.forecast_quantiles(
+            X_test, ignore_data_errors=True
+        )
+        # apply(axis=1) hands each row over as a Series labelled by the
+        # quantile values, so the two bounds are picked by position (.iloc)
+        # rather than with x[0] / x[1], which would be treated as labels or
+        # rely on deprecated positional fallback.
+        pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
+            lambda x: "[{}, {}]".format(x.iloc[0], x.iloc[1]), axis=1
+        )
+        X_test[target_column_name] = y_test
+        X_test[PI] = pred_quantiles[PI].values
+        X_test[predicted_column_name] = pred_quantiles[0.5].values
+        # drop rows where prediction or actuals are nan
+        # happens because of missing actuals
+        # or at edges of time due to lags/rolling windows
+        clean = X_test[
+            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
+        ]
+        print(
+            f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
+        )
+
+        resultList.append(clean)
+
+    # pd.concat raises ValueError on an empty list; return an empty frame when
+    # the mini batch was empty or contained no supported files.
+    if not resultList:
+        return pd.DataFrame()
+
+    return pd.concat(resultList, sort=False, ignore_index=True)
diff --git a/...s/automl-forecasting-forecast-function/forecasting_script/parallel_run_step.settings.json b/...s/automl-forecasting-forecast-function/forecasting_script/parallel_run_step.settings.json
@@ -0,0 +1 @@
+{"append_row": {"pandas.DataFrame.to_csv": {"sep": ","}}}
diff --git a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-forecast-function/helper.py b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-forecast-function/helper.py
@@ -0,0 +1,119 @@
+# Generate synthetic data
+
+import pandas as pd
+import numpy as np
+
+
+def get_timeseries(
+    train_len: int,
+    test_len: int,
+    time_column_name: str,
+    target_column_name: str,
+    time_series_id_column_name: str,
+    time_series_number: int = 1,
+    freq: str = "H",
+):
+    """
+    Generate synthetic time series and split them into train and test parts.
+
+    Every series starts at 2000-01-01, has ``train_len + test_len`` rows and
+    carries a linearly increasing target with uniform random noise, offset by
+    5 per series, plus an integer ``ext_predictor`` column starting at 42 and
+    an id column with the values ``ts0``, ``ts1``, ...
+
+    :param train_len: The length of training data (one series).
+    :type train_len: int
+    :param test_len: The length of testing data (one series).
+    :type test_len: int
+    :param time_column_name: The desired name of the time column.
+    :type time_column_name: str
+    :param target_column_name: The desired name of the target column.
+    :type target_column_name: str
+    :param time_series_id_column_name: The desired name of the series id column.
+    :type time_series_id_column_name: str
+    :param time_series_number: The number of time series in the data set.
+    :type time_series_number: int
+    :param freq: The frequency string representing pandas offset.
+                 see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
+    :type freq: str
+    :returns: ``(X_train, y_train, X_test, y_test)``: the feature frames
+              without the target column and the target values as arrays.
+    :rtype: tuple
+
+    """
+    total_len = train_len + test_len
+    train_parts = []
+    test_parts = []
+
+    for series_idx in range(time_series_number):
+        timestamps = pd.date_range(start="2000-01-01", periods=total_len, freq=freq)
+        # Linear trend + uniform noise in [0, 1) + a per-series offset.
+        trend = np.arange(total_len).astype(float)
+        target = trend + np.random.rand(total_len) + series_idx * 5
+        predictor = np.asarray(range(42, 42 + total_len))
+        series_ids = np.repeat("ts{}".format(series_idx), total_len)
+
+        frame = pd.DataFrame(
+            {
+                time_column_name: timestamps,
+                target_column_name: target,
+                "ext_predictor": predictor,
+                time_series_id_column_name: series_ids,
+            }
+        )
+        # The first train_len rows are for training, the remainder for testing.
+        train_parts.append(frame[:train_len])
+        test_parts.append(frame[train_len:])
+
+    X_train = pd.concat(train_parts)
+    X_test = pd.concat(test_parts)
+    y_train = X_train.pop(target_column_name).values
+    y_test = X_test.pop(target_column_name).values
+    return X_train, y_train, X_test, y_test
+
+
+def make_forecasting_query(
+    fulldata, time_column_name, target_column_name, forecast_origin, horizon, lookback
+):
+
+    """
+    Build a forecasting query around ``forecast_origin`` from the full dataset.
+
+    The query covers the rows in ``(forecast_origin - lookback, forecast_origin]``
+    as context, with their known target values, followed by the rows in
+    ``(forecast_origin, forecast_origin + horizon]``, whose target values are
+    replaced by NaN so that they are the values to be predicted.
+
+    fulldata: pandas.DataFrame           a time series dataset. Needs to contain X and y.
+    time_column_name: string             which column (must be in fulldata) is the time axis
+    target_column_name: string           which column (must be in fulldata) is to be forecast
+    forecast_origin: datetime type       the last time we (pretend to) have target values
+    horizon: timedelta                   how far forward, in time units (not periods)
+    lookback: timedelta                  how far back does the model look
+
+    Returns the tuple (X_pred, y_pred): the selected rows without the target
+    column, and a float array holding the context targets followed by NaNs.
+    The shapes of the intermediate pieces are printed.
+
+    Example:
+
+
+    ```
+
+    forecast_origin = pd.to_datetime("2012-09-01") + pd.DateOffset(days=5) # forecast 5 days after end of training
+    print(forecast_origin)
+
+    X_query, y_query = make_forecasting_query(data,
+                       time_column_name = "date", # name of the time column in data
+                       target_column_name = "y", # name of the target column in data
+                       forecast_origin = forecast_origin,
+                       horizon = pd.DateOffset(days=7), # 7 days into the future
+                       lookback = pd.DateOffset(days=1), # model has lag 1 period (day)
+                      )
+
+    ```
+    """
+
+    timestamps = fulldata[time_column_name]
+
+    # Context window: strictly after (origin - lookback), up to the origin.
+    context_start = forecast_origin - lookback
+    in_past = (timestamps > context_start) & (timestamps <= forecast_origin)
+    X_past = fulldata[in_past]
+
+    # Query window: strictly after the origin, up to (origin + horizon).
+    query_end = forecast_origin + horizon
+    in_future = (timestamps > forecast_origin) & (timestamps <= query_end)
+    X_future = fulldata[in_future]
+
+    y_past = X_past.pop(target_column_name).values.astype(float)
+    y_future = X_future.pop(target_column_name).values.astype(float)
+
+    # The future targets are unknown to the model: same shape, all NaN.
+    y_query = np.full_like(y_future, np.nan)
+
+    print(f"X_past is {X_past.shape} - shaped")
+    print(f"X_future is {X_future.shape} - shaped")
+    print(f"y_past is {y_past.shape} - shaped")
+    print(f"y_query is {y_query.shape} - shaped")
+
+    return pd.concat([X_past, X_future]), np.concatenate([y_past, y_query])
diff --git a/...jobs/automl-forecasting-forecast-function/images/forecast_function_at_train.png b/...jobs/automl-forecasting-forecast-function/images/forecast_function_at_train.png
diff --git a/...toml-forecasting-forecast-function/images/forecast_function_away_from_train.png b/...toml-forecasting-forecast-function/images/forecast_function_away_from_train.png
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"append_row": {"pandas.DataFrame.to_csv": {"sep": ","}}}