Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/static covs regression #1412

Merged
merged 19 commits into from
Dec 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions darts/models/forecasting/regression_model.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
"""
Regression Model
----------------
A `RegressionModel` forecasts future values of a target series based on lagged values of
A `RegressionModel` forecasts future values of a target series based on

* The target series (past lags only)

* An optional past_covariates series (past lags only)

* An optional future_covariates series (possibly past and future lags)

* Available static covariates


The regression models are learned in a supervised way, and they can wrap around any "scikit-learn like" regression model
acting on tabular data having ``fit()`` and ``predict()`` methods.
Expand All @@ -21,6 +23,8 @@
The lags can be specified either using an integer - in which case it represents the _number_ of (past or future) lags
to take into consideration, or as a list - in which case the lags have to be enumerated (strictly negative values
denoting past lags and positive values including 0 denoting future lags).
When static covariates are present, they are appended to the lagged features. When multiple time series are passed
and their static covariates do not all have the same size, the shorter ones are padded with zero-valued features.
"""

import math
Expand All @@ -33,7 +37,7 @@
from darts.logging import get_logger, raise_if, raise_if_not, raise_log
from darts.models.forecasting.forecasting_model import GlobalForecastingModel
from darts.timeseries import TimeSeries
from darts.utils.data.tabularization import _create_lagged_data
from darts.utils.data.tabularization import _add_static_covariates, _create_lagged_data
from darts.utils.multioutput import MultiOutputRegressor
from darts.utils.utils import _check_quantiles, seq2series, series2seq

Expand Down Expand Up @@ -325,6 +329,10 @@ def _create_lagged_data(
multi_models=self.multi_models,
)

training_samples = _add_static_covariates(
self.model, target_series, training_samples
)

return training_samples, training_labels

def _fit_model(
Expand Down Expand Up @@ -655,6 +663,8 @@ def predict(

# concatenate retrieved lags
X = np.concatenate(np_X, axis=1)
X = _add_static_covariates(self.model, series, X)

# X has shape (n_series * n_samples, n_regression_features)
prediction = self._predict_and_sample(X, num_samples, **kwargs)
# prediction shape (n_series * n_samples, output_chunk_length, n_components)
Expand Down Expand Up @@ -687,6 +697,10 @@ def _predict_and_sample(
def __str__(self):
    """Return the string representation of the wrapped ``self.model``."""
    return self.model.__str__()

@staticmethod
def _supports_static_covariates() -> bool:
    """Return ``True``: this model flags itself as supporting static covariates."""
    return True


class _LikelihoodMixin:
"""
Expand Down
138 changes: 138 additions & 0 deletions darts/tests/models/forecasting/test_regression_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

import darts
from darts import TimeSeries
from darts.dataprocessing.encoders import (
FutureCyclicEncoder,
PastDatetimeAttributeEncoder,
)
from darts.dataprocessing.transformers import StaticCovariatesTransformer
from darts.logging import get_logger
from darts.metrics import mae, rmse
from darts.models import (
Expand Down Expand Up @@ -638,6 +640,142 @@ def test_prediction_data_creation(self):
[44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0],
)

def test_static_covs_addition(self):
    """
    Check the shape of the feature matrix returned by
    ``RegressionModel._create_lagged_data`` for three situations: a series without
    static covariates, a list of series whose static covariates have different
    widths, and a series without static covariates passed to a model that was
    fitted on series having some.
    """
    component_names = ["comp1", "comp2", "comp3"]
    ref_series = tg.linear_timeseries(length=10)

    def _three_components(values, **kwargs):
        # replicate the single-column `values` into three components sharing
        # the time index of the reference series
        return TimeSeries.from_times_and_values(
            times=ref_series.time_index,
            values=np.concatenate([values] * 3, axis=1),
            columns=list(component_names),
            **kwargs,
        )

    # one row per component; the categorical column is expected to yield
    # 9 one-hot encoded values (3 categories x 3 components)
    mixed_scovs = pd.DataFrame(
        data={
            "cont": [0.1, 0.2, 0.3],
            "cat": ["a", "b", "c"],
        }
    ).astype(dtype={"cat": "category"})
    numeric_scovs = pd.DataFrame(data={"cont": [0.1, 0.2, 0.3]})

    # default transformer_num = MinMaxScaler()
    scaler = StaticCovariatesTransformer(transformer_cat=OneHotEncoder())
    series1 = scaler.fit_transform(
        _three_components(ref_series.values(), static_covariates=mixed_scovs)
    )
    series2 = _three_components(
        ref_series.values() * 100, static_covariates=numeric_scovs
    )
    # the last two series carry no static covariates at all
    series3 = _three_components(ref_series.values() * 200)
    series4 = _three_components(ref_series.values())

    reg_model = RegressionModel(lags=1, output_chunk_length=1)
    all_series = [series1, series2, series3]
    max_samples = 5
    all_series_width = series1.n_components
    max_scovs_width = max(
        ts.static_covariates_values(copy=False).reshape(1, -1).shape[1]
        for ts in all_series
        if ts.has_static_covariates
    )
    padded_width = all_series_width + max_scovs_width

    # no static covs
    features, _ = reg_model._create_lagged_data(
        series3, None, None, max_samples_per_ts=max_samples
    )
    self.assertEqual(features.shape, (5, 3))

    # static covs with different dims
    features, _ = reg_model._create_lagged_data(
        all_series, None, None, max_samples_per_ts=max_samples
    )
    self.assertEqual(
        features.shape,
        (max_samples * len(all_series), padded_width),
    )

    # no static covs at prediction but static covs at training;
    # calling `_create_lagged_data` here simulates features prep at prediction time
    reg_model.fit(all_series)
    pred_features, _ = reg_model._create_lagged_data(
        series4, None, None, max_samples_per_ts=1
    )
    self.assertEqual(pred_features.shape, (1, padded_width))
eliane-maalouf marked this conversation as resolved.
Show resolved Hide resolved

def test_static_cov_accuracy(self):
    """
    Check that a ``RandomForest`` trained on two series labelled with a one-hot
    encoded categorical static covariate reaches a lower RMSE on each series than
    the same model trained on the unlabelled series.
    """
    # based on : https://unit8co.github.io/darts/examples/15-static-covariates.html

    # given
    period = 20
    horizon = period // 2
    sine_series = tg.sine_timeseries(
        length=4 * period,
        value_frequency=1 / period,
        column_name="smooth",
        freq="h",
    )

    # overwrite two stretches of the sine values with a linear ramp
    irregular_vals = sine_series.values()
    ramp = np.expand_dims(np.linspace(1, -1, num=19), -1)
    for window in (slice(21, 40), slice(61, 80)):
        irregular_vals[window] = ramp
    irregular_series = TimeSeries.from_times_and_values(
        values=irregular_vals, times=sine_series.time_index, columns=["irregular"]
    )

    # no static covs
    train_series_no_cov = [sine_series, irregular_series]

    # categorical static covs
    labelled_series = [
        ts.with_static_covariates(pd.DataFrame(data={"curve_type": [label]}))
        for ts, label in zip(train_series_no_cov, ("smooth", "non_smooth"))
    ]
    scaler = StaticCovariatesTransformer(transformer_cat=OneHotEncoder())
    train_series_static_cov = scaler.fit_transform(labelled_series)

    def _fit_and_forecast(train_series):
        # fit on the full series, then forecast from their first 60 points
        model = RandomForest(lags=horizon, bootstrap=False)
        model.fit(train_series)
        return model.predict(n=horizon, series=[ts[:60] for ts in train_series])

    # when (the model without static covariates is fitted first)
    pred_no_static_cov = _fit_and_forecast(train_series_no_cov)
    pred_static_cov = _fit_and_forecast(train_series_static_cov)

    # then
    for actual, ps_no_st, ps_st_cat in zip(
        train_series_static_cov, pred_no_static_cov, pred_static_cov
    ):
        rmse_no_st = rmse(actual, ps_no_st)
        rmse_st_cat = rmse(actual, ps_st_cat)
        self.assertLess(rmse_st_cat, rmse_no_st)

def test_models_runnability(self):
train_y, test_y = self.sine_univariate1.split_before(0.7)
multi_models_modes = [True, False]
Expand Down
2 changes: 1 addition & 1 deletion darts/utils/data/inference_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ def __init__(
use_static_covariates=use_static_covariates,
)

# This dataset is in charge of serving historic and future future covariates
# This dataset is in charge of serving historic and future covariates
self.ds_future = DualCovariatesInferenceDataset(
target_series=target_series,
covariates=future_covariates,
Expand Down
58 changes: 58 additions & 0 deletions darts/utils/data/tabularization.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from darts.logging import raise_if
from darts.timeseries import TimeSeries
from darts.utils.utils import series2seq


def _create_lagged_data(
Expand Down Expand Up @@ -161,3 +162,60 @@ def _create_lagged_data(
X = np.concatenate(Xs, axis=0)
y = np.concatenate(ys, axis=0)
return X, y, Ts


def _add_static_covariates(model, series, features):
    """
    Append static covariates to a matrix of lagged features.

    Accounts for series with potentially different static covariates by padding with 0 to
    accommodate the maximum number of static covariates available in any of the given series.
    If a series has no static covariates, its corresponding columns are filled with 0.

    Parameters
    ----------
    model
        The underlying regression model. When none of the `series` has static covariates and the
        model exposes a non-``None`` ``n_features_in_`` larger than the width of `features`,
        the features are right-padded with zeros up to ``n_features_in_`` columns (case where the
        training series had static covariates but the prediction series do not).
    series
        A single series or a sequence of series; it is passed through ``series2seq``.
    features
        2-D array of features. Its rows are assumed to be grouped per series, in the order of
        `series`, with every series contributing ``features.shape[0] // len(series)`` rows.

    Returns
    -------
    np.ndarray
        `features` itself when there is nothing to add, otherwise a new array made of `features`
        with the static covariates (or zero padding) appended column-wise.
    """

    series = series2seq(series)
    n_rows, n_features = features.shape
    rows_per_series = n_rows // len(series)

    # collect static covariates info; reshape with order="F" to ensure that the covariates
    # are read column wise
    scov_values = [
        ts.static_covariates_values(copy=False).reshape(1, -1, order="F")
        if ts.static_covariates is not None
        else None
        for ts in series
    ]
    scov_widths = [0 if values is None else values.shape[1] for values in scov_values]
    max_width = max(scov_widths)

    if max_width == 0:
        n_expected = getattr(model, "n_features_in_", None)
        if n_expected is not None and n_expected > n_features:
            # for when series in prediction do not have static covariates but some of the
            # training series did; the padding must span every row of `features`, not only
            # the rows of a single series
            pad_zeros = np.zeros((n_rows, n_expected - n_features))
            return np.concatenate([features, pad_zeros], axis=1)
        return features

    # at least one series in the sequence has static covariates:
    # build one block of (zero-padded) static covariates per series
    static_covs = []
    for values, width in zip(scov_values, scov_widths):
        pad_zeros = np.zeros((1, max_width - width))
        scovs = np.concatenate((values, pad_zeros), axis=1) if width > 0 else pad_zeros
        static_covs.append(np.tile(scovs, reps=(rows_per_series, 1)))

    # concatenate static covariates to features
    return np.concatenate([features, np.concatenate(static_covs, axis=0)], axis=1)