From 1f5c8a6dc3577e8a70ecc3339ad3162c895e4ae3 Mon Sep 17 00:00:00 2001 From: capelastegui Date: Wed, 24 Mar 2021 15:20:44 +0000 Subject: [PATCH] bug #201; Error in model_utils.get_s_x_extrapolate() --- anticipy/model_utils.py | 28 ++++++++++++++++++++-------- tests/test_forecast.py | 20 +++++++++++++++++++- tests/test_model_utils.py | 36 +++++++++++++++++++++++++++++++----- 3 files changed, 70 insertions(+), 14 deletions(-) diff --git a/anticipy/model_utils.py b/anticipy/model_utils.py index b370ebc..a87c88f 100644 --- a/anticipy/model_utils.py +++ b/anticipy/model_utils.py @@ -128,9 +128,8 @@ def get_s_x_extrapolate( date_start_actuals, date_end_actuals, model=None, - freq='W', + freq=None, extrapolate_years=2.5, - shifted_origin=0, scaling_factor=100.0, x_start_actuals=0.): """ @@ -155,6 +154,8 @@ def get_s_x_extrapolate( :param scaling_factor: Value used for scaling a_x for certain model functions :type scaling_factor: float + :param x_start_actuals: numeric index for the first actuals sample + :type x_start_actuals: int :return: Series of floats with DateTimeIndex. To be used as (a_date, a_x) input for a model function. :rtype: pandas.Series @@ -169,8 +170,17 @@ def get_s_x_extrapolate( date_start_actuals = pd.to_datetime(date_start_actuals) date_end_actuals = pd.to_datetime(date_end_actuals) + weekday_adjustment = date_start_actuals.weekday() + expected_freq = dict_wday_name.get(weekday_adjustment) if freq is None: # Default frequency - freq = 'W' + freq = expected_freq + else: + if freq.startswith('W'): + assert expected_freq == freq, \ + 'Error: with weekly frequency, freq ' \ + 'parameter must match weekday of date_start_actuals:' \ + ' {} - {} , {}' \ + .format(freq, expected_freq, date_start_actuals) freq_short = freq[0:1] # Changes e.g. W-MON to W # freq_units_per_year = 52.0 if freq_short=='W' else 365.0 @@ -182,13 +192,16 @@ def get_s_x_extrapolate( date_end_forecast = date_end_actuals + \ pd.DateOffset(**offset_input) - index = pd.date_range( + i_date = pd.date_range( date_start_actuals, date_end_forecast, freq=freq, name='date') - a_x = get_normalized_x_from_date(pd.Series(index)).values - s_x = pd.Series(index=index, data=a_x) + s_date = pd.Series(i_date) + + # Get days passed since date_start, then add x_start_actuals + s_x = (s_date - date_start_actuals).dt.days + x_start_actuals + s_x.index = i_date else: # Otherwise, use numeric index # we extrapolate future samples equal to 100*extrapolate_years @@ -202,10 +215,9 @@ def get_s_x_extrapolate( index=index, data=np.arange( x_start_actuals, - x_start_actuals + index.size)) + shifted_origin + x_start_actuals + index.size)) + x_start_actuals if model_requires_scaling(model): s_x = s_x / scaling_factor - return s_x diff --git a/tests/test_forecast.py b/tests/test_forecast.py index 99faf6c..78d84ef 100644 --- a/tests/test_forecast.py +++ b/tests/test_forecast.py @@ -1670,7 +1670,25 @@ def test_run_forecast_naive(self): df_forecast = dict_result['forecast'] logger_info('df_forecast', df_forecast) - logger.info('Test 3b: weight column, season_add_mult = \'both\'') + logger.info('Test 3b - initial sample is 0-weight, ' + 'extrapolate_years=0') + df1.weight[0:2] = 0. + logger_info('df1:', df1) + + dict_result = run_forecast( + simplify_output=False, df_y=df1, + l_model_trend=[forecast_models.model_naive], + extrapolate_years=0) + + df_data = dict_result['data'] + df_metadata = dict_result['metadata'] + df_optimize_info = dict_result['optimize_info'] + + logger_info('df_metadata:', df_metadata) + logger_info('df_optimize_info:', df_optimize_info) + logger_info('df_data:', df_data.groupby(['source', 'model']).tail(60)) + + logger.info('Test 3c: weight column, season_add_mult = \'both\'') df1 = pd.DataFrame( {'y': np.arange(0, 10.), diff --git a/tests/test_model_utils.py b/tests/test_model_utils.py index 79fd7a4..cafce5c 100644 --- a/tests/test_model_utils.py +++ b/tests/test_model_utils.py @@ -51,7 +51,7 @@ def test_apply_a_x_scaling(self): logger.info('f_model: %s', model) logger_info('a_x', a_x) - def test_get_a_x_date_extrapolate(self): + def test_get_s_x_extrapolate(self): # TODO: TEST Output size, scenarios with different frequencies l_df_y = [ # Single ts @@ -74,9 +74,6 @@ def test_get_a_x_date_extrapolate(self): ] l_time_resolutions = [ # Default config - 'W-SUN', - 'W', - 'W-MON', 'D', 'MS', 'YS' @@ -128,7 +125,7 @@ def test_get_a_x_date_extrapolate(self): ts = l_df_y[0] model = l_models[0] - time_resolution = l_time_resolutions[0] + time_resolution = None # Default - weekly frequency s_x = get_s_x_extrapolate( ts.index.min(), ts.index.max(), @@ -213,6 +210,35 @@ def test_get_a_x_date_extrapolate(self): logger_info('t_values len', len(s_x)) self.assertEqual(len(s_x), 10 + 3.0 * 365) + def test_get_s_x_extrapolate_gap(self): + # Test get_s_x_extrapolate with a gap at the start of actuals + + """ + Context - see #201 + There is a problem caused when: + - Actuals data has 0-weight samples at the start + """ + logger.info('Test 1 - default settings') + x_start_actuals = 100 + s_x = get_s_x_extrapolate( + '2021-03-06', '2021-04-30', + extrapolate_years=1.0 / 365, # 1 day + x_start_actuals=x_start_actuals + ) + logger_info('s_x:', s_x) + self.assertEqual(s_x.iloc[0], x_start_actuals) + + logger.info('Test 2 - daily freq') + x_start_actuals = 100 + s_x = get_s_x_extrapolate( + '2021-03-06', '2021-03-10', + extrapolate_years=1.0 / 365, # 1 day + x_start_actuals=x_start_actuals, + freq='D' + ) + logger_info('s_x:', s_x) + self.assertEqual(s_x.iloc[0], x_start_actuals) + def test_get_aic_c(self): # Known error scenario: 0 error, 1 parameters - should return -inf