|
| 1 | +import pandas as pd |
| 2 | +import statsmodels.formula.api as smf |
| 3 | +import numpy as np |
| 4 | +from pandas import Series, DataFrame |
| 5 | + |
| 6 | +__all__ = ['RollingOLS'] |
| 7 | + |
| 8 | + |
def rwindows(a, window):
    """Create rolling window blocks from a given array.

    The shape of the result is meant to translate cleanly to pandas DataFrame
    convention of computing rolling statistics for blocks.

    Parameters
    ==========
    a : numpy.ndarray or array-like
        Of ndim {1, 2}.  A 1d input is treated as a single column.
    window : int
        The window size (number of consecutive rows per block); at least 1.

    Returns
    =======
    blocks : ndarray
        A read-only, higher-dimensional view containing each window (block)

        Shape of *a*     Shape of *blocks*
        ============     =================
        (x, )            (max(x - window + 1, 0), window, 1)
        (x, y)           (max(x - window + 1, 0), window, y)

        That is, ``blocks[i]`` holds rows ``i`` through ``i + window - 1``
        of *a*.  When *window* exceeds the number of rows there are no
        complete windows and the result has zero blocks.

    Raises
    ======
    ValueError
        If *a* is not 1- or 2-dimensional, or if *window* is less than 1.
    """
    a = np.asarray(a)
    if a.ndim == 1:
        # Treat a vector as a single-column matrix so that both input ranks
        # share one code path and one result layout.
        a = a.reshape(-1, 1)
    elif a.ndim != 2:
        raise ValueError('a must be 1- or 2-dimensional, got ndim=%d'
                         % a.ndim)
    if window < 1:
        raise ValueError('window must be at least 1, got %r' % (window,))

    # Never ask for a negative number of blocks when the window is longer
    # than the data.
    nblocks = max(a.shape[0] - window + 1, 0)
    shape = (nblocks, window, a.shape[1])
    # Stepping to the next block and stepping to the next row inside a block
    # both advance by one row of `a`, hence the repeated leading stride.
    strides = (a.strides[0],) + a.strides
    # The blocks overlap in memory; a writable view would let one assignment
    # silently change several windows (and the caller's array) at once.
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides,
                                           writeable=False)
| 42 | + |
| 43 | + |
class RollingOLS(object):
    """Provides rolling ordinary least squares (OLS) regression capability.

    Note: this approach is designed to be functional and user-friendly.  It
    works well on smaller (<10,000) datasets, but may create memory issues with
    datasets >100,000 samples.  It works by fitting one statsmodels OLS model
    per rolling period and keeping every results object, from which various
    regression attributes can be called.

    Parameters
    ==========
    endog : Series
        dependent variable
    exog : Series or DataFrame
        array of independent variable(s).  It is handed to statsmodels
        unchanged and with ``hasconst=True``; no intercept column is added
        by this class, so include a constant column in *exog* if an
        intercept is wanted.
    window : int
        window length (number of observations in each regression)

    Raises
    ======
    ValueError
        If *endog* and *exog* do not have the same number of observations.
    """

    def __init__(self, endog, exog, window):
        if len(endog) != len(exog):
            # fit() pairs the windows with zip(), which would otherwise
            # silently drop the surplus windows of the longer input.
            raise ValueError('endog and exog must have the same length, '
                             'got %d and %d' % (len(endog), len(exog)))
        self.endog = endog
        self.exog = exog
        self.window = window
        # Each regression is labelled by the last observation in its window.
        self._result_idx = self.exog.index[self.window - 1:]

        # Create a MultiIndex for 3-dimensional result data such as rolling
        # residuals and fitted values: the outer level repeats each
        # period-ending label once per row of its window, the inner level
        # lists the labels of the rows inside that window.
        outer = np.repeat(self._result_idx.values, self.window, axis=0)
        inner = rwindows(self.exog.index.values, self.window).flatten()
        self._result_idx_3d = pd.MultiIndex.from_arrays(
            [outer, inner], names=['Date Ending', 'Date'])

    def fit(self):
        """Fit one OLS regression per rolling window.

        The full regression results of every window are stored in
        ``self.models`` (a list, one entry per window, oldest first) so that
        the properties of this class can later read attributes from them.

        Returns
        =======
        self : RollingOLS
            Returned to enable method chaining.
        """

        self.rendog = rwindows(self.endog.values, window=self.window)
        self.rexog = rwindows(self.exog.values, window=self.window)
        # NOTE(review): this relies on `OLS` being reachable through
        # statsmodels.formula.api -- confirm for the statsmodels version in
        # use.
        self.models = [smf.OLS(y, x, hasconst=True).fit()
                       for y, x in zip(self.rendog, self.rexog)]
        # return self to enable method chaining
        return self

    def _get(self, attr):
        """Call different regression attributes from statsmodels.OLS results.

        Internal method used to read one attribute from the results object
        of every rolling window.

        Available attributes are here:
        statsmodels.regression.linear_model.RegressionResults

        Parameters
        ==========
        attr : str
            string form of the attribute to call; example: 'tvalues'

        Returns
        =======
        values : list
            The attribute's value for each window, in window order.

        Raises
        ======
        AttributeError
            If ``fit()`` has not been called yet, or if the results objects
            have no attribute named *attr*.
        """

        try:
            models = self.models
        except AttributeError:
            raise AttributeError('no fitted models found; call fit() before '
                                 'requesting %r' % (attr,))
        return [getattr(model, attr) for model in models]

    def _scalar(self, attr, name):
        """Return the per-window scalar *attr* as a Series called *name*."""
        return Series(self._get(attr), index=self._result_idx, name=name)

    def _per_regressor(self, attr, name):
        """Return the per-window, per-regressor values of *attr*.

        A DataFrame with one column per exog column is returned when exog is
        a DataFrame.  For a single Series regressor each window yields a
        one-element vector; those are flattened into a Series called *name*.
        """
        values = self._get(attr)
        if isinstance(self.exog, DataFrame):
            return DataFrame(values, index=self._result_idx,
                             columns=self.exog.columns)
        return Series(np.asarray(values).ravel(), index=self._result_idx,
                      name=name)

    # 1d data (return type is pd.Series)
    # These properties consist of a scalar for each rolling period.
    # --------------------------------------------------------------------------

    @property
    def aic(self):
        """Akaike information criterion."""
        return self._scalar('aic', 'aic')

    @property
    def bic(self):
        """Bayesian information criterion."""
        return self._scalar('bic', 'bic')

    @property
    def condition_number(self):
        """Condition number of the exogenous matrix, per window.

        Taken from the ``condition_number`` attribute of each statsmodels
        results object.
        """
        return self._scalar('condition_number', 'condition_number')

    @property
    def df_model(self):
        """Model (regression) degrees of freedom (dof)."""
        return self._scalar('df_model', 'df_model')

    @property
    def df_resid(self):
        """Residual degrees of freedom (dof)."""
        return self._scalar('df_resid', 'df_resid')

    @property
    def df_total(self):
        """Total degrees of freedom (dof): model dof plus residual dof."""
        return (self.df_model + self.df_resid).rename('df_total')

    @property
    def ess(self):
        """Explained sum of squares (statsmodels ``ess``)."""
        return self._scalar('ess', 'ess')

    @property
    def fstat(self):
        """F-statistic of the fully specified model.

        Calculated as the mean squared error of the model divided by the
        mean squared error of the residuals.
        """
        return self._scalar('fvalue', 'fstat')

    @property
    def f_pvalue(self):
        """p-value associated with the F-statistic."""
        return self._scalar('f_pvalue', 'f_pvalue')

    @property
    def mse_model(self):
        """Mean squared error of the model.

        The explained sum of squares divided by the model dof.
        """
        return self._scalar('mse_model', 'mse_model')

    @property
    def mse_resid(self):
        """Mean squared error of the residuals.

        The sum of squared residuals divided by the residual dof.
        """
        return self._scalar('mse_resid', 'mse_resid')

    @property
    def mse_total(self):
        """Total mean squared error (statsmodels ``mse_total``)."""
        return self._scalar('mse_total', 'mse_total')

    @property
    def nobs(self):
        """Number of observations."""
        return self._scalar('nobs', 'nobs')

    @property
    def rss(self):
        """Residual sum of squares (statsmodels ``ssr``)."""
        return self._scalar('ssr', 'rss')

    @property
    def rsq(self):
        """R-squared of a model with an intercept.

        This is defined here as 1 - ssr/centered_tss if the constant is
        included in the model and 1 - ssr/uncentered_tss if the constant is
        omitted.
        """
        return self._scalar('rsquared', 'rsq')

    @property
    def rsq_adj(self):
        """Adjusted R-squared of a model with an intercept.

        This is defined here as 1 - (nobs-1)/df_resid * (1-rsquared) if a
        constant is included and 1 - nobs/df_resid * (1-rsquared) if no
        constant is included.
        """
        return self._scalar('rsquared_adj', 'rsq_adj')

    @property
    def tss(self):
        """Total (centered) sum of squares."""
        # The Series keeps the statsmodels attribute name rather than 'tss';
        # callers may already select on it, so it is left unchanged.
        return self._scalar('centered_tss', 'centered_tss')

    # 2d data (return type is pd.DataFrame)
    # For models with >1 exogenous variable, these properties consist of an
    # nx1 vector for each rolling period.  With a single Series regressor
    # they are returned as a pd.Series instead.
    # --------------------------------------------------------------------------

    @property
    def coefs(self):
        """The linear coefficients that minimize the least squares criterion.

        This is usually called Beta for the classical linear model.
        """
        return self._per_regressor('params', 'coefs')

    @property
    def pvalues(self):
        """Returns the coefficient p-values, one column per regressor."""
        return self._per_regressor('pvalues', 'pvalues')

    @property
    def tvalues(self):
        """Returns the coefficient t-statistics, one column per regressor."""
        return self._per_regressor('tvalues', 'tvalues')

    @property
    def stderrs(self):
        """The standard errors of the parameter estimates."""
        return self._per_regressor('bse', 'stderrs')

    # 3d data (return type is a MultiIndex pd.Series)
    # Note that pd.Panel was deprecated in 0.20.1
    # Each rolling period contributes one value per observation in its
    # window.
    # The "outer" index will be _result_idx (period-ending basis), with the
    # inner indices being the individual periods within each outer period.
    # --------------------------------------------------------------------------

    @property
    def fitted_values(self):
        """The predicted values for the original (unwhitened) design."""
        return Series(np.array(self._get('fittedvalues')).flatten(),
                      index=self._result_idx_3d,
                      name='fittedvalues')

    @property
    def resids(self):
        """The residuals of the model."""
        return Series(np.array(self._get('resid')).flatten(),
                      index=self._result_idx_3d,
                      name='resids')
0 commit comments