research stat arbitrage

alifanov · alifanov · commit c95907b88606 · 2017-08-21T09:00:48.000+03:00
diff --git a/stat_arbitrage/download_bars.py b/stat_arbitrage/download_bars.py
@@ -24,7 +24,7 @@ def __init__(self):
 
     def get_chart_data(self, pair):
         end = int(time.time())
-        start = end - 60*60*24*90
+        start = end - 60*60*24*120  # 120 days before `end`, expressed in seconds
         return self.api.returnChartData(pair, period=300, start=start, end=end)
 
 
diff --git a/stat_arbitrage/ols.py b/stat_arbitrage/ols.py
@@ -0,0 +1,300 @@
+import pandas as pd
+import statsmodels.formula.api as smf
+import numpy as np
+from pandas import Series, DataFrame
+
+__all__ = ['RollingOLS']
+
+
+def rwindows(a, window):
+    """Create rolling window blocks from a given array.
+
+    The shape of the result is meant to translate cleanly to pandas DataFrame
+    convention of computing rolling statistics for blocks.
+
+    Parameters
+    ==========
+    a : numpy.ndarray
+        Of ndim {1, 2}
+    window : int
+        The window size; must satisfy ``1 <= window <= a.shape[0]``
+
+    Returns
+    =======
+    blocks : ndarray
+        A read-only, higher-dimensional view containing each window (block)
+
+    Raises
+    ======
+    ValueError
+        If *window* is smaller than 1 or larger than the number of rows.
+
+    Shape of *a*            Shape of *blocks*
+    ============            =================
+    (x, )                   (x - window + 1, window, 1)
+    (x, y)                  (x - window + 1, window, y)
+
+    That is, each innermost element of the result is a window/block.
+    """
+
+    if a.ndim == 1:
+        a = a.reshape(-1, 1)
+    nobs = a.shape[0]
+    if not 1 <= window <= nobs:
+        raise ValueError('window must be between 1 and %d, got %r'
+                         % (nobs, window))
+    shape = nobs - window + 1, window, a.shape[-1]
+    # The new leading axis steps one row at a time, so it re-uses the row
+    #   stride of the input.
+    strides = (a.strides[0],) + a.strides
+    # Consecutive windows share memory; a read-only view prevents a single
+    #   write from silently changing several windows at once.
+    blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides,
+                                             writeable=False)
+    return blocks
+
+
+class RollingOLS(object):
+    """Provides rolling ordinary least squares (OLS) regression capability.
+
+    Note: this approach is designed to be functional and user-friendly.  It
+    works well on smaller (<10,000) datasets, but may create memory issues with
+    datasets >100,000 samples.  It works by creating a RegressionWrapper for
+    each rolling period, from which various regression attributes can be called.
+
+    Parameters
+    ==========
+    endog : Series
+        dependent variable
+    exog : Series or DataFrame
+        array of independent variable(s).  The values are handed to
+        statsmodels unchanged and ``fit`` passes ``hasconst=True``, so no
+        intercept column is added here.
+    window : int
+        window length
+    """
+
+    def __init__(self, endog, exog, window):
+        self.endog = endog
+        self.exog = exog
+        self.window = window
+        # One result per complete window, labelled by the window's last row.
+        self._result_idx = self.exog.index[self.window - 1:]
+
+        # Create a MultiIndex for 3-dimensional result data such as rolling
+        #   residuals and fitted values.
+        outer = np.repeat(self._result_idx.values, self.window, 0)
+        inner = rwindows(self.exog.index.values, self.window).flatten()
+        tups = list(zip(outer, inner))
+        self._result_idx_3d = pd.MultiIndex.from_tuples(tups,
+                                                        names=['Date Ending', 'Date'])
+
+    def fit(self):
+        """Container for RegressionResultsWrappers.
+
+        Full regression results are run once for each rolling window and
+        stored where various attributes can later be called.
+
+        Returns
+        =======
+        self : RollingOLS
+            Returned to enable method chaining.
+        """
+
+        # Imported locally so the class does not rely on the upper-case
+        #   ``OLS`` alias of statsmodels.formula.api, which is not available
+        #   in every statsmodels release.
+        from statsmodels.regression.linear_model import OLS
+
+        self.rendog = rwindows(self.endog.values, window=self.window)
+        self.rexog = rwindows(self.exog.values, window=self.window)
+        self.models = [OLS(y, x, hasconst=True).fit() for y, x in
+                       zip(self.rendog, self.rexog)]
+        # return self to enable method chaining
+        return self
+
+    def _get(self, attr):
+        """Call different regression attributes from statsmodels.OLS results.
+
+        Internal method used to call @cache_readonly results from each
+          RegressionResults wrapper.
+
+        Available attributes are here:
+        statsmodels.regression.linear_model.RegressionResults
+
+        Parameters
+        ==========
+        attr : str
+            string form of the attribute to call; example: 'tvalues'
+        """
+
+        return [getattr(n, attr) for n in self.models]
+
+    def _series(self, attr, name):
+        """Collect the scalar result *attr* of every window into a Series."""
+        return Series(self._get(attr), index=self._result_idx, name=name)
+
+    def _exog_columns(self):
+        """Column labels for per-coefficient results.
+
+        A Series ``exog`` has no ``columns`` attribute, so its ``name`` is
+        used as the single label instead.
+        """
+        if isinstance(self.exog, DataFrame):
+            return self.exog.columns
+        return [self.exog.name]
+
+    def _frame(self, attr):
+        """Collect the per-coefficient result *attr* into a DataFrame."""
+        return DataFrame(self._get(attr), index=self._result_idx,
+                         columns=self._exog_columns())
+
+    # 1d data (return type is pd.Series)
+    # These properties consist of a scalar for each rolling period.
+    # --------------------------------------------------------------------------
+
+    @property
+    def aic(self):
+        """Akaike information criterion."""
+        return self._series('aic', 'aic')
+
+    @property
+    def bic(self):
+        """Bayesian information criterion."""
+        return self._series('bic', 'bic')
+
+    @property
+    def condition_number(self):
+        """Condition number of the exogenous matrix."""
+        return self._series('condition_number', 'condition_number')
+
+    @property
+    def df_model(self):
+        """Model (regression) degrees of freedom (dof)."""
+        return self._series('df_model', 'df_model')
+
+    @property
+    def df_resid(self):
+        """Residual degrees of freedom (dof)."""
+        return self._series('df_resid', 'df_resid')
+
+    @property
+    def df_total(self):
+        """Total degrees of freedom (dof)."""
+        return self.df_model + self.df_resid
+
+    @property
+    def ess(self):
+        """Explained sum of squares (statsmodels ``ess``)."""
+        return self._series('ess', 'ess')
+
+    @property
+    def fstat(self):
+        """F-statistic of the fully specified model.
+
+        Calculated as the mean squared error of the model divided by the
+        mean squared error of the residuals.
+        """
+
+        return self._series('fvalue', 'fstat')
+
+    @property
+    def f_pvalue(self):
+        """p-value associated with the F-statistic."""
+        return self._series('f_pvalue', 'f_pvalue')
+
+    @property
+    def mse_model(self):
+        """Mean squared error of the model.
+
+        The explained sum of squares divided by the model dof.
+        """
+
+        return self._series('mse_model', 'mse_model')
+
+    @property
+    def mse_resid(self):
+        """Mean squared error of the residuals.
+
+        The sum of squared residuals divided by the residual dof.
+        """
+
+        return self._series('mse_resid', 'mse_resid')
+
+    @property
+    def mse_total(self):
+        """Total mean squared error (statsmodels ``mse_total``)."""
+
+        return self._series('mse_total', 'mse_total')
+
+    @property
+    def nobs(self):
+        """Number of observations."""
+        return self._series('nobs', 'nobs')
+
+    @property
+    def rss(self):
+        """Residual sum of squares (statsmodels ``ssr``)."""
+        return self._series('ssr', 'rss')
+
+    @property
+    def rsq(self):
+        """R-squared of a model with an intercept.
+
+        This is defined here as 1 - ssr/centered_tss if the constant is
+        included in the model and 1 - ssr/uncentered_tss if the constant is
+        omitted.
+        """
+        return self._series('rsquared', 'rsq')
+
+    @property
+    def rsq_adj(self):
+        """Adjusted R-squared of a model with an intercept.
+
+        This is defined here as 1 - (nobs-1)/df_resid * (1-rsquared) if a
+        constant is included and 1 - nobs/df_resid * (1-rsquared) if no
+        constant is included.
+        """
+        return self._series('rsquared_adj', 'rsq_adj')
+
+    @property
+    def tss(self):
+        """Centered total sum of squares (statsmodels ``centered_tss``)."""
+        return self._series('centered_tss', 'centered_tss')
+
+    # 2d data (return type is pd.DataFrame)
+    # For models with >1 exogenous variable, these properties consist of an
+    #   nx1 vector for each rolling period.
+    # --------------------------------------------------------------------------
+
+    @property
+    def coefs(self):
+        """The linear coefficients that minimize the least squares criterion.
+
+        This is usually called Beta for the classical linear model.  A
+        DataFrame is returned for a DataFrame ``exog``; for a Series ``exog``
+        the single coefficient per window is returned as a Series of scalars.
+        """
+
+        if isinstance(self.exog, DataFrame):
+            return self._frame('params')
+        # Each window yields a one-element vector; flatten them so the
+        #   Series holds plain numbers rather than boxed arrays.
+        return pd.Series(np.ravel(self._get('params')),
+                         index=self._result_idx)
+
+    @property
+    def pvalues(self):
+        """Returns the coefficient p-values in DataFrame form."""
+        return self._frame('pvalues')
+
+    @property
+    def tvalues(self):
+        """Returns the coefficient t-statistics in DataFrame form."""
+        return self._frame('tvalues')
+
+    @property
+    def stderrs(self):
+        """The standard errors of the parameter estimates."""
+        return self._frame('bse')
+
+    # 3d data (return type is a MultiIndex pd.Series)
+    # Note that pd.Panel was deprecated in 0.20.1
+    # For models with >1 exogenous variable, these properties consist of an
+    #   nxm vector for each rolling period.
+    # The "outer" index will be _result_idx (period-ending basis), with the
+    #   inner indices being the individual periods within each outer period.
+    # --------------------------------------------------------------------------
+
+    @property
+    def fitted_values(self):
+        """The predicted values for the original (unwhitened) design."""
+        return Series(np.array(self._get('fittedvalues')).flatten(),
+                      index=self._result_idx_3d,
+                      name='fittedvalues')
+
+    @property
+    def resids(self):
+        """The residuals of the model."""
+        return Series(np.array(self._get('resid')).flatten(),
+                      index=self._result_idx_3d,
+                      name='resids')
diff --git a/stat_arbitrage/test_start_from_quant_strat.py b/stat_arbitrage/test_start_from_quant_strat.py

-Original file line number
+Diff line change
 +import matplotlib.pyplot as plt
 +import numpy as np
 +import os, os.path
 +import pandas as pd
 +import statsmodels.api as sm
++
 +from sklearn import linear_model
++
++
 +def rolling_beta(X, y, idx, window=100):
 +    """Fit ``y ~ X`` on consecutive windows and collect the slopes.
 +    Window k covers rows ``k`` .. ``k + window - 1`` and its slope is
 +    labelled with ``idx[k + window]``, i.e. the row right after the
 +    window; the window that would end on the final row is not fitted.
 +    Returns a DataFrame with a single 'beta' column."""
 +    assert len(X) == len(y)
++
 +    regressor = linear_model.LinearRegression()
 +    labels = []
 +    slopes = []
++
 +    for first in range(len(X) - window):
 +        last = first + window
++
 +        # sklearn expects 2-D inputs, hence the single-column reshape
 +        features = X[first:last].values.reshape(-1, 1)
 +        targets = y[first:last].values.reshape(-1, 1)
 +        regressor.fit(features, targets)
++
 +        labels.append(idx[last])
 +        slopes.append(regressor.coef_[0][0])
++
 +    return pd.DataFrame({'beta': slopes}, index=labels)
++
++
 +def create_pairs_dataframe(datadir, symbols):
 +    """Creates a pandas DataFrame containing the closing price
 +    of a pair of symbols based on CSV files containing a datetime
 +    stamp and OHLC data.
++
 +    datadir is the directory holding 'BTC_<symbol>.csv' files and
 +    symbols is any two-item sequence (tuple or list) of symbol names.
 +    The result is indexed like the first symbol's file, has one
 +    '<symbol>_close' column per symbol (lower-cased) and keeps only
 +    the rows where both closes are present."""
++
 +    def _read_bars(symbol):
 +        # The first CSV row is a header that is replaced by these names;
 +        # the first column becomes the index.
 +        return pd.read_csv(os.path.join(datadir, 'BTC_%s.csv' % symbol),
 +                           header=0, index_col=0,
 +                           names=['date', 'open', 'high', 'low', 'close'])
++
 +    first, second = symbols[0], symbols[1]
++
 +    # Open the individual CSV files and read into pandas DataFrames
 +    print("Importing CSV data...")
 +    sym1 = _read_bars(first)
 +    sym2 = _read_bars(second)
++
 +    # Create a pandas DataFrame with the close prices of each symbol
 +    # correctly aligned and dropping missing entries.  The explicit
 +    # tuple keeps the %-formatting working when symbols is a list.
 +    print("Constructing dual matrix for %s and %s..." % (first, second))
 +    pairs = pd.DataFrame(index=sym1.index)
 +    pairs['%s_close' % first.lower()] = sym1['close']
 +    pairs['%s_close' % second.lower()] = sym2['close']
 +    pairs = pairs.dropna()
 +    return pairs
++
++
 +def calculate_spread_zscore(pairs, symbols, lookback=100):
 +    """Creates a hedge ratio between the two symbols by calculating
 +    a rolling linear regression with a defined lookback period. This
 +    is then used to create a z-score of the 'spread' between the two
 +    symbols based on a linear combination of the two.
++
 +    pairs must hold '<symbol>_close' columns for both symbols; a
 +    'hedge_ratio' column is added to the frame that is passed in.
 +    The returned frame is a copy without the rows that have no hedge
 +    ratio and with 'spread' and 'zscore' columns added. Note that the
 +    z-score is normalised with the mean and standard deviation of the
 +    whole spread series, not of a rolling window."""
++
 +    s0 = symbols[0].lower()
 +    s1 = symbols[1].lower()
 +    close0 = '%s_close' % s0
 +    close1 = '%s_close' % s1
++
 +    print("Fitting the rolling Linear Regression...")
++
 +    # The spread below is s0 - hedge_ratio * s1, so the hedge ratio has
 +    # to be the slope of s0 regressed on s1: s1 is the regressor (X)
 +    # and s0 the response (y).
 +    ols = rolling_beta(pairs[close1],
 +                       pairs[close0],
 +                       pairs.index,
 +                       window=lookback)
++
 +    # Construct the hedge ratio and eliminate the first
 +    # lookback-length empty/NaN period.  The explicit copy makes the
 +    # column assignments below act on an independent frame.
 +    pairs['hedge_ratio'] = ols['beta']
 +    pairs = pairs.dropna().copy()
++
 +    # Create the spread and then a z-score of the spread
 +    print("Creating the spread/zscore columns...")
 +    pairs['spread'] = pairs[close0] - pairs['hedge_ratio'] * pairs[close1]
 +    pairs['zscore'] = (pairs['spread'] - np.mean(pairs['spread'])) / np.std(pairs['spread'])
 +    return pairs
++
++
 +def create_long_short_market_signals(pairs, symbols,
 +                                     z_entry_threshold=2.0,
 +                                     z_exit_threshold=1.0):
 +    """Create the entry/exit signals based on the z-score exceeding
 +    z_entry_threshold for entering a position and falling below
 +    z_exit_threshold for exiting a position.
++
 +    pairs must contain a 'zscore' column. The columns 'longs',
 +    'shorts', 'exits', 'long_market' and 'short_market' are added to
 +    pairs in place and the same frame is returned. symbols is accepted
 +    for interface compatibility but is not used."""
++
 +    # Calculate when to be long, short and when to exit
 +    pairs['longs'] = (pairs['zscore'] <= -z_entry_threshold) * 1.0
 +    pairs['shorts'] = (pairs['zscore'] >= z_entry_threshold) * 1.0
 +    pairs['exits'] = (np.abs(pairs['zscore']) <= z_exit_threshold) * 1.0
++
 +    # A position has to be carried forward bar by bar: once entered it
 +    # is held while the z-score is inside the entry threshold but still
 +    # outside the exit threshold, and vice versa for shorts.
 +    n_bars = len(pairs)
 +    long_state = np.zeros(n_bars)
 +    short_state = np.zeros(n_bars)
++
 +    # These variables track whether to be long or short while
 +    # iterating through the bars
 +    long_market = 0
 +    short_market = 0
++
 +    # The state machine is inherently sequential, so it stays a loop,
 +    # but it runs over plain arrays and writes by position. Writing by
 +    # index label would touch every row sharing a duplicated label.
 +    print("Calculating when to be in the market (long and short)...")
 +    flags = zip(pairs['longs'].values,
 +                pairs['shorts'].values,
 +                pairs['exits'].values)
 +    for i, (go_long, go_short, do_exit) in enumerate(flags):
 +        if go_long == 1.0:
 +            long_market = 1
 +        if go_short == 1.0:
 +            short_market = 1
 +        # An exit overrides an entry on the same bar
 +        if do_exit == 1.0:
 +            long_market = 0
 +            short_market = 0
 +        long_state[i] = long_market
 +        short_state[i] = short_market
++
 +    # 1.0 / 0.0 per bar, telling the strategy when to stay in the market
 +    pairs['long_market'] = long_state
 +    pairs['short_market'] = short_state
 +    return pairs
++
++
 +def create_portfolio_returns(pairs, symbols):
 +    """Creates a portfolio pandas DataFrame which keeps track of
 +    the account equity and ultimately generates an equity curve.
 +    This can be used to generate drawdown and risk/reward ratios.
++
 +    pairs must contain 'long_market', 'short_market' and the two
 +    '<symbol>_close' columns. The result shares the index of pairs
 +    and has the columns 'positions', one column per lower-cased
 +    symbol, 'total' and 'returns' (the cumulative equity curve)."""
++
 +    # Convenience variables for symbols
 +    sym1 = symbols[0].lower()
 +    sym2 = symbols[1].lower()
++
 +    # Construct the portfolio object with positions information.
 +    # positions is +1 when long, -1 when short and 0 when flat; the
 +    # first symbol's leg carries the opposite sign of the second's.
 +    print("Constructing a portfolio...")
 +    portfolio = pd.DataFrame(index=pairs.index)
 +    portfolio['positions'] = pairs['long_market'] - pairs['short_market']
 +    portfolio[sym1] = -1.0 * pairs['%s_close' % sym1] * portfolio['positions']
 +    portfolio[sym2] = pairs['%s_close' % sym2] * portfolio['positions']
 +    portfolio['total'] = portfolio[sym1] + portfolio[sym2]
++
 +    # Construct a percentage returns stream and neutralise the NaN,
 +    # -inf/+inf and -100% cells that appear when a position is opened
 +    # or closed.  The clean-up works on a local Series and is assigned
 +    # back: chained inplace calls on a selected column are not
 +    # guaranteed to modify the parent frame.
 +    print("Constructing the equity curve...")
 +    returns = portfolio['total'].pct_change()
 +    returns = returns.fillna(0.0)
 +    returns = returns.replace([np.inf, -np.inf], 0.0)
 +    returns = returns.replace(-1.0, 0.0)
++
 +    # Calculate the full equity curve
 +    portfolio['returns'] = (returns + 1.0).cumprod()
 +    return portfolio
++
++
 +if __name__ == "__main__":
 +    # Script entry point: build the pair frame, compute the z-score and
 +    # signals for one lookback, then plot the resulting equity curve.
 +    datadir = 'datasets/'  # Change this to reflect your data path!
 +    symbols = ('ETC', 'LTC')
++
 +    # Disabled lookback sweep (10 to 300 in steps of 10):
 +    # lookbacks = range(10, 310, 10)
 +    # returns = []
++
 +    lb = 20
++
 +    # The disabled loop below would repeat the steps that follow for
 +    # every lookback in order to produce sensitivities; the script
 +    # currently runs them once with the fixed lb above.
 +    # for lb in lookbacks:
 +    print("Calculating lookback=%s..." % lb)
 +    pairs = create_pairs_dataframe(datadir, symbols)
 +    pairs = calculate_spread_zscore(pairs, symbols, lookback=lb)
 +    pairs = create_long_short_market_signals(pairs, symbols,
 +                                             z_entry_threshold=2.0,
 +                                             z_exit_threshold=1.0)
++
 +    portfolio = create_portfolio_returns(pairs, symbols)
 +    portfolio['returns'].plot()
 +    plt.show()
 +    # Part of the disabled sweep: record the final equity per lookback.
 +    # returns.append(portfolio.iloc[-1]['returns'])
 +    print()
++
 +    # Part of the disabled sweep: report and plot the best lookback.
 +    # print("Plot the lookback-performance scatterchart...")
 +    # print('Best lookback: {}'.format(lookbacks[returns.index(max(returns))]))
 +    # plt.plot(lookbacks, returns, '-o')
 +    # plt.show()
 +#
++
 +# Author's note: best lookback: 20 (presumably the result of an earlier sweep)