Skip to content

Commit c95907b

Browse files
committedAug 21, 2017
research stat arbitrage
1 parent b193aae commit c95907b

File tree

3 files changed

+499
-1
lines changed

3 files changed

+499
-1
lines changed
 

‎stat_arbitrage/download_bars.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __init__(self):
2424

2525
def get_chart_data(self, pair):
2626
end = int(time.time())
27-
start = end - 60*60*24*90
27+
start = end - 60*60*24*120
2828
return self.api.returnChartData(pair, period=300, start=start, end=end)
2929

3030

‎stat_arbitrage/ols.py

+300
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
import pandas as pd
2+
import statsmodels.formula.api as smf
3+
import numpy as np
4+
from pandas import Series, DataFrame
5+
6+
__all__ = ['RollingOLS']
7+
8+
9+
def rwindows(a, window):
10+
"""Create rolling window blocks from a given array.
11+
12+
The shape of the result is meant to translate cleanly to pandas DataFrame
13+
convention of computing rolling statistics for blocks.
14+
15+
Parameters
16+
==========
17+
a : numpy.ndarray
18+
Of ndim {1, 2}
19+
window : int
20+
The window size
21+
22+
Returns
23+
=======
24+
blocks : ndarray
25+
A higher-dimensional array containing each window (block)
26+
27+
Shape of *a* Shape of *blocks*
28+
============ =================
29+
(x, ) (x - window + 1, window, 1)
30+
(x, y) (x - window + 1, window, y)
31+
... ...
32+
33+
That is, each innermost element of the result is a window/block.
34+
"""
35+
36+
if a.ndim == 1:
37+
a = a.reshape(-1, 1)
38+
shape = a.shape[0] - window + 1, window, a.shape[-1]
39+
strides = (a.strides[0],) + a.strides
40+
blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
41+
return blocks
42+
43+
44+
class RollingOLS(object):
45+
"""Provides rolling ordinary least squares (OLS) regression capability.
46+
47+
Note: this approach is designed to be functional and user-friendly. It
48+
works well on smaller (<10,000) datasets, but may create memory issues with
49+
datasets >100,000 samples. It works by creating a RegressionWrapper for
50+
each rolling period, from which various regression attributes can be called.
51+
52+
Parameters
53+
==========
54+
endog : Series
55+
dependent variable
56+
exog : Series or DataFrame
57+
array of independent variable(s)
58+
window : int
59+
window length
60+
has_intercept : bool, default True
61+
if False, an intercept column equal to 1 will be added to exog
62+
"""
63+
64+
def __init__(self, endog, exog, window):
65+
self.endog = endog
66+
self.exog = exog
67+
self.window = window
68+
self._result_idx = self.exog.index[self.window - 1:]
69+
70+
# Create a MultiIndex for 3-dimensional result data such as rolling
71+
# residuals and fitted values.
72+
outer = np.repeat(self._result_idx.values, self.window, 0)
73+
inner = rwindows(self.exog.index.values, self.window).flatten()
74+
tups = list(zip(outer, inner))
75+
self._result_idx_3d = pd.MultiIndex.from_tuples(tups,
76+
names=['Date Ending', 'Date'])
77+
78+
def fit(self):
79+
"""Container for RegressionResultsWrappers.
80+
81+
Full regression results are ran once for each rolling window and
82+
stored where various attributes can later be called.
83+
"""
84+
85+
self.rendog = rwindows(self.endog.values, window=self.window)
86+
self.rexog = rwindows(self.exog.values, window=self.window)
87+
self.models = [smf.OLS(y, x, hasconst=True).fit() for y, x in
88+
zip(self.rendog, self.rexog)]
89+
# return self to enable method chaining
90+
return self
91+
92+
def _get(self, attr):
93+
"""Call different regression attributes from statsmodels.OLS results.
94+
95+
Internal method used to call @cache_readonly results from each
96+
RegressionResults wrapper.
97+
98+
Available attributes are here:
99+
statsmodels.regression.linear_model.RegressionResults
100+
101+
Parameters
102+
==========
103+
attr : str
104+
string form of the attribute to call; example: 'tvalues'
105+
"""
106+
107+
return [getattr(n, attr) for n in self.models]
108+
109+
# 1d data (return type is pd.Series)
110+
# These properties consist of a scalar for each rolling period.
111+
# --------------------------------------------------------------------------
112+
113+
@property
114+
def aic(self):
115+
"""Akaike information criterion."""
116+
return Series(self._get('aic'), index=self._result_idx,
117+
name='aic')
118+
119+
@property
120+
def bic(self):
121+
"""Bayesian information criterion."""
122+
return Series(self._get('bic'), index=self._result_idx,
123+
name='bic')
124+
125+
@property
126+
def condition_number(self):
127+
"""Return condition number of exogenous matrix.
128+
129+
Calculated as ratio of largest to smallest eigenvalue.
130+
"""
131+
return Series(self._get('condition_number'), index=self._result_idx,
132+
name='condition_number')
133+
134+
@property
135+
def df_model(self):
136+
"""Model (regression) degrees of freedom (dof)."""
137+
return Series(self._get('df_model'), index=self._result_idx,
138+
name='df_model')
139+
140+
@property
141+
def df_resid(self):
142+
"""Residual degrees of freedom (dof)."""
143+
return Series(self._get('df_resid'), index=self._result_idx,
144+
name='df_resid')
145+
146+
@property
147+
def df_total(self):
148+
"""Total degrees of freedom (dof)."""
149+
return self.df_model + self.df_resid
150+
151+
@property
152+
def ess(self):
153+
"""Error sum of squares (sum of squared residuals)."""
154+
return Series(self._get('ess'), index=self._result_idx,
155+
name='ess')
156+
157+
@property
158+
def fstat(self):
159+
"""F-statistic of the fully specified model.
160+
161+
Calculated as the mean squared error of the model divided by the
162+
mean squared error of the residuals.
163+
"""
164+
165+
return Series(self._get('fvalue'), index=self._result_idx,
166+
name='fstat')
167+
168+
@property
169+
def f_pvalue(self):
170+
"""p-value associated with the F-statistic."""
171+
return Series(self._get('f_pvalue'), index=self._result_idx,
172+
name='f_pvalue')
173+
174+
@property
175+
def mse_model(self):
176+
"""Mean squared error of the model.
177+
178+
The explained sum of squares divided by the model dof.
179+
"""
180+
181+
return Series(self._get('mse_model'), index=self._result_idx,
182+
name='mse_model')
183+
184+
@property
185+
def mse_resid(self):
186+
"""Mean squared error of the residuals.
187+
188+
The sum of squared residuals divided by the residual dof.
189+
"""
190+
191+
return Series(self._get('mse_resid'), index=self._result_idx,
192+
name='mse_resid')
193+
194+
@property
195+
def mse_total(self):
196+
"""Total mean squared error.
197+
198+
The uncentered total sum of squares divided by nobs.
199+
"""
200+
201+
return Series(self._get('mse_total'), index=self._result_idx,
202+
name='mse_total')
203+
204+
@property
205+
def nobs(self):
206+
"""Number of observations."""
207+
return Series(self._get('nobs'), index=self._result_idx,
208+
name='nobs')
209+
210+
@property
211+
def rss(self):
212+
"""Regression sum of squares."""
213+
return Series(self._get('ssr'), index=self._result_idx,
214+
name='rss')
215+
216+
@property
217+
def rsq(self):
218+
"""R-squared of a model with an intercept.
219+
220+
This is defined here as 1 - ssr/centered_tss if the constant is
221+
included in the model and 1 - ssr/uncentered_tss if the constant is
222+
omitted.
223+
"""
224+
return Series(self._get('rsquared'), index=self._result_idx,
225+
name='rsq')
226+
227+
@property
228+
def rsq_adj(self):
229+
"""Adjusted R-squared of a model with an intercept.
230+
231+
This is defined here as 1 - (nobs-1)/df_resid * (1-rsquared) if a
232+
constant is included and 1 - nobs/df_resid * (1-rsquared) if no
233+
constant is included.
234+
"""
235+
return Series(self._get('rsquared_adj'), index=self._result_idx,
236+
name='rsq_adj')
237+
238+
@property
239+
def tss(self):
240+
"""Total sum of squares."""
241+
return Series(self._get('centered_tss'), index=self._result_idx,
242+
name='centered_tss')
243+
244+
# 2d data (return type is pd.DataFrame)
245+
# For models with >1 exogenous variable, these properties consist of an
246+
# nx1 vector for each rolling period.
247+
# --------------------------------------------------------------------------
248+
249+
@property
250+
def coefs(self):
251+
"""The linear coefficients that minimize the least squares criterion.
252+
253+
This is usually called Beta for the classical linear model.
254+
"""
255+
256+
if isinstance(self.exog, DataFrame):
257+
return DataFrame(self._get('params'), index=self._result_idx,
258+
columns=self.exog.columns)
259+
else:
260+
return pd.Series(self._get('params'), index=self._result_idx)
261+
262+
@property
263+
def pvalues(self):
264+
"""Returns the coefficient p-values in DataFrame form."""
265+
return DataFrame(self._get('pvalues'), index=self._result_idx,
266+
columns=self.exog.columns)
267+
268+
@property
269+
def tvalues(self):
270+
"""Returns the coefficient t-statistics in DataFrame form."""
271+
return DataFrame(self._get('tvalues'), index=self._result_idx,
272+
columns=self.exog.columns)
273+
274+
@property
275+
def stderrs(self):
276+
"""The standard errors of the parameter estimates."""
277+
return DataFrame(self._get('bse'), index=self._result_idx,
278+
columns=self.exog.columns)
279+
280+
# 3d data (return type is a MultiIndex pd.DataFrame)
281+
# Note that pd.Panel was deprecated in 0.20.1
282+
# For models with >1 exogenous variable, these properties consist of an
283+
# nxm vector for each rolling period.
284+
# The "outer" index will be _result_idx (period-ending basis), with the
285+
# inner indices being the individual periods within each outer period.
286+
# --------------------------------------------------------------------------
287+
288+
@property
289+
def fitted_values(self):
290+
"""The predicted the values for the original (unwhitened) design."""
291+
return Series(np.array(self._get('fittedvalues')).flatten(),
292+
index=self._result_idx_3d,
293+
name='fittedvalues')
294+
295+
@property
296+
def resids(self):
297+
"""The residuals of the model."""
298+
return Series(np.array(self._get('resid')).flatten(),
299+
index=self._result_idx_3d,
300+
name='resids')
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
import matplotlib.pyplot as plt
2+
import numpy as np
3+
import os, os.path
4+
import pandas as pd
5+
import statsmodels.api as sm
6+
7+
from sklearn import linear_model
8+
9+
10+
def rolling_beta(X, y, idx, window=100):
11+
assert len(X) == len(y)
12+
13+
out_dates = []
14+
out_beta = []
15+
16+
model_ols = linear_model.LinearRegression()
17+
18+
for iStart in range(0, len(X) - window):
19+
iEnd = iStart + window
20+
21+
_x = X[iStart:iEnd].values.reshape(-1, 1)
22+
_y = y[iStart:iEnd].values.reshape(-1, 1)
23+
24+
model_ols.fit(_x, _y)
25+
26+
# store output
27+
out_dates.append(idx[iEnd])
28+
out_beta.append(model_ols.coef_[0][0])
29+
30+
return pd.DataFrame({'beta': out_beta}, index=out_dates)
31+
32+
33+
def create_pairs_dataframe(datadir, symbols):
34+
"""Creates a pandas DataFrame containing the closing price
35+
of a pair of symbols based on CSV files containing a datetime
36+
stamp and OHLCV data."""
37+
38+
# Open the individual CSV files and read into pandas DataFrames
39+
print("Importing CSV data...")
40+
sym1 = pd.read_csv(os.path.join(datadir, 'BTC_%s.csv' % symbols[0]),
41+
header=0, index_col=0,
42+
names=['date', 'open', 'high', 'low', 'close'])
43+
sym2 = pd.read_csv(os.path.join(datadir, 'BTC_%s.csv' % symbols[1]),
44+
header=0, index_col=0,
45+
names=['date', 'open', 'high', 'low', 'close'])
46+
47+
# Create a pandas DataFrame with the close prices of each symbol
48+
# correctly aligned and dropping missing entries
49+
print("Constructing dual matrix for %s and %s..." % symbols)
50+
pairs = pd.DataFrame(index=sym1.index)
51+
pairs['%s_close' % symbols[0].lower()] = sym1['close']
52+
pairs['%s_close' % symbols[1].lower()] = sym2['close']
53+
pairs = pairs.dropna()
54+
return pairs
55+
56+
57+
def calculate_spread_zscore(pairs, symbols, lookback=100):
58+
"""Creates a hedge ratio between the two symbols by calculating
59+
a rolling linear regression with a defined lookback period. This
60+
is then used to create a z-score of the 'spread' between the two
61+
symbols based on a linear combination of the two."""
62+
63+
# Use the pandas Ordinary Least Squares method to fit a rolling
64+
# linear regression between the two closing price time series
65+
s0 = symbols[0].lower()
66+
s1 = symbols[1].lower()
67+
68+
print("Fitting the rolling Linear Regression...")
69+
70+
ols = rolling_beta(pairs['%s_close' % s0],
71+
pairs['%s_close' % s1],
72+
pairs.index,
73+
window=lookback)
74+
75+
# Construct the hedge ratio and eliminate the first
76+
# lookback-length empty/NaN period
77+
pairs['hedge_ratio'] = ols['beta']
78+
pairs = pairs.dropna()
79+
80+
# Create the spread and then a z-score of the spread
81+
print("Creating the spread/zscore columns...")
82+
pairs['hedge_ratio'] = [v for v in pairs['hedge_ratio'].values]
83+
pairs['spread'] = pairs['{}_close'.format(s0)] - pairs['hedge_ratio'] * pairs['{}_close'.format(s1)]
84+
pairs['zscore'] = (pairs['spread'] - np.mean(pairs['spread'])) / np.std(pairs['spread'])
85+
return pairs
86+
87+
88+
def create_long_short_market_signals(pairs, symbols,
89+
z_entry_threshold=2.0,
90+
z_exit_threshold=1.0):
91+
"""Create the entry/exit signals based on the exceeding of
92+
z_enter_threshold for entering a position and falling below
93+
z_exit_threshold for exiting a position."""
94+
95+
# Calculate when to be long, short and when to exit
96+
pairs['longs'] = (pairs['zscore'] <= -z_entry_threshold) * 1.0
97+
pairs['shorts'] = (pairs['zscore'] >= z_entry_threshold) * 1.0
98+
pairs['exits'] = (np.abs(pairs['zscore']) <= z_exit_threshold) * 1.0
99+
100+
# These signals are needed because we need to propagate a
101+
# position forward, i.e. we need to stay long if the zscore
102+
# threshold is less than z_entry_threshold by still greater
103+
# than z_exit_threshold, and vice versa for shorts.
104+
pairs['long_market'] = 0.0
105+
pairs['short_market'] = 0.0
106+
107+
# These variables track whether to be long or short while
108+
# iterating through the bars
109+
long_market = 0
110+
short_market = 0
111+
112+
# Calculates when to actually be "in" the market, i.e. to have a
113+
# long or short position, as well as when not to be.
114+
# Since this is using iterrows to loop over a dataframe, it will
115+
# be significantly less efficient than a vectorised operation,
116+
# i.e. slow!
117+
print("Calculating when to be in the market (long and short)...")
118+
for i, b in enumerate(pairs.iterrows()):
119+
# Calculate longs
120+
if b[1]['longs'] == 1.0:
121+
long_market = 1
122+
# Calculate shorts
123+
if b[1]['shorts'] == 1.0:
124+
short_market = 1
125+
# Calculate exists
126+
if b[1]['exits'] == 1.0:
127+
long_market = 0
128+
short_market = 0
129+
# This directly assigns a 1 or 0 to the long_market/short_market
130+
# columns, such that the strategy knows when to actually stay in!
131+
pairs.loc[pairs.index[i], 'long_market'] = long_market
132+
pairs.loc[pairs.index[i], 'short_market'] = short_market
133+
return pairs
134+
135+
136+
def create_portfolio_returns(pairs, symbols):
137+
"""Creates a portfolio pandas DataFrame which keeps track of
138+
the account equity and ultimately generates an equity curve.
139+
This can be used to generate drawdown and risk/reward ratios."""
140+
141+
# Convenience variables for symbols
142+
sym1 = symbols[0].lower()
143+
sym2 = symbols[1].lower()
144+
145+
# Construct the portfolio object with positions information
146+
# Note that minuses to keep track of shorts!
147+
print("Constructing a portfolio...")
148+
portfolio = pd.DataFrame(index=pairs.index)
149+
portfolio['positions'] = pairs['long_market'] - pairs['short_market']
150+
portfolio[sym1] = -1.0 * pairs['%s_close' % sym1] * portfolio['positions']
151+
portfolio[sym2] = pairs['%s_close' % sym2] * portfolio['positions']
152+
portfolio['total'] = portfolio[sym1] + portfolio[sym2]
153+
154+
# Construct a percentage returns stream and eliminate all
155+
# of the NaN and -inf/+inf cells
156+
print("Constructing the equity curve...")
157+
portfolio['returns'] = portfolio['total'].pct_change()
158+
portfolio['returns'].fillna(0.0, inplace=True)
159+
portfolio['returns'].replace([np.inf, -np.inf], 0.0, inplace=True)
160+
portfolio['returns'].replace(-1.0, 0.0, inplace=True)
161+
162+
# Calculate the full equity curve
163+
portfolio['returns'] = (portfolio['returns'] + 1.0).cumprod()
164+
return portfolio
165+
166+
167+
if __name__ == "__main__":
168+
datadir = 'datasets/' # Change this to reflect your data path!
169+
symbols = ('ETC', 'LTC')
170+
171+
# lookbacks = range(10, 310, 10)
172+
# returns = []
173+
174+
lb = 20
175+
176+
# Adjust lookback period from 50 to 200 in increments
177+
# of 10 in order to produce sensitivities
178+
# for lb in lookbacks:
179+
print("Calculating lookback=%s..." % lb)
180+
pairs = create_pairs_dataframe(datadir, symbols)
181+
pairs = calculate_spread_zscore(pairs, symbols, lookback=lb)
182+
pairs = create_long_short_market_signals(pairs, symbols,
183+
z_entry_threshold=2.0,
184+
z_exit_threshold=1.0)
185+
186+
portfolio = create_portfolio_returns(pairs, symbols)
187+
portfolio['returns'].plot()
188+
plt.show()
189+
# returns.append(portfolio.iloc[-1]['returns'])
190+
print()
191+
192+
# print("Plot the lookback-performance scatterchart...")
193+
# print('Best lookback: {}'.format(lookbacks[returns.index(max(returns))]))
194+
# plt.plot(lookbacks, returns, '-o')
195+
# plt.show()
196+
#
197+
198+
# best lookback: 20

0 commit comments

Comments
 (0)
Please sign in to comment.