-
Notifications
You must be signed in to change notification settings - Fork 0
/
MLBacktester.py
126 lines (105 loc) · 4.9 KB
/
MLBacktester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Import-time side effect: switch matplotlib to its "seaborn-v0_8" style sheet,
# so every figure drawn after this module is imported uses that style.
plt.style.use("seaborn-v0_8")
class MLBacktester():
    ''' Class for the vectorized backtesting of Machine Learning-based trading strategies (Classification).

    A logistic regression is fitted on lagged log returns to predict the sign
    of the next bar's return; the predicted sign is used as the position.

    Attributes
    ----------
    data: DataFrame
        columns "price" and "returns" (log returns) for the selected symbol
    data_subset: DataFrame
        the slice (incl. lag columns) most recently built by prepare_features()
    feature_columns: list of str
        names of the lag columns used as model features
    results: DataFrame or None
        output of the last test_strategy() run (None before the first run)
    '''

    # Default CSV source; can be overridden per instance via __init__(data_file=...).
    data_file = "five_minute_pairs.csv"

    def __init__(self, symbol, start, end, tc, data_file = None):
        '''
        Parameters
        ----------
        symbol: str
            ticker symbol (instrument) to be backtested
        start: str
            start date for data import
        end: str
            end date for data import
        tc: float
            proportional transaction/trading costs per trade
        data_file: str, optional
            path of the CSV file to read prices from; it needs a "time" column
            and one column per symbol. Defaults to "five_minute_pairs.csv".
        '''
        self.symbol = symbol
        self.start = start
        self.end = end
        self.tc = tc
        if data_file is not None:
            self.data_file = data_file
        # NOTE(review): scikit-learn deprecated the `multi_class` argument in
        # release 1.5 -- confirm the installed version still accepts it before
        # upgrading (OneVsRestClassifier is the documented replacement).
        self.model = LogisticRegression(C = 1e6, max_iter = 100000, multi_class = "ovr")
        self.results = None
        self.get_data()

    def __repr__(self):
        rep = "MLBacktester(symbol = {}, start = {}, end = {}, tc = {})"
        return rep.format(self.symbol, self.start, self.end, self.tc)

    def get_data(self):
        ''' Imports the price data from the CSV file in self.data_file and stores
        prices plus log returns in self.data.

        Raises
        ------
        KeyError
            if the file has no column named like self.symbol.
        '''
        raw = pd.read_csv(self.data_file, parse_dates = ["time"], index_col = "time")
        if self.symbol not in raw.columns:
            raise KeyError("symbol {!r} not found in {!r}".format(self.symbol, self.data_file))
        # drop missing quotes first, then restrict to the requested period
        prices = raw[self.symbol].dropna().loc[self.start:self.end]
        data = prices.to_frame(name = "price")
        # log returns; the first row has no predecessor and therefore stays NaN
        data["returns"] = np.log(data["price"] / data["price"].shift(1))
        self.data = data

    def split_data(self, start, end):
        ''' Returns an independent copy of self.data between start and end
        (label-based slice, both bounds included).
        '''
        return self.data.loc[start:end].copy()

    def prepare_features(self, start, end):
        ''' Builds the lagged-return feature columns for the period start..end.

        The result is stored in self.data_subset and the column names in
        self.feature_columns. Requires self.lags to be set (test_strategy()
        does this). Rows with any missing value are dropped.
        '''
        self.data_subset = self.split_data(start, end)
        self.feature_columns = []
        for lag in range(1, self.lags + 1):
            col = "lag{}".format(lag)
            self.data_subset[col] = self.data_subset["returns"].shift(lag)
            self.feature_columns.append(col)
        # removes the leading rows that lack a full lag history (and NaN returns)
        self.data_subset.dropna(inplace = True)

    def fit_model(self, start, end):
        ''' Fits the model on the period start..end, using the lag columns as
        features and the sign of the return (-1, 0, 1) as target.
        '''
        self.prepare_features(start, end)
        self.model.fit(self.data_subset[self.feature_columns], np.sign(self.data_subset["returns"]))

    def test_strategy(self, train_ratio = 0.7, lags = 5):
        '''
        Backtests the ML-based strategy.

        Parameters
        ----------
        train_ratio: float (between 0 and 1.0 excl.)
            Splitting the dataset into training set (train_ratio) and test set (1 - train_ratio).
        lags: int
            number of lags serving as model features.

        Returns
        -------
        tuple of (float, float)
            absolute performance of the strategy (final multiple of the initial
            capital) and its out-/underperformance vs. buy and hold, both
            rounded to 6 decimals.

        Raises
        ------
        ValueError
            if train_ratio is not strictly between 0 and 1, if lags < 1, or if
            there is too little data to form a training set or a test set.
        '''
        if not 0 < train_ratio < 1:
            raise ValueError("train_ratio must be strictly between 0 and 1.")
        if lags < 1:
            raise ValueError("lags must be a positive integer.")

        index = self.data.index
        split_index = int(len(index) * train_ratio)
        if split_index < 1:
            # index[split_index - 1] would wrap around to the LAST row and the
            # model would silently be trained on the complete data set
            raise ValueError("not enough data for the requested train_ratio.")

        self.lags = lags

        # determining datetime for start, end and split (for training and testing period)
        split_date = index[split_index - 1]
        train_start = index[0]
        test_end = index[-1]

        # fit the model on the training set
        self.fit_model(train_start, split_date)

        # prepare the test set; it starts at split_date so that lags are available,
        # the rows without a full lag history are dropped again by dropna()
        self.prepare_features(split_date, test_end)
        if self.data_subset.empty:
            raise ValueError("test set is empty after creating the lagged features.")
        results = self.data_subset

        # make predictions on the test set (predicted sign == position)
        results["pred"] = self.model.predict(results[self.feature_columns])

        # calculate Strategy Returns
        results["strategy"] = results["pred"] * results["returns"]

        # determine the number of trades in each bar
        # (the first bar has no predecessor -> counted as 0 trades)
        results["trades"] = results["pred"].diff().fillna(0).abs()

        # subtract transaction/trading costs from pre-cost return
        results["strategy"] = results["strategy"] - results["trades"] * self.tc

        # calculate cumulative returns for strategy & buy and hold
        results["creturns"] = np.exp(results["returns"].cumsum())
        results["cstrategy"] = np.exp(results["strategy"].cumsum())
        self.results = results

        perf = self.results["cstrategy"].iloc[-1] # absolute performance of the strategy
        outperf = perf - self.results["creturns"].iloc[-1] # out-/underperformance of strategy

        return round(perf, 6), round(outperf, 6)

    def plot_results(self):
        ''' Plots the performance of the trading strategy and compares to "buy and hold".
        Only prints a hint if test_strategy() has not been run yet.
        '''
        if self.results is None:
            print("Run test_strategy() first.")
        else:
            title = "Logistic Regression: {} | TC = {}".format(self.symbol, self.tc)
            self.results[["creturns", "cstrategy"]].plot(title=title, figsize=(12, 8))