-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathregression_pd_预测一个月.py
149 lines (129 loc) · 6.5 KB
/
regression_pd_预测一个月.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding: utf-8 -*-
from __future__ import division ###小数除法
default_encoding="utf-8"
from getBaseData_pd import *
import statsmodels.api as sm
import logging
def calcOLS(data, y_var, x_vars):
    """Regress ``y_var`` on ``x_vars`` plus an intercept and return the fit summary.

    Written to be passed to ``DataFrame.groupby(...).apply`` so that each
    group contributes one row of coefficients.

    Args:
        data: DataFrame containing the column ``y_var`` and every column
            named in ``x_vars``.
        y_var: Name of the dependent-variable column.
        x_vars: List of regressor column names.

    Returns:
        A Series holding the fitted coefficients followed by ``rsquared``
        and ``rsquared_adj``, or ``None`` when the fit did not produce
        exactly ``len(x_vars) + 1`` coefficients.
    """
    Y = data[y_var]
    X = sm.add_constant(data[x_vars])  # add the intercept column
    # Rows with missing values are dropped by statsmodels; the covariance is
    # estimated with the HAC option using up to 3 lags.
    est = sm.OLS(Y, X, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags': 3})
    # A fit with fewer coefficients than regressors + intercept yields a
    # differently shaped result, so the per-group results could no longer be
    # combined into one DataFrame. Such groups are filtered out.
    if len(est.params) != len(x_vars) + 1:
        return None
    fit_stats = pd.Series({'rsquared': est.rsquared, 'rsquared_adj': est.rsquared_adj})
    # Series.append was removed in pandas 2.0; pd.concat gives the same result
    # and also works on older pandas versions.
    return pd.concat([est.params, fit_stats])
def calcMonthIV():
    """Compute ``month_iv`` per bond and month and write it to a CSV file.

    Steps:
      1. Load the data from ``getMonthIVData()`` and build the dependent
         variable ``dt_drft = log_diff_return - log_free_risk_return``.
      2. Run ``calcOLS`` once per ``bond_code`` on the five factor columns.
      3. Merge the coefficients back onto the rows and compute the squared
         difference between ``dt_drft`` and the factor-based fitted value.
      4. Group by ``bond_code`` and month (first six characters of ``day``)
         and take the root of the mean squared difference as ``month_iv``.
      5. Overwrite the file at ``file_path_dict[u"month_iv"]`` with the result.

    Returns:
        None. The result is written to disk and its first row is printed.
    """
    MonthIVData = getMonthIVData()
    MonthIVData["dt_drft"] = MonthIVData["log_diff_return"] - MonthIVData["log_free_risk_return"]
    y_var = "dt_drft"
    x_vars = ["riskpremium1", "smb1", "hml1", "raw1", "cma1"]

    coef_by_bond = MonthIVData.groupby("bond_code").apply(calcOLS, y_var, x_vars)
    coef_by_bond = coef_by_bond.reset_index()
    # Factor columns exist on both sides of the merge, so the observations get
    # the "_data" suffix and the fitted coefficients the "_coef" suffix.
    data = pd.merge(MonthIVData, coef_by_bond, on="bond_code", how="inner",
                    suffixes=('_data', '_coef'))

    # Build the fitted value from x_vars so every regressor is used exactly
    # once. The previous hand-written sum counted raw1 twice and left cma1 out.
    # NOTE(review): the intercept estimated by calcOLS is not part of the
    # fitted value here - confirm whether that is intended.
    fitted = sum(data[name + "_data"] * data[name + "_coef"] for name in x_vars)
    data["error"] = (data[y_var] - fitted) ** 2  # squared residual per row

    data["month"] = data["day"].apply(lambda item: item[:6])  # month prefix of the day value
    group_data = data.groupby(["bond_code", "month"])[["error"]]
    count_group_data = group_data.count()
    sum_group_data = group_data.sum()
    # Root of the mean squared residual within each (bond_code, month) group.
    sum_group_data["month_iv"] = (sum_group_data["error"] / count_group_data["error"]) ** 0.5
    sum_group_data = sum_group_data.reset_index()
    print(sum_group_data.head(1))

    file_save_path = file_path_dict[u"month_iv"]
    if os.access(file_save_path, os.F_OK):
        os.remove(file_save_path)  # remove a stale file before writing
    sum_group_data.to_csv(file_save_path, mode='a', index=False)
def calcFirstRegression(y_var, x_vars):
    """Run the per-bond first-stage regression and save the coefficients.

    The ``r_rft`` column is moved one row earlier within each bond so that
    each row's regressors are paired with the following row's ``r_rft``.
    ``calcOLS`` is then applied per ``bond_code`` and the resulting table is
    written to ``file_path_dict["first_regression_coef"]``.

    Args:
        y_var: Name of the dependent-variable column passed to ``calcOLS``.
        x_vars: List of regressor column names passed to ``calcOLS``.

    Returns:
        None. The coefficient table is written to disk.
    """
    first_regression_data = getFirstRegressionData()
    # Shift inside each bond. A plain shift(-1) over the whole frame would give
    # the last row of one bond the first r_rft value of the next bond.
    # NOTE(review): assumes rows are in time order within each bond_code - confirm.
    first_regression_data["r_rft"] = (
        first_regression_data.groupby("bond_code")["r_rft"].shift(-1)
    )
    # Only rows that are entirely empty are removed here; rows with a missing
    # r_rft are dropped later by calcOLS (missing='drop').
    first_regression_data = first_regression_data.dropna(axis=0, how='all')

    result_data = first_regression_data.groupby("bond_code").apply(calcOLS, y_var, x_vars)
    result_data = result_data.reset_index()

    save_file_path = file_path_dict["first_regression_coef"]
    if os.access(save_file_path, os.F_OK):
        os.remove(save_file_path)  # remove a stale file before writing
    result_data.to_csv(save_file_path, mode='a', index=False)
def calcSecondRegression(y_var, x_vars, output_file_coef, output_file_result):
    """Run the per-month second-stage regression and summarise the coefficients.

    ``calcOLS`` is applied to every ``month`` group of the data returned by
    ``getSecondRegressionData()``. The per-month coefficients are written to
    ``output_file_coef`` as CSV. Their mean and a t-statistic of that mean
    are logged, printed and written to ``output_file_result`` as an Excel
    file, and the mean ``rsquared`` / ``rsquared_adj`` are logged and printed.

    Args:
        y_var: Name of the dependent-variable column passed to ``calcOLS``.
        x_vars: List of regressor column names passed to ``calcOLS``.
        output_file_coef: Path of the CSV file for the per-month coefficients.
        output_file_result: Path of the Excel file for the summary table.

    Returns:
        None.
    """
    # Look the logger up here instead of relying on a module global that only
    # exists when this file is executed as a script.
    log = logging.getLogger(__name__)

    second_regression_data = getSecondRegressionData()
    result_data = second_regression_data.groupby("month").apply(calcOLS, y_var, x_vars)
    result_data = result_data.reset_index()

    save_file_path = output_file_coef
    if os.access(save_file_path, os.F_OK):
        os.remove(save_file_path)  # remove a stale file before writing
    result_data.to_csv(save_file_path, mode='a', index=False)

    # t-test of the mean coefficient across months. numeric_only keeps a
    # non-numeric group key from breaking the reductions.
    mean_result_data = result_data.mean(numeric_only=True)
    std_result_data = result_data.std(numeric_only=True)
    count_result_data = result_data.count().reindex(mean_result_data.index)
    # DataFrame.std uses ddof=1 by default, so the standard error of the mean
    # is std / sqrt(n); dividing by sqrt(n - 1) would correct twice.
    t_value = mean_result_data / (std_result_data / count_result_data ** 0.5)
    mean_coef = mean_result_data

    result = pd.DataFrame([t_value, mean_coef]).T
    log.info("########################################################################################")
    log.info("t_value and mean_coef")
    result.columns = ['t_value', 'mean_coef']
    log.info(result)
    print(result)
    log.info("########################################################################################")
    res = mean_result_data[['rsquared', 'rsquared_adj']]
    log.info("rsquared and rsquared_adj")
    log.info(res)
    print(res)
    result.to_excel(output_file_result)
if __name__ == "__main__":
    # Script entry point: run the first regression, build the second-stage
    # data, then run the second regression, logging to 'nine.log'.
    y_var = "r_rft"
    # Active regressors are uncommented; the remaining names are candidates
    # that are currently switched off.
    x_vars = [
        # "size_rjv",
        "mean_rjv",
        "arr_rjv",
        "std_rjv",
        # "riskpremium1",
        # "smb1",
        # "hml1",
        # "rmv1",
        # "cma1",
        "iv",
        # "size_rjv_square",
        # "mean_rjv_square",
        # "arr_rjv_square",
        # "std_rjv_square",
        # "iv_square",
        # "iv_size_rjv",
        # "iv_mean_rjv",
        # "iv_arr_rjv",
        # "iv_std_rjv",
        # "huanshoulv",
        # "shiyinglv",
        # "shizhi",
        # "ppi",
        # "cpi",
        # "xinzenggudingtouzi",
        # "gudingtouzi",
        # "m2",
        # "pd_1",
    ]

    # Module-level logger writing INFO records to a file.
    logger = logging.getLogger(__name__)
    logger.setLevel(level=logging.INFO)
    handler = logging.FileHandler('nine.log')
    logger.addHandler(handler)

    # Output locations for the second regression.
    second_regression_coef_path = "d:/pandas/result/test.txt"
    second_regression_result = "d:/pandas/result/跳跃不含iv_预测一个月_coef.xls"

    separator = "########################################################################################"
    logger.info(separator)
    logger.info("predict one month")

    calcFirstRegression(y_var, x_vars)
    # Build the data needed by the second step from the first-step coefficients.
    calcSecondRegressionData()
    calcSecondRegression(
        y_var,
        x_vars,
        second_regression_coef_path,
        second_regression_result,
    )

    # Two closing separator lines, as in the run's opening section.
    for _ in range(2):
        logger.info(separator)