Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

关于alpha158提取问题修复 #6

Open
huakyouin opened this issue Nov 10, 2024 · 1 comment
Open

关于alpha158提取问题修复 #6

huakyouin opened this issue Nov 10, 2024 · 1 comment

Comments

@huakyouin
Copy link

huakyouin commented Nov 10, 2024

我发现example中alpha158转换结果跟qlib直接生成的有以下不一致:

  • 缺失slope, r2, resd, cord, imax, imin, imxd特征
  • 部分特征计算结果与qlib不一致

我通过下面代码进行了修复,残留问题包括:

  • max、min特征与qlib结果仍然不一致,但已经使用了polars的原始函数了
  • resd、slope自己的实现与qlib差了差不多3倍,不清楚qlib什么样的处理导致的(影响较小)
## 特征转换
import numpy as np
import polars as pl
from polars_ta.prefix.tdx import *
from polars_ta.prefix.wq import *

# Load the input table from CSV.
df = pl.read_csv("../data/cleaned/csi300_stock_feats.csv")

# Shorthand polars column expressions used by the feature formulas below.
OPEN, HIGH, LOW, CLOSE, VOLUME, AMOUNT, VWAP = map(
    pl.col,
    ('open', 'high', 'low', 'close', 'volume', 'amount', 'vwap'),
)

def fast_linregress(x, y):
    """Fit ``y ~ slope * x + intercept`` by ordinary least squares.

    Args:
        x: 1-D sequence of regressor values (list or NumPy array).
        y: 1-D sequence of observations, same length as ``x``.

    Returns:
        Tuple ``(slope, intercept, r2, resd)`` where ``r2`` is the coefficient
        of determination and ``resd`` is the residual of the LAST observation
        (``y[-1]`` minus its fitted value).  All four are NaN for empty input;
        ``slope`` is NaN when ``x`` has zero variance (e.g. a single point).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size == 0 or y.size == 0:
        nan = float("nan")
        return nan, nan, nan, nan

    x_mean = np.mean(x)
    y_mean = np.mean(y)
    x_dev = x - x_mean
    y_dev = y - y_mean
    slope = np.dot(x_dev, y_dev) / np.dot(x_dev, x_dev)
    intercept = y_mean - slope * x_mean
    y_pred = slope * x + intercept
    # The epsilon keeps r2 finite when y is constant.
    ss_total = np.dot(y_dev, y_dev) + 1e-12
    ss_residual = np.sum((y - y_pred) ** 2)
    r2 = 1 - (ss_residual / ss_total)
    # The residuals of an OLS fit with an intercept always sum to ~0, so their
    # sum carries no information; report the latest point's residual instead,
    # which is what a rolling "regression residual" feature needs.
    resd = y[-1] - y_pred[-1]
    return slope, intercept, r2, resd

def func_ts_date(df: pl.DataFrame) -> pl.DataFrame:
    """Add Alpha158-style features to the bars of a single instrument.

    The frame is sorted by ``datetime``; candlestick (K-bar) ratios are added
    first, then rolling features for the windows 5, 10, 20, 30 and 60, and
    finally rolling linear-regression features (BETA / RSQR / RESI) of the
    close price.

    Args:
        df: Frame holding one instrument, with at least the columns
            ``instrument``, ``datetime``, ``open``, ``high``, ``low``,
            ``close``, ``volume`` and ``vwap``.

    Returns:
        The sorted frame with the feature columns appended.
    """
    # Progress output: which instrument is being processed.
    print(df['instrument'][0])
    df = df.sort(by=['datetime'])
    df = df.with_columns([
        ((CLOSE - OPEN) / OPEN).alias('KMID'),
        ((HIGH - LOW) / OPEN).alias("KLEN"),
        ((CLOSE - OPEN) / (HIGH - LOW + 1e-12)).alias("KMID2"),
        ((HIGH - max_(OPEN, CLOSE)) / OPEN).alias("KUP"),
        ((HIGH - max_(OPEN, CLOSE)) / (HIGH - LOW + 1e-12)).alias("KUP2"),
        ((min_(OPEN, CLOSE) - LOW) / OPEN).alias("KLOW"),
        ((min_(OPEN, CLOSE) - LOW) / (HIGH - LOW + 1e-12)).alias("KLOW2"),
        ((2 * CLOSE - HIGH - LOW) / OPEN).alias("KSFT"),
        ((2 * CLOSE - HIGH - LOW) / (HIGH - LOW + 1e-12)).alias("KSFT2"),
        *[(ts_delay(OPEN, i) / CLOSE).alias(f'OPEN{i}') for i in [0]],
        *[(ts_delay(HIGH, i) / CLOSE).alias(f'HIGH{i}') for i in [0]],
        *[(ts_delay(LOW, i) / CLOSE).alias(f'LOW{i}') for i in [0]],
        *[(ts_delay(VWAP, i) / CLOSE).alias(f'VWAP{i}') for i in [0]],
    ])

    # The close column is never modified below, so materialise it once for
    # the rolling regressions instead of slicing the Series per window.
    close_values = df["close"].to_numpy()
    row_n = len(df)

    for i in [5, 10, 20, 30, 60]:
        df = df.with_columns([
            (ts_delay(CLOSE, i) / CLOSE).alias(f'ROC{i}'),
            (ts_mean(CLOSE, i) / CLOSE).alias(f'MA{i}'),
            (CLOSE.rolling_std(i) / CLOSE).alias(f'STD{i}'),
            # qlib's Alpha158 defines MAX as Max($high, d)/$close and MIN as
            # Min($low, d)/$close, so roll over HIGH / LOW rather than CLOSE.
            (HIGH.rolling_max(i) / CLOSE).alias(f'MAX{i}'),
            (LOW.rolling_min(i) / CLOSE).alias(f'MIN{i}'),
            (CLOSE.rolling_quantile(0.8, interpolation='linear', window_size=i) / CLOSE).alias(f'QTLU{i}'),
            (CLOSE.rolling_quantile(0.2, interpolation='linear', window_size=i) / CLOSE).alias(f'QTLD{i}'),
            # NOTE(review): the scaling conventions of ts_rank / ts_RSV have
            # not been verified against qlib's Rank / RSV - confirm.
            (ts_rank(CLOSE, i)).alias(f'RANK{i}'),
            (ts_RSV(HIGH, LOW, CLOSE, i)).alias(f'RSV{i}'),
            (1 - ts_arg_max(HIGH, i) / i).alias(f'IMAX{i}'),
            (1 - ts_arg_min(LOW, i) / i).alias(f'IMIN{i}'),
            (ts_corr(CLOSE, log1p(VOLUME), i)).alias(f'CORR{i}'),
            (ts_corr(CLOSE / ts_delay(CLOSE, 1), log1p(VOLUME / ts_delay(VOLUME, 1)), i)).alias(f'CORD{i}'),
            (ts_mean(CLOSE > ts_delay(CLOSE, 1), i)).alias(f'CNTP{i}'),
            (ts_mean(CLOSE < ts_delay(CLOSE, 1), i)).alias(f'CNTN{i}'),
            (ts_sum(max_(CLOSE - ts_delay(CLOSE, 1), 0), i) / (ts_sum(abs_(CLOSE - ts_delay(CLOSE, 1)), i) + 1e-12)).alias(f'SUMP{i}'),
            (ts_sum(max_(ts_delay(CLOSE, 1) - CLOSE, 0), i) / (ts_sum(abs_(CLOSE - ts_delay(CLOSE, 1)), i) + 1e-12)).alias(f'SUMN{i}'),
            (ts_mean(VOLUME, i) / (VOLUME + 1e-12)).alias(f'VMA{i}'),
            (VOLUME.rolling_std(i) / (VOLUME + 1e-12)).alias(f'VSTD{i}'),
            ((abs_(ts_returns(CLOSE, 1)) * VOLUME).rolling_std(i) / (ts_mean(abs_(ts_returns(CLOSE, 1)) * VOLUME, i) + 1e-12)).alias(f'WVMA{i}'),
            (ts_sum(max_(VOLUME - ts_delay(VOLUME, 1), 0), i) / (ts_sum(abs_(VOLUME - ts_delay(VOLUME, 1)), i) + 1e-12)).alias(f'VSUMP{i}'),
            (ts_sum(max_(ts_delay(VOLUME, 1) - VOLUME, 0), i) / (ts_sum(abs_(VOLUME - ts_delay(VOLUME, 1)), i) + 1e-12)).alias(f'VSUMN{i}')
        ])
        # Difference features built from the columns added just above.
        df = df.with_columns([
            (pl.col(f"IMAX{i}") - pl.col(f"IMIN{i}")).alias(f"IMXD{i}"),
            (pl.col(f"CNTP{i}") - pl.col(f"CNTN{i}")).alias(f'CNTD{i}'),
            (pl.col(f"SUMP{i}") - pl.col(f"SUMN{i}")).alias(f'SUMD{i}'),
            (pl.col(f"VSUMP{i}") - pl.col(f"VSUMN{i}")).alias(f'VSUMD{i}'),
        ])

        # Rolling regression of close on 0..i-1.  The first i-1 rows have no
        # complete window and stay null.
        x_axis = np.arange(i)
        beta = [None] * (i - 1)
        rsqr = [None] * (i - 1)
        resi = [None] * (i - 1)
        for idx in range(row_n - i + 1):
            window = close_values[idx: idx + i]
            slope, intercept, r2 = fast_linregress(x=x_axis, y=window)[:3]
            last_close = window[-1]
            # Residual of the latest bar against the fitted line; the last
            # regressor value is i - 1.
            last_resid = last_close - (slope * (i - 1) + intercept)
            # qlib's Alpha158 normalises both Slope and Resi by $close.
            beta.append(float(slope / last_close))
            rsqr.append(float(r2))
            resi.append(float(last_resid / last_close))

        # The slice guards frames shorter than the padding; the explicit
        # dtype keeps the schema stable when a column is entirely null.
        df = df.with_columns([
            pl.Series(f'BETA{i}', beta[:row_n], dtype=pl.Float64),
            pl.Series(f'RSQR{i}', rsqr[:row_n], dtype=pl.Float64),
            pl.Series(f'RESI{i}', resi[:row_n], dtype=pl.Float64),
        ])
    return df

# To compute the features for every instrument instead of a single one:
# df = df.group_by('instrument').map_groups(func_ts_date)
# print(df)

# Compute the features for one instrument only.
df = func_ts_date(
    df.filter(pl.col('instrument') == 'SH600005')
)
@wukan1986
Copy link
Owner

(CLOSE.rolling_max(i) / CLOSE).alias(f'MAX{i}'),
(CLOSE.rolling_min(i) / CLOSE).alias(f'MIN{i}'),

这两句非常简单,结果不同会不会是停牌没有排除?或者没复权?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants