From 18ecc69aefe2e49cc784443833ee9f6ce6331259 Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 7 Apr 2022 14:12:47 +0800 Subject: [PATCH 1/2] Add Qlib notebook tutorial --- README.md | 6 +- examples/tutorial/detailed_workflow.ipynb | 1225 +++++++++++++++++++ examples/workflow_by_code.ipynb | 1 - qlib/contrib/model/__init__.py | 10 +- qlib/data/dataset/__init__.py | 3 + qlib/workflow/cli.py | 5 + scripts/data_collector/pit/requirements.txt | 3 +- 7 files changed, 1246 insertions(+), 7 deletions(-) create mode 100644 examples/tutorial/detailed_workflow.ipynb diff --git a/README.md b/README.md index fde015723a..9ae767771a 100644 --- a/README.md +++ b/README.md @@ -386,6 +386,8 @@ Dataset plays a very important role in Quant. Here is a list of the datasets bui Your PR to build new Quant dataset is highly welcomed. # More About Qlib +If you want to have a quick glance at the most frequently used components of qlib, you can try notebooks [here](examples/tutorial/). + The detailed documents are organized in [docs](docs/). [Sphinx](http://www.sphinx-doc.org) and the readthedocs theme is required to build the documentation in html formats. ```bash @@ -471,8 +473,10 @@ If you don't know how to start to contribute, you can refer to the following exa | Models | [Implement a new model](https://github.com/microsoft/qlib/pull/689) | [Good first issues](https://github.com/microsoft/qlib/labels/good%20first%20issue) are labelled to indicate that they are easy to start your contributions. + +You can find some impefect implementation in Qlib by `rg 'TODO|FIXME' qlib` -If you would like to become one of Qlib's maintainers to contribute more (e.g. help merge PR, triage issues), please contact us by email([qlib@microsoft.com](mailto:qlib@microsoft.com)). We are glad to help you to set the right permission. +If you would like to become one of Qlib's maintainers to contribute more (e.g. help merge PR, triage issues), please contact us by email([qlib@microsoft.com](mailto:qlib@microsoft.com)). We are glad to help to upgrade your permission. ## Licence Most contributions require you to agree to a diff --git a/examples/tutorial/detailed_workflow.ipynb b/examples/tutorial/detailed_workflow.ipynb new file mode 100644 index 0000000000..f6bc58ac64 --- /dev/null +++ b/examples/tutorial/detailed_workflow.ipynb @@ -0,0 +1,1225 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "2ac18d8f", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (c) Microsoft Corporation.\n", + "# Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "id": "096c6260", + "metadata": {}, + "source": [ + "# Introduction\n", + "Though users can automatically run the whole Quant research worklfow based on configurations with Qlib.\n", + "\n", + "Some advanced users usally would like to carefully customize each component to explore more in Quant.\n", + "\n", + "If you just want a simple example of Qlib. [Quick start](https://github.com/microsoft/qlib#quick-start) and [workflow_by_code](https://github.com/microsoft/qlib/blob/main/examples/workflow_by_code.ipynb) may be a better choice for you.\n", + "\n", + "If you want to know more details about Quant research, this notebook may be a better place for you to start.\n", + "\n", + "We hope this script could be a tutorial for users who are interested in the details of Quant.\n", + "\n", + "This notebook tries to demonstrate how can we use Qlib to build components step by step. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b96a4196", + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "from pathlib import Path\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e707694", + "metadata": {}, + "outputs": [], + "source": [ + "MARKET = \"csi300\"\n", + "BENCHMARK = \"SH000300\"\n", + "EXP_NAME = \"tutorial_exp\"" + ] + }, + { + "cell_type": "markdown", + "id": "ff16a42b", + "metadata": {}, + "source": [ + "# Data" + ] + }, + { + "cell_type": "markdown", + "id": "df055d7d", + "metadata": {}, + "source": [ + "## Get data" + ] + }, + { + "cell_type": "markdown", + "id": "e9898c23", + "metadata": {}, + "source": [ + "Users can follow [the steps](https://github.com/microsoft/qlib/tree/main/scripts#download-qlib-data) to download data with CLI.\n", + "\n", + "In this example we use the underlying API to automatically download data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0bcfa97", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.tests.data import GetData\n", + "GetData().qlib_data(exists_skip=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42b89646", + "metadata": {}, + "outputs": [], + "source": [ + "import qlib\n", + "qlib.init()" + ] + }, + { + "cell_type": "markdown", + "id": "90080d29", + "metadata": {}, + "source": [ + "## Inspect raw data" + ] + }, + { + "cell_type": "markdown", + "id": "413e99b8", + "metadata": {}, + "source": [ + "Currently, Qlib support several kinds of data source." + ] + }, + { + "cell_type": "markdown", + "id": "ecf241e9", + "metadata": {}, + "source": [ + "### Calendar" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c386f38", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.data import D\n", + "D.calendar(start_time='2010-01-01', end_time='2017-12-31', freq='day')[:2] # calendar data" + ] + }, + { + "cell_type": "markdown", + "id": "f436b49d", + "metadata": {}, + "source": [ + "### Basic data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a889b763", + "metadata": {}, + "outputs": [], + "source": [ + "df = D.features(['SH601216'], ['$open', '$high', '$low', '$close', '$factor'], start_time='2020-05-01', end_time='2020-05-31') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eceb43c8", + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.graph_objects as go\n", + "fig = go.Figure(data=[go.Candlestick(x=df.index.get_level_values(\"datetime\"),\n", + " open=df['$open'],\n", + " high=df['$high'],\n", + " low=df['$low'],\n", + " close=df['$close'])])\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "768ef188", + "metadata": {}, + "source": [ + "### price adjustment" + ] + }, + { + "cell_type": "markdown", + "id": "d9a536a4", + "metadata": {}, + "source": [ + "Maybe you think the price is not what it looks like in real world.\n", + "\n", + "Due to the price adjustment, the price will be different from the real trading data ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45df33b5", + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.graph_objects as go\n", + "fig = go.Figure(data=[go.Candlestick(x=df.index.get_level_values(\"datetime\"),\n", + " open=df['$open'] / df['$factor'],\n", + " high=df['$high'] / df['$factor'],\n", + " low=df['$low'] / df['$factor'],\n", + " close=df['$close'] / df['$factor'])])\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "d6acffc3", + "metadata": {}, + "source": [ + "Please notice the price gap on [2020-05-26](http://vip.stock.finance.sina.com.cn/corp/view/vISSUE_ShareBonusDetail.php?stockid=601216&type=1&end_date=2020-05-20)\n", + "\n", + "If we want to represent the change of assets value by price, adjust prices are necesary.\n", + "By default, Qlib stores the adjusted prices." + ] + }, + { + "cell_type": "markdown", + "id": "af5f063e", + "metadata": {}, + "source": [ + "### Static universe V.S. dynamic universe" + ] + }, + { + "cell_type": "markdown", + "id": "8b9b8ce5", + "metadata": {}, + "source": [ + "Dynamic universe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50d3ab70", + "metadata": {}, + "outputs": [], + "source": [ + "# dynamic universe\n", + "universe = D.list_instruments(D.instruments('csi100'), start_time='2010-01-01', end_time='2020-12-31')\n", + "pprint(universe)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6be08f23", + "metadata": {}, + "outputs": [], + "source": [ + "print(len(universe))" + ] + }, + { + "cell_type": "markdown", + "id": "28e7dd04", + "metadata": {}, + "source": [ + "Qlib use dynamic universe by default.\n", + "\n", + "csi100 has around 100 stocks each day(it is not that accurate due to the low precision of data)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad8b8503", + "metadata": {}, + "outputs": [], + "source": [ + "df = D.features(D.instruments('csi100'), ['$close'], start_time='2010-01-01', end_time='2020-12-31') \n", + "df.groupby('datetime').size().plot()" + ] + }, + { + "cell_type": "markdown", + "id": "5f37f2ef", + "metadata": {}, + "source": [ + "### Point-In-Time data" + ] + }, + { + "cell_type": "markdown", + "id": "5dfa9e9d", + "metadata": {}, + "source": [ + "#### download data\n", + "NOTE: To run the test faster, we only download the data of two stocks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da0a9564", + "metadata": {}, + "outputs": [], + "source": [ + "p = Path(\"~/.qlib/qlib_data/cn_data/financial\").expanduser()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4657fe13", + "metadata": {}, + "outputs": [], + "source": [ + "if not p.exists():\n", + " !cd ../../scripts/data_collector/pit/ && pip install -r requirements.txt\n", + " !cd ../../scripts/data_collector/pit/ && python collector.py download_data --source_dir ~/.qlib/stock_data/source/pit --start 2000-01-01 --end 2020-01-01 --interval quarterly --symbol_regex \"^(600519|000725).*\"\n", + " !cd ../../scripts/data_collector/pit/ && python collector.py normalize_data --interval quarterly --source_dir ~/.qlib/stock_data/source/pit --normalize_dir ~/.qlib/stock_data/source/pit_normalized\n", + " !cd ../../scripts/ && python dump_pit.py dump --csv_path ~/.qlib/stock_data/source/pit_normalized --qlib_dir ~/.qlib/qlib_data/cn_data --interval quarterly\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "9358cb89", + "metadata": {}, + "source": [ + "#### querying data\n", + "using `roewa(performanceExpressROEWa,业绩快报净资产收益率ROE-加权)` as an example\n", + "\n", + "If we want to get fundamental data `in the most recent quarter` daily, we can use following example.\n", + "\n", + "Maitai release part of its fundamental data on [2019-07-13](http://www.cninfo.com.cn/new/disclosure/detail?stockCode=600519&announcementId=1206443183&orgId=gssh0600519&announcementTime=2019-07-13) and release others on [2019-07-18](http://www.cninfo.com.cn/new/disclosure/detail?stockCode=600519&announcementId=1206456129&orgId=gssh0600519&announcementTime=2019-07-18)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47ee1621", + "metadata": {}, + "outputs": [], + "source": [ + "instruments = [\"sh600519\"]\n", + "data = D.features(instruments, ['P($$roewa_q)'], start_time=\"2019-01-01\", end_time=\"2019-07-19\", freq=\"day\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "752f4ffe", + "metadata": {}, + "outputs": [], + "source": [ + "data.tail(15)" + ] + }, + { + "cell_type": "markdown", + "id": "0e370d2d", + "metadata": {}, + "source": [ + "### experss engine\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a46d166c", + "metadata": {}, + "outputs": [], + "source": [ + "D.features([\"sh600519\"], ['(EMA($close, 12) - EMA($close, 26))/$close - EMA((EMA($close, 12) - EMA($close, 26))/$close, 9)/$close'])" + ] + }, + { + "cell_type": "markdown", + "id": "5ddcd1ea", + "metadata": {}, + "source": [ + "\n", + "## Dataset loading and preprocessing " + ] + }, + { + "cell_type": "markdown", + "id": "f54c6804", + "metadata": {}, + "source": [ + "Some heuristic principles of create features\n", + "- make the features comparable between instrumets: remove unit from the features.\n", + "- try to keep the distribution invariant\n", + "- keep the scale of features similar" + ] + }, + { + "cell_type": "markdown", + "id": "93013dcd", + "metadata": {}, + "source": [ + "### data loader\n", + "\n", + "It's interface can be found [here](https://github.com/microsoft/qlib/blob/main/qlib/data/dataset/loader.py#L24) \n", + "\n", + "QlibDataLoader is an implementation which load data from Qlib's data source" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcfa44a6", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.data.dataset.loader import QlibDataLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d78b4bf", + "metadata": {}, + "outputs": [], + "source": [ + "qdl = QlibDataLoader(config=(['$close / Ref($close, 10)'], ['RET10']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fb29d8e", + "metadata": {}, + "outputs": [], + "source": [ + "qdl.load(instruments=['sh600519'], start_time='20190101', end_time='20191231')" + ] + }, + { + "cell_type": "markdown", + "id": "0dded8f5", + "metadata": {}, + "source": [ + "### data handler" + ] + }, + { + "cell_type": "markdown", + "id": "78d6d2b0", + "metadata": {}, + "source": [ + "finance data can't be perfect.\n", + "\n", + "We have to process them before feeding them into Models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c078fa3b", + "metadata": {}, + "outputs": [], + "source": [ + "df = qdl.load(instruments=['sh600519'], start_time='20190101', end_time='20191231')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45e5adf9", + "metadata": {}, + "outputs": [], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "514b85e6", + "metadata": {}, + "outputs": [], + "source": [ + "df.plot(kind='hist')" + ] + }, + { + "cell_type": "markdown", + "id": "db1625f7", + "metadata": {}, + "source": [ + "Datahander is responsible for data preprocessing and provides data fetching interface \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43b35c17", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.data.dataset.handler import DataHandlerLP\n", + "from qlib.data.dataset.processor import ZScoreNorm, Fillna" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38a5f4b2", + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: normally, the training & validation time range will be `fit_start_time` , `fit_end_time`\n", + "# however,all the components are decomposed, so the training & validation time range is unknown when preprocessing.\n", + "dh = DataHandlerLP(instruments=['sh600519'], start_time='20170101', end_time='20191231',\n", + " infer_processors=[ZScoreNorm(fit_start_time='20170101', fit_end_time='20181231'), Fillna()],\n", + " data_loader=qdl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9469fd1e", + "metadata": {}, + "outputs": [], + "source": [ + "df = dh.fetch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc35b3c0", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dd8e11b", + "metadata": {}, + "outputs": [], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7208efc3", + "metadata": {}, + "outputs": [], + "source": [ + "df.plot(kind='hist')" + ] + }, + { + "cell_type": "markdown", + "id": "1e0fb32c", + "metadata": {}, + "source": [ + "### dataset" + ] + }, + { + "cell_type": "markdown", + "id": "0302a801", + "metadata": {}, + "source": [ + "#### basic dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96ef76e4", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.data.dataset import DatasetH, TSDatasetH" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12fd8296", + "metadata": {}, + "outputs": [], + "source": [ + "ds = DatasetH(dh, segments={\"train\": ('20180101', '20181231'), \"valid\": ('20190101', '20191231')})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cc4c199", + "metadata": {}, + "outputs": [], + "source": [ + "ds.prepare('train')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4639b6a7", + "metadata": {}, + "outputs": [], + "source": [ + "ds.prepare('valid')" + ] + }, + { + "cell_type": "markdown", + "id": "a56001e4", + "metadata": {}, + "source": [ + "#### Time Series Dataset\n", + "\n", + "For different model, the required dataset format will be different.\n", + "\n", + "For example, Qlib provides a Time Series Dataset(TSDatasetH) to help users to create time-series dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "425135e1", + "metadata": {}, + "outputs": [], + "source": [ + "ds = TSDatasetH(step_len=10, handler=dh, segments={\"train\": ('20180101', '20181231'), \"valid\": ('20190101', '20191231')})\n", + "train_sampler = ds.prepare('train')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f724041", + "metadata": {}, + "outputs": [], + "source": [ + "train_sampler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5aa762c", + "metadata": {}, + "outputs": [], + "source": [ + "train_sampler[0] # Retrieving the first example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb64112c", + "metadata": {}, + "outputs": [], + "source": [ + "train_sampler['2018-01-08', 'sh600519'] # get the time series by <'timestamp', 'instrument_id'> index" + ] + }, + { + "cell_type": "markdown", + "id": "4e6ab197", + "metadata": {}, + "source": [ + "### Off-the-shelf dataset\n", + "\n", + "Qlib integrated some dataset alreadly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e21b45c", + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "handler_kwargs = {\n", + " \"start_time\": \"2008-01-01\",\n", + " \"end_time\": \"2020-08-01\",\n", + " \"fit_start_time\": \"2008-01-01\",\n", + " \"fit_end_time\": \"2014-12-31\",\n", + " \"instruments\": MARKET,\n", + "}\n", + "handler_conf = {\n", + " \"class\": \"Alpha158\",\n", + " \"module_path\": \"qlib.contrib.data.handler\",\n", + " \"kwargs\": handler_kwargs,\n", + "}\n", + "pprint(handler_conf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35d9d248", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(handler_conf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17077f0c", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.utils import init_instance_by_config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a20d9b2", + "metadata": {}, + "outputs": [], + "source": [ + "hd = init_instance_by_config(handler_conf)" + ] + }, + { + "cell_type": "markdown", + "id": "5aa02379", + "metadata": {}, + "source": [ + "Using config to create instance is a highly frequently used practice in Qlib (e.g. the [workflows configurations](https://github.com/microsoft/qlib/blob/main/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml) are based on it).\n", + "\n", + "\n", + "The above configuration is the same as the code below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "480d35bf", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.contrib.data.handler import Alpha158\n", + "hd = Alpha158(**handler_kwargs)" + ] + }, + { + "cell_type": "markdown", + "id": "b170153d", + "metadata": {}, + "source": [ + "This dataset has the same structure as the simple one with 1 column we created just now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "735758e5", + "metadata": {}, + "outputs": [], + "source": [ + "df = hd.fetch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b6de50c", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb1bc4fb", + "metadata": {}, + "outputs": [], + "source": [ + "hd.data_loader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a927f1c", + "metadata": {}, + "outputs": [], + "source": [ + "hd.data_loader.fields" + ] + }, + { + "cell_type": "markdown", + "id": "f5a15747", + "metadata": {}, + "source": [ + "#### some details\n", + "\n", + "The training data may not be the same as the test data.\n", + "\n", + "e.g.\n", + "- the training dataset and test dataset use a different fitlering rules, data processing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf2defa0", + "metadata": {}, + "outputs": [], + "source": [ + "hd.learn_processors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef55a881", + "metadata": {}, + "outputs": [], + "source": [ + "hd.infer_processors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7af3b077", + "metadata": {}, + "outputs": [], + "source": [ + "hd.process_type # appending type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20127e19", + "metadata": {}, + "outputs": [], + "source": [ + "hd.fetch(col_set=\"label\", data_key=hd.DK_L)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09c37d41", + "metadata": {}, + "outputs": [], + "source": [ + "hd.fetch(col_set=\"label\", data_key=hd.DK_I)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec2cfb12", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_conf = {\n", + " \"class\": \"DatasetH\",\n", + " \"module_path\": \"qlib.data.dataset\",\n", + " \"kwargs\": {\n", + " \"handler\": hd,\n", + " \"segments\": {\n", + " \"train\": (\"2008-01-01\", \"2014-12-31\"),\n", + " \"valid\": (\"2015-01-01\", \"2016-12-31\"),\n", + " \"test\": (\"2017-01-01\", \"2020-08-01\"),\n", + " },\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aca33c3c", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = init_instance_by_config(dataset_conf)" + ] + }, + { + "cell_type": "markdown", + "id": "0c89c15d", + "metadata": {}, + "source": [ + "# Model Training & Inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e916286", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.workflow import R\n", + "from qlib.workflow.record_temp import SignalRecord, PortAnaRecord, SigAnaRecord" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6975911", + "metadata": {}, + "outputs": [], + "source": [ + "model = init_instance_by_config({\n", + " \"class\": \"LGBModel\",\n", + " \"module_path\": \"qlib.contrib.model.gbdt\",\n", + " \"kwargs\": {\n", + " \"loss\": \"mse\",\n", + " \"colsample_bytree\": 0.8879,\n", + " \"learning_rate\": 0.0421,\n", + " \"subsample\": 0.8789,\n", + " \"lambda_l1\": 205.6999,\n", + " \"lambda_l2\": 580.9768,\n", + " \"max_depth\": 8,\n", + " \"num_leaves\": 210,\n", + " \"num_threads\": 20,\n", + " },\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e2dafb6", + "metadata": {}, + "outputs": [], + "source": [ + "# start exp to train model\n", + "with R.start(experiment_name=EXP_NAME):\n", + " model.fit(dataset)\n", + " R.save_objects(trained_model=model)\n", + "\n", + " rec = R.get_recorder()\n", + " rid = rec.id # save the record id\n", + "\n", + " # Inference and saving signal\n", + " sr = SignalRecord(model, dataset, rec)\n", + " sr.generate()" + ] + }, + { + "cell_type": "markdown", + "id": "6b6b9f3d", + "metadata": {}, + "source": [ + "# Evaluation:\n", + "- Signal-based\n", + "- Portfolio-based: backtest " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4328f881", + "metadata": {}, + "outputs": [], + "source": [ + "###################################\n", + "# prediction, backtest & analysis\n", + "###################################\n", + "port_analysis_config = {\n", + " \"executor\": {\n", + " \"class\": \"SimulatorExecutor\",\n", + " \"module_path\": \"qlib.backtest.executor\",\n", + " \"kwargs\": {\n", + " \"time_per_step\": \"day\",\n", + " \"generate_portfolio_metrics\": True,\n", + " },\n", + " },\n", + " \"strategy\": {\n", + " \"class\": \"TopkDropoutStrategy\",\n", + " \"module_path\": \"qlib.contrib.strategy.signal_strategy\",\n", + " \"kwargs\": {\n", + " \"signal\": \"\",\n", + " \"topk\": 50,\n", + " \"n_drop\": 5,\n", + " },\n", + " },\n", + " \"backtest\": {\n", + " \"start_time\": \"2017-01-01\",\n", + " \"end_time\": \"2020-08-01\",\n", + " \"account\": 100000000,\n", + " \"benchmark\": BENCHMARK,\n", + " \"exchange_kwargs\": {\n", + " \"freq\": \"day\",\n", + " \"limit_threshold\": 0.095,\n", + " \"deal_price\": \"close\",\n", + " \"open_cost\": 0.0005,\n", + " \"close_cost\": 0.0015,\n", + " \"min_cost\": 5,\n", + " },\n", + " },\n", + "}\n", + "\n", + "# backtest and analysis\n", + "with R.start(experiment_name=EXP_NAME, recorder_id=rid, resume=True):\n", + "\n", + " # signal-based analysis\n", + " rec = R.get_recorder()\n", + " sar = SigAnaRecord(rec)\n", + " sar.generate()\n", + " \n", + " # portfolio-based analysis: backtest\n", + " par = PortAnaRecord(rec, port_analysis_config, \"day\")\n", + " par.generate()" + ] + }, + { + "cell_type": "markdown", + "id": "d66ad59d", + "metadata": {}, + "source": [ + "# Loading results & Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "3d7d7dea", + "metadata": {}, + "source": [ + "## loading data\n", + "Because Qlib leverage MLflow to save model & data.\n", + "All the data can be access by `mlflow ui`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ec9dbb6", + "metadata": {}, + "outputs": [], + "source": [ + "# load recorder\n", + "recorder = R.get_recorder(recorder_id=rid, experiment_name=EXP_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25e72b0d", + "metadata": {}, + "outputs": [], + "source": [ + "# load previous results\n", + "pred_df = recorder.load_object(\"pred.pkl\")\n", + "report_normal_df = recorder.load_object(\"portfolio_analysis/report_normal_1day.pkl\")\n", + "positions = recorder.load_object(\"portfolio_analysis/positions_normal_1day.pkl\")\n", + "analysis_df = recorder.load_object(\"portfolio_analysis/port_analysis_1day.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dce3696b", + "metadata": {}, + "outputs": [], + "source": [ + "# Previous Model can be loaded. but it is not used.\n", + "loaded_model = recorder.load_object(\"trained_model\")\n", + "loaded_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf8eca78", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.contrib.report import analysis_model, analysis_position" + ] + }, + { + "cell_type": "markdown", + "id": "8d34b347", + "metadata": {}, + "source": [ + "## analysis position" + ] + }, + { + "cell_type": "markdown", + "id": "1cae642f", + "metadata": {}, + "source": [ + "### report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47c727b2", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_position.report_graph(report_normal_df)" + ] + }, + { + "cell_type": "markdown", + "id": "15b7ca14", + "metadata": {}, + "source": [ + "### risk analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f100690", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_position.risk_analysis_graph(analysis_df, report_normal_df)" + ] + }, + { + "cell_type": "markdown", + "id": "2ed48aee", + "metadata": {}, + "source": [ + "## analysis model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aec13561", + "metadata": {}, + "outputs": [], + "source": [ + "label_df = dataset.prepare(\"test\", col_set=\"label\")\n", + "label_df.columns = ['label']" + ] + }, + { + "cell_type": "markdown", + "id": "56d57363", + "metadata": {}, + "source": [ + "### score IC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7612533d", + "metadata": {}, + "outputs": [], + "source": [ + "pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)\n", + "analysis_position.score_ic_graph(pred_label)" + ] + }, + { + "cell_type": "markdown", + "id": "589664c4", + "metadata": {}, + "source": [ + "### model performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40258655", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_model.model_performance_graph(pred_label)" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,auto:percent" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/workflow_by_code.ipynb b/examples/workflow_by_code.ipynb index 3d48669303..ac653b0802 100644 --- a/examples/workflow_by_code.ipynb +++ b/examples/workflow_by_code.ipynb @@ -256,7 +256,6 @@ "recorder = R.get_recorder(recorder_id=ba_rid, experiment_name=\"backtest_analysis\")\n", "print(recorder)\n", "pred_df = recorder.load_object(\"pred.pkl\")\n", - "pred_df_dates = pred_df.index.get_level_values(level='datetime')\n", "report_normal_df = recorder.load_object(\"portfolio_analysis/report_normal_1day.pkl\")\n", "positions = recorder.load_object(\"portfolio_analysis/positions_normal_1day.pkl\")\n", "analysis_df = recorder.load_object(\"portfolio_analysis/port_analysis_1day.pkl\")" diff --git a/qlib/contrib/model/__init__.py b/qlib/contrib/model/__init__.py index fab1af734c..c98f936acd 100644 --- a/qlib/contrib/model/__init__.py +++ b/qlib/contrib/model/__init__.py @@ -10,17 +10,19 @@ from .gbdt import LGBModel except ModuleNotFoundError: DEnsembleModel, LGBModel = None, None - print("Please install necessary libs for DEnsembleModel and LGBModel, such as lightgbm.") + print( + "ModuleNotFoundError. DEnsembleModel and LGBModel are skipped. (optional: maybe installing lightgbm can fix it.)" + ) try: from .xgboost import XGBModel except ModuleNotFoundError: XGBModel = None - print("Please install necessary libs for XGBModel, such as xgboost.") + print("ModuleNotFoundError. XGBModel is skipped(optional: maybe installing xgboost can fix it).") try: from .linear import LinearModel except ModuleNotFoundError: LinearModel = None - print("Please install necessary libs for LinearModel, such as scipy and sklearn.") + print("ModuleNotFoundError. LinearModel is skipped(optional: maybe installing scipy and sklearn can fix it).") # import pytorch models try: from .pytorch_alstm import ALSTM @@ -36,6 +38,6 @@ pytorch_classes = (ALSTM, GATs, GRU, LSTM, DNNModelPytorch, TabnetModel, SFM_Model, TCN, ADD) except ModuleNotFoundError: pytorch_classes = () - print("Please install necessary libs for PyTorch models.") + print("ModuleNotFoundError. PyTorch models are skipped (optional: maybe installing pytorch can fix it).") all_model_classes = (CatBoostModel, DEnsembleModel, LGBModel, XGBModel, LinearModel) + pytorch_classes diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index b1ec7383d4..7262640588 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -199,6 +199,9 @@ def prepare( col_set : str The col_set will be passed to self.handler when fetching data. + TODO: make it automatic: + - select DK_I for test data + - select DK_L for training data. data_key : str The data to fetch: DK_* Default is DK_I, which indicate fetching data for **inference**. diff --git a/qlib/workflow/cli.py b/qlib/workflow/cli.py index d4df0153ec..e0a925b412 100644 --- a/qlib/workflow/cli.py +++ b/qlib/workflow/cli.py @@ -43,6 +43,11 @@ def sys_config(config, config_path): # workflow handler function def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"): + """ + This is a Qlib CLI entrance. + User can run the whole Quant research workflow defined by a configure file + - the code is located here ``qlib/workflow/cli.py` + """ with open(config_path) as fp: config = yaml.safe_load(fp) diff --git a/scripts/data_collector/pit/requirements.txt b/scripts/data_collector/pit/requirements.txt index 0cd9b42f9c..8b652cbd19 100644 --- a/scripts/data_collector/pit/requirements.txt +++ b/scripts/data_collector/pit/requirements.txt @@ -6,4 +6,5 @@ pandas lxml loguru baostock -yahooquery \ No newline at end of file +yahooquery +beautifulsoup4 From ffe36a5e75b396b8f9dcf6d8bd9a18a26331ec1f Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 8 Apr 2022 09:21:05 +0800 Subject: [PATCH 2/2] Update tutorial --- examples/tutorial/detailed_workflow.ipynb | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/examples/tutorial/detailed_workflow.ipynb b/examples/tutorial/detailed_workflow.ipynb index f6bc58ac64..f96e2a52cf 100644 --- a/examples/tutorial/detailed_workflow.ipynb +++ b/examples/tutorial/detailed_workflow.ipynb @@ -696,16 +696,6 @@ "pprint(handler_conf)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "35d9d248", - "metadata": {}, - "outputs": [], - "source": [ - "pprint(handler_conf)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -895,7 +885,9 @@ "id": "0c89c15d", "metadata": {}, "source": [ - "# Model Training & Inference" + "# Model Training & Inference\n", + "\n", + "[Model interface](https://github.com/microsoft/qlib/blob/main/qlib/model/base.py)" ] }, { @@ -975,10 +967,10 @@ "###################################\n", "port_analysis_config = {\n", " \"executor\": {\n", + " \"time_per_step\"\n", " \"class\": \"SimulatorExecutor\",\n", " \"module_path\": \"qlib.backtest.executor\",\n", - " \"kwargs\": {\n", - " \"time_per_step\": \"day\",\n", + " \"kwargs\": {: \"day\",\n", " \"generate_portfolio_metrics\": True,\n", " },\n", " },\n", @@ -1188,7 +1180,8 @@ ], "metadata": { "jupytext": { - "formats": "ipynb,auto:percent" + "encoding": "# -*- coding: utf-8 -*-", + "formats": "ipynb,py:percent" }, "kernelspec": { "display_name": "Python 3 (ipykernel)",