From 18ecc69aefe2e49cc784443833ee9f6ce6331259 Mon Sep 17 00:00:00 2001
From: Young <afe.young@gmail.com>
Date: Thu, 7 Apr 2022 14:12:47 +0800
Subject: [PATCH 1/2] Add Qlib notebook tutorial

---
 README.md                                   |    6 +-
 examples/tutorial/detailed_workflow.ipynb   | 1225 +++++++++++++++++++
 examples/workflow_by_code.ipynb             |    1 -
 qlib/contrib/model/__init__.py              |   10 +-
 qlib/data/dataset/__init__.py               |    3 +
 qlib/workflow/cli.py                        |    5 +
 scripts/data_collector/pit/requirements.txt |    3 +-
 7 files changed, 1246 insertions(+), 7 deletions(-)
 create mode 100644 examples/tutorial/detailed_workflow.ipynb

diff --git a/README.md b/README.md
index fde015723a..9ae767771a 100644
--- a/README.md
+++ b/README.md
@@ -386,6 +386,8 @@ Dataset plays a very important role in Quant. Here is a list of the datasets bui
 Your PR to build new Quant dataset is highly welcomed.
 
 # More About Qlib
+If you want to have a quick glance at the most frequently used components of qlib, you can try notebooks [here](examples/tutorial/).
+
 The detailed documents are organized in [docs](docs/).
 [Sphinx](http://www.sphinx-doc.org) and the readthedocs theme is required to build the documentation in html formats. 
 ```bash
@@ -471,8 +473,10 @@ If you don't know how to start to contribute, you can refer to the following exa
 | Models |  [Implement a new model](https://github.com/microsoft/qlib/pull/689) | 
 
 [Good first issues](https://github.com/microsoft/qlib/labels/good%20first%20issue) are labelled to indicate that they are easy to start your contributions.
+
+You can find some imperfect implementations in Qlib by running `rg 'TODO|FIXME' qlib`.
  
-If you would like to become one of Qlib's maintainers to contribute more (e.g. help merge PR, triage issues), please contact us by email([qlib@microsoft.com](mailto:qlib@microsoft.com)).  We are glad to help you to set the right permission.
+If you would like to become one of Qlib's maintainers to contribute more (e.g. help merge PR, triage issues), please contact us by email([qlib@microsoft.com](mailto:qlib@microsoft.com)).  We are glad to help to upgrade your permission.
 
 ## Licence
 Most contributions require you to agree to a
diff --git a/examples/tutorial/detailed_workflow.ipynb b/examples/tutorial/detailed_workflow.ipynb
new file mode 100644
index 0000000000..f6bc58ac64
--- /dev/null
+++ b/examples/tutorial/detailed_workflow.ipynb
@@ -0,0 +1,1225 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ac18d8f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#  Copyright (c) Microsoft Corporation.\n",
+    "#  Licensed under the MIT License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "096c6260",
+   "metadata": {},
+   "source": [
+    "# Introduction\n",
+    "Users can automatically run the whole Quant research workflow based on configurations with Qlib.\n",
+    "\n",
+    "However, advanced users often want to carefully customize each component to explore more in Quant.\n",
+    "\n",
+    "If you just want a simple example of Qlib. [Quick start](https://github.com/microsoft/qlib#quick-start) and [workflow_by_code](https://github.com/microsoft/qlib/blob/main/examples/workflow_by_code.ipynb) may be a better choice for you.\n",
+    "\n",
+    "If you want to know more details about Quant research, this notebook may be a better place for you to start.\n",
+    "\n",
+    "We hope this script could be a tutorial for users who are interested in the details of Quant.\n",
+    "\n",
+    "This notebook tries to demonstrate how we can use Qlib to build components step by step. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b96a4196",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pprint import pprint\n",
+    "from pathlib import Path\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e707694",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MARKET = \"csi300\"\n",
+    "BENCHMARK = \"SH000300\"\n",
+    "EXP_NAME = \"tutorial_exp\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ff16a42b",
+   "metadata": {},
+   "source": [
+    "# Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df055d7d",
+   "metadata": {},
+   "source": [
+    "## Get data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9898c23",
+   "metadata": {},
+   "source": [
+    "Users can follow [the steps](https://github.com/microsoft/qlib/tree/main/scripts#download-qlib-data) to download data with CLI.\n",
+    "\n",
+    "In this example we use the underlying API to automatically download data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0bcfa97",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.tests.data import GetData\n",
+    "GetData().qlib_data(exists_skip=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "42b89646",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import qlib\n",
+    "qlib.init()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "90080d29",
+   "metadata": {},
+   "source": [
+    "## Inspect raw data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "413e99b8",
+   "metadata": {},
+   "source": [
+    "Currently, Qlib supports several kinds of data sources."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ecf241e9",
+   "metadata": {},
+   "source": [
+    "### Calendar"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c386f38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.data import D\n",
+    "D.calendar(start_time='2010-01-01', end_time='2017-12-31', freq='day')[:2]  # calendar data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f436b49d",
+   "metadata": {},
+   "source": [
+    "### Basic data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a889b763",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = D.features(['SH601216'], ['$open', '$high', '$low', '$close', '$factor'], start_time='2020-05-01', end_time='2020-05-31')   "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eceb43c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import plotly.graph_objects as go\n",
+    "fig = go.Figure(data=[go.Candlestick(x=df.index.get_level_values(\"datetime\"),\n",
+    "                open=df['$open'],\n",
+    "                high=df['$high'],\n",
+    "                low=df['$low'],\n",
+    "                close=df['$close'])])\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "768ef188",
+   "metadata": {},
+   "source": [
+    "### price adjustment"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9a536a4",
+   "metadata": {},
+   "source": [
+    "You may notice that the prices above do not match what you see in the real world.\n",
+    "\n",
+    "Due to price adjustment, the stored prices differ from the real trading prices."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "45df33b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import plotly.graph_objects as go\n",
+    "fig = go.Figure(data=[go.Candlestick(x=df.index.get_level_values(\"datetime\"),\n",
+    "                open=df['$open'] / df['$factor'],\n",
+    "                high=df['$high'] / df['$factor'],\n",
+    "                low=df['$low'] / df['$factor'],\n",
+    "                close=df['$close'] / df['$factor'])])\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6acffc3",
+   "metadata": {},
+   "source": [
+    "Please notice the price gap on [2020-05-26](http://vip.stock.finance.sina.com.cn/corp/view/vISSUE_ShareBonusDetail.php?stockid=601216&type=1&end_date=2020-05-20)\n",
+    "\n",
+    "If we want to represent the change of asset value by price, adjusted prices are necessary.\n",
+    "By default, Qlib stores the adjusted prices."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "af5f063e",
+   "metadata": {},
+   "source": [
+    "### Static universe V.S. dynamic universe"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8b9b8ce5",
+   "metadata": {},
+   "source": [
+    "Dynamic universe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "50d3ab70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# dynamic universe\n",
+    "universe = D.list_instruments(D.instruments('csi100'),  start_time='2010-01-01', end_time='2020-12-31')\n",
+    "pprint(universe)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6be08f23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(len(universe))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "28e7dd04",
+   "metadata": {},
+   "source": [
+    "Qlib uses a dynamic universe by default.\n",
+    "\n",
+    "csi100 has around 100 stocks each day(it is not that accurate due to the low precision of data)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad8b8503",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = D.features(D.instruments('csi100'), ['$close'], start_time='2010-01-01', end_time='2020-12-31')   \n",
+    "df.groupby('datetime').size().plot()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5f37f2ef",
+   "metadata": {},
+   "source": [
+    "### Point-In-Time data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5dfa9e9d",
+   "metadata": {},
+   "source": [
+    "#### download data\n",
+    "NOTE: To run the test faster, we only download the data of two stocks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da0a9564",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p = Path(\"~/.qlib/qlib_data/cn_data/financial\").expanduser()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4657fe13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not p.exists():\n",
+    "    !cd ../../scripts/data_collector/pit/ && pip install -r requirements.txt\n",
+    "    !cd ../../scripts/data_collector/pit/ && python collector.py download_data --source_dir ~/.qlib/stock_data/source/pit --start 2000-01-01 --end 2020-01-01 --interval quarterly --symbol_regex \"^(600519|000725).*\"\n",
+    "    !cd ../../scripts/data_collector/pit/ && python collector.py normalize_data --interval quarterly --source_dir ~/.qlib/stock_data/source/pit --normalize_dir ~/.qlib/stock_data/source/pit_normalized\n",
+    "    !cd ../../scripts/ && python dump_pit.py dump --csv_path ~/.qlib/stock_data/source/pit_normalized --qlib_dir ~/.qlib/qlib_data/cn_data --interval quarterly\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9358cb89",
+   "metadata": {},
+   "source": [
+    "#### querying data\n",
+    "using `roewa(performanceExpressROEWa,业绩快报净资产收益率ROE-加权)` as an example\n",
+    "\n",
+    "If we want to get fundamental data `in the most recent quarter` daily, we can use the following example.\n",
+    "\n",
+    "Moutai released part of its fundamental data on [2019-07-13](http://www.cninfo.com.cn/new/disclosure/detail?stockCode=600519&announcementId=1206443183&orgId=gssh0600519&announcementTime=2019-07-13) and released the rest on [2019-07-18](http://www.cninfo.com.cn/new/disclosure/detail?stockCode=600519&announcementId=1206456129&orgId=gssh0600519&announcementTime=2019-07-18)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47ee1621",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "instruments = [\"sh600519\"]\n",
+    "data = D.features(instruments, ['P($$roewa_q)'], start_time=\"2019-01-01\", end_time=\"2019-07-19\", freq=\"day\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "752f4ffe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data.tail(15)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0e370d2d",
+   "metadata": {},
+   "source": [
+    "### expression engine\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a46d166c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "D.features([\"sh600519\"], ['(EMA($close, 12) - EMA($close, 26))/$close - EMA((EMA($close, 12) - EMA($close, 26))/$close, 9)/$close'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5ddcd1ea",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Dataset loading and preprocessing "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f54c6804",
+   "metadata": {},
+   "source": [
+    "Some heuristic principles for creating features\n",
+    "- make the features comparable between instruments: remove units from the features.\n",
+    "- try to keep the distribution invariant\n",
+    "- keep the scale of features similar"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "93013dcd",
+   "metadata": {},
+   "source": [
+    "### data loader\n",
+    "\n",
+    "Its interface can be found [here](https://github.com/microsoft/qlib/blob/main/qlib/data/dataset/loader.py#L24) \n",
+    "\n",
+    "QlibDataLoader is an implementation which loads data from Qlib's data source"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dcfa44a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.data.dataset.loader import QlibDataLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d78b4bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qdl = QlibDataLoader(config=(['$close / Ref($close, 10)'], ['RET10']))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fb29d8e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qdl.load(instruments=['sh600519'], start_time='20190101', end_time='20191231')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0dded8f5",
+   "metadata": {},
+   "source": [
+    "### data handler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "78d6d2b0",
+   "metadata": {},
+   "source": [
+    "Financial data is never perfect.\n",
+    "\n",
+    "We have to process them before feeding them into Models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c078fa3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = qdl.load(instruments=['sh600519'], start_time='20190101', end_time='20191231')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "45e5adf9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.isna().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "514b85e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.plot(kind='hist')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "db1625f7",
+   "metadata": {},
+   "source": [
+    "DataHandler is responsible for data preprocessing and provides a data fetching interface.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43b35c17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.data.dataset.handler import DataHandlerLP\n",
+    "from qlib.data.dataset.processor import ZScoreNorm, Fillna"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "38a5f4b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE: normally, the training & validation time range will be `fit_start_time`, `fit_end_time`\n",
+    "# however, all the components are decomposed, so the training & validation time range is unknown when preprocessing.\n",
+    "dh = DataHandlerLP(instruments=['sh600519'], start_time='20170101', end_time='20191231',\n",
+    "             infer_processors=[ZScoreNorm(fit_start_time='20170101', fit_end_time='20181231'), Fillna()],\n",
+    "             data_loader=qdl)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9469fd1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = dh.fetch()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fc35b3c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1dd8e11b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.isna().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7208efc3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.plot(kind='hist')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1e0fb32c",
+   "metadata": {},
+   "source": [
+    "### dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0302a801",
+   "metadata": {},
+   "source": [
+    "#### basic dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "96ef76e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.data.dataset import DatasetH, TSDatasetH"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12fd8296",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = DatasetH(dh, segments={\"train\": ('20180101', '20181231'), \"valid\": ('20190101', '20191231')})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cc4c199",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds.prepare('train')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4639b6a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds.prepare('valid')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a56001e4",
+   "metadata": {},
+   "source": [
+    "#### Time Series Dataset\n",
+    "\n",
+    "Different models require different dataset formats.\n",
+    "\n",
+    "For example, Qlib provides a Time Series Dataset(TSDatasetH) to help users to create time-series dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "425135e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = TSDatasetH(step_len=10, handler=dh, segments={\"train\": ('20180101', '20181231'), \"valid\": ('20190101', '20191231')})\n",
+    "train_sampler = ds.prepare('train')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f724041",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_sampler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5aa762c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_sampler[0] # Retrieving the first example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eb64112c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_sampler['2018-01-08', 'sh600519']  # get the time series by <'timestamp', 'instrument_id'> index"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4e6ab197",
+   "metadata": {},
+   "source": [
+    "### Off-the-shelf dataset\n",
+    "\n",
+    "Qlib has already integrated some datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e21b45c",
+   "metadata": {
+    "code_folding": []
+   },
+   "outputs": [],
+   "source": [
+    "handler_kwargs = {\n",
+    "        \"start_time\": \"2008-01-01\",\n",
+    "        \"end_time\": \"2020-08-01\",\n",
+    "        \"fit_start_time\": \"2008-01-01\",\n",
+    "        \"fit_end_time\": \"2014-12-31\",\n",
+    "        \"instruments\": MARKET,\n",
+    "}\n",
+    "handler_conf = {\n",
+    "    \"class\": \"Alpha158\",\n",
+    "    \"module_path\": \"qlib.contrib.data.handler\",\n",
+    "    \"kwargs\": handler_kwargs,\n",
+    "}\n",
+    "pprint(handler_conf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35d9d248",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pprint(handler_conf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17077f0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.utils import init_instance_by_config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a20d9b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd = init_instance_by_config(handler_conf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5aa02379",
+   "metadata": {},
+   "source": [
+    "Using config to create instance is a highly frequently used practice in Qlib (e.g. the [workflows configurations](https://github.com/microsoft/qlib/blob/main/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml) are based on it).\n",
+    "\n",
+    "\n",
+    "The above configuration is the same as the code below"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "480d35bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.contrib.data.handler import Alpha158\n",
+    "hd = Alpha158(**handler_kwargs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b170153d",
+   "metadata": {},
+   "source": [
+    "This dataset has the same structure as the simple one with 1 column  we created just now."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "735758e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = hd.fetch()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b6de50c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb1bc4fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd.data_loader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a927f1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd.data_loader.fields"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f5a15747",
+   "metadata": {},
+   "source": [
+    "#### some details\n",
+    "\n",
+    "The training data may not be the same as the test data.\n",
+    "\n",
+    "e.g.\n",
+    "- the training dataset and test dataset use different filtering rules and data processing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf2defa0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd.learn_processors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef55a881",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd.infer_processors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7af3b077",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd.process_type # appending type"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20127e19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd.fetch(col_set=\"label\", data_key=hd.DK_L)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "09c37d41",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd.fetch(col_set=\"label\", data_key=hd.DK_I)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec2cfb12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_conf = {\n",
+    "        \"class\": \"DatasetH\",\n",
+    "        \"module_path\": \"qlib.data.dataset\",\n",
+    "        \"kwargs\": {\n",
+    "            \"handler\": hd,\n",
+    "            \"segments\": {\n",
+    "                \"train\": (\"2008-01-01\", \"2014-12-31\"),\n",
+    "                \"valid\": (\"2015-01-01\", \"2016-12-31\"),\n",
+    "                \"test\": (\"2017-01-01\", \"2020-08-01\"),\n",
+    "            },\n",
+    "        },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aca33c3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = init_instance_by_config(dataset_conf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0c89c15d",
+   "metadata": {},
+   "source": [
+    "# Model Training & Inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e916286",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.workflow import R\n",
+    "from qlib.workflow.record_temp import SignalRecord, PortAnaRecord, SigAnaRecord"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6975911",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = init_instance_by_config({\n",
+    "        \"class\": \"LGBModel\",\n",
+    "        \"module_path\": \"qlib.contrib.model.gbdt\",\n",
+    "        \"kwargs\": {\n",
+    "            \"loss\": \"mse\",\n",
+    "            \"colsample_bytree\": 0.8879,\n",
+    "            \"learning_rate\": 0.0421,\n",
+    "            \"subsample\": 0.8789,\n",
+    "            \"lambda_l1\": 205.6999,\n",
+    "            \"lambda_l2\": 580.9768,\n",
+    "            \"max_depth\": 8,\n",
+    "            \"num_leaves\": 210,\n",
+    "            \"num_threads\": 20,\n",
+    "        },\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e2dafb6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# start exp to train model\n",
+    "with R.start(experiment_name=EXP_NAME):\n",
+    "    model.fit(dataset)\n",
+    "    R.save_objects(trained_model=model)\n",
+    "\n",
+    "    rec = R.get_recorder()\n",
+    "    rid = rec.id # save the record id\n",
+    "\n",
+    "    # Inference and saving signal\n",
+    "    sr = SignalRecord(model, dataset, rec)\n",
+    "    sr.generate()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b6b9f3d",
+   "metadata": {},
+   "source": [
+    "# Evaluation:\n",
+    "- Signal-based\n",
+    "- Portfolio-based: backtest "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4328f881",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "###################################\n",
+    "# prediction, backtest & analysis\n",
+    "###################################\n",
+    "port_analysis_config = {\n",
+    "    \"executor\": {\n",
+    "        \"class\": \"SimulatorExecutor\",\n",
+    "        \"module_path\": \"qlib.backtest.executor\",\n",
+    "        \"kwargs\": {\n",
+    "            \"time_per_step\": \"day\",\n",
+    "            \"generate_portfolio_metrics\": True,\n",
+    "        },\n",
+    "    },\n",
+    "    \"strategy\": {\n",
+    "        \"class\": \"TopkDropoutStrategy\",\n",
+    "        \"module_path\": \"qlib.contrib.strategy.signal_strategy\",\n",
+    "        \"kwargs\": {\n",
+    "            \"signal\": \"<PRED>\",\n",
+    "            \"topk\": 50,\n",
+    "            \"n_drop\": 5,\n",
+    "        },\n",
+    "    },\n",
+    "    \"backtest\": {\n",
+    "        \"start_time\": \"2017-01-01\",\n",
+    "        \"end_time\": \"2020-08-01\",\n",
+    "        \"account\": 100000000,\n",
+    "        \"benchmark\": BENCHMARK,\n",
+    "        \"exchange_kwargs\": {\n",
+    "            \"freq\": \"day\",\n",
+    "            \"limit_threshold\": 0.095,\n",
+    "            \"deal_price\": \"close\",\n",
+    "            \"open_cost\": 0.0005,\n",
+    "            \"close_cost\": 0.0015,\n",
+    "            \"min_cost\": 5,\n",
+    "        },\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "# backtest and analysis\n",
+    "with R.start(experiment_name=EXP_NAME, recorder_id=rid, resume=True):\n",
+    "\n",
+    "    # signal-based analysis\n",
+    "    rec = R.get_recorder()\n",
+    "    sar = SigAnaRecord(rec)\n",
+    "    sar.generate()\n",
+    "    \n",
+    "    #  portfolio-based analysis: backtest\n",
+    "    par = PortAnaRecord(rec, port_analysis_config, \"day\")\n",
+    "    par.generate()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d66ad59d",
+   "metadata": {},
+   "source": [
+    "# Loading results & Analysis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3d7d7dea",
+   "metadata": {},
+   "source": [
+    "## loading data\n",
+    "Qlib leverages MLflow to save models & data.\n",
+    "All the data can be accessed via `mlflow ui`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ec9dbb6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load recorder\n",
+    "recorder = R.get_recorder(recorder_id=rid, experiment_name=EXP_NAME)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25e72b0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load previous results\n",
+    "pred_df = recorder.load_object(\"pred.pkl\")\n",
+    "report_normal_df = recorder.load_object(\"portfolio_analysis/report_normal_1day.pkl\")\n",
+    "positions = recorder.load_object(\"portfolio_analysis/positions_normal_1day.pkl\")\n",
+    "analysis_df = recorder.load_object(\"portfolio_analysis/port_analysis_1day.pkl\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dce3696b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The previous model can be loaded, but it is not used.\n",
+    "loaded_model = recorder.load_object(\"trained_model\")\n",
+    "loaded_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf8eca78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.contrib.report import analysis_model, analysis_position"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d34b347",
+   "metadata": {},
+   "source": [
+    "## analysis position"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1cae642f",
+   "metadata": {},
+   "source": [
+    "### report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47c727b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analysis_position.report_graph(report_normal_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15b7ca14",
+   "metadata": {},
+   "source": [
+    "### risk analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f100690",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analysis_position.risk_analysis_graph(analysis_df, report_normal_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ed48aee",
+   "metadata": {},
+   "source": [
+    "## analysis model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aec13561",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label_df = dataset.prepare(\"test\", col_set=\"label\")\n",
+    "label_df.columns = ['label']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "56d57363",
+   "metadata": {},
+   "source": [
+    "### score IC"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7612533d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)\n",
+    "analysis_position.score_ic_graph(pred_label)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "589664c4",
+   "metadata": {},
+   "source": [
+    "### model performance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40258655",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analysis_model.model_performance_graph(pred_label)"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,auto:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": true
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/workflow_by_code.ipynb b/examples/workflow_by_code.ipynb
index 3d48669303..ac653b0802 100644
--- a/examples/workflow_by_code.ipynb
+++ b/examples/workflow_by_code.ipynb
@@ -256,7 +256,6 @@
     "recorder = R.get_recorder(recorder_id=ba_rid, experiment_name=\"backtest_analysis\")\n",
     "print(recorder)\n",
     "pred_df = recorder.load_object(\"pred.pkl\")\n",
-    "pred_df_dates = pred_df.index.get_level_values(level='datetime')\n",
     "report_normal_df = recorder.load_object(\"portfolio_analysis/report_normal_1day.pkl\")\n",
     "positions = recorder.load_object(\"portfolio_analysis/positions_normal_1day.pkl\")\n",
     "analysis_df = recorder.load_object(\"portfolio_analysis/port_analysis_1day.pkl\")"
diff --git a/qlib/contrib/model/__init__.py b/qlib/contrib/model/__init__.py
index fab1af734c..c98f936acd 100644
--- a/qlib/contrib/model/__init__.py
+++ b/qlib/contrib/model/__init__.py
@@ -10,17 +10,19 @@
     from .gbdt import LGBModel
 except ModuleNotFoundError:
     DEnsembleModel, LGBModel = None, None
-    print("Please install necessary libs for DEnsembleModel and LGBModel, such as lightgbm.")
+    print(
+        "ModuleNotFoundError. DEnsembleModel and LGBModel are skipped. (optional: maybe installing lightgbm can fix it.)"
+    )
 try:
     from .xgboost import XGBModel
 except ModuleNotFoundError:
     XGBModel = None
-    print("Please install necessary libs for XGBModel, such as xgboost.")
+    print("ModuleNotFoundError. XGBModel is skipped(optional: maybe installing xgboost can fix it).")
 try:
     from .linear import LinearModel
 except ModuleNotFoundError:
     LinearModel = None
-    print("Please install necessary libs for LinearModel, such as scipy and sklearn.")
+    print("ModuleNotFoundError. LinearModel is skipped(optional: maybe installing scipy and sklearn can fix it).")
 # import pytorch models
 try:
     from .pytorch_alstm import ALSTM
@@ -36,6 +38,6 @@
     pytorch_classes = (ALSTM, GATs, GRU, LSTM, DNNModelPytorch, TabnetModel, SFM_Model, TCN, ADD)
 except ModuleNotFoundError:
     pytorch_classes = ()
-    print("Please install necessary libs for PyTorch models.")
+    print("ModuleNotFoundError.  PyTorch models are skipped (optional: maybe installing pytorch can fix it).")
 
 all_model_classes = (CatBoostModel, DEnsembleModel, LGBModel, XGBModel, LinearModel) + pytorch_classes
diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
index b1ec7383d4..7262640588 100644
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -199,6 +199,9 @@ def prepare(
 
         col_set : str
             The col_set will be passed to self.handler when fetching data.
+            TODO: make the choice of data_key automatic:
+                - select DK_I for test data
+                - select DK_L for training data.
         data_key : str
             The data to fetch:  DK_*
             Default is DK_I, which indicate fetching data for **inference**.
diff --git a/qlib/workflow/cli.py b/qlib/workflow/cli.py
index d4df0153ec..e0a925b412 100644
--- a/qlib/workflow/cli.py
+++ b/qlib/workflow/cli.py
@@ -43,6 +43,11 @@ def sys_config(config, config_path):
 
 # workflow handler function
 def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"):
+    """
+    This is the Qlib CLI entry point.
+    Users can run the whole Quant research workflow defined by a configuration file.
+    - the code is located in ``qlib/workflow/cli.py``
+    """
     with open(config_path) as fp:
         config = yaml.safe_load(fp)
 
diff --git a/scripts/data_collector/pit/requirements.txt b/scripts/data_collector/pit/requirements.txt
index 0cd9b42f9c..8b652cbd19 100644
--- a/scripts/data_collector/pit/requirements.txt
+++ b/scripts/data_collector/pit/requirements.txt
@@ -6,4 +6,5 @@ pandas
 lxml
 loguru
 baostock
-yahooquery
\ No newline at end of file
+yahooquery
+beautifulsoup4

From ffe36a5e75b396b8f9dcf6d8bd9a18a26331ec1f Mon Sep 17 00:00:00 2001
From: Young <afe.young@gmail.com>
Date: Fri, 8 Apr 2022 09:21:05 +0800
Subject: [PATCH 2/2] Update tutorial

---
 examples/tutorial/detailed_workflow.ipynb | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/examples/tutorial/detailed_workflow.ipynb b/examples/tutorial/detailed_workflow.ipynb
index f6bc58ac64..f96e2a52cf 100644
--- a/examples/tutorial/detailed_workflow.ipynb
+++ b/examples/tutorial/detailed_workflow.ipynb
@@ -696,16 +696,6 @@
     "pprint(handler_conf)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "35d9d248",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pprint(handler_conf)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -895,7 +885,9 @@
    "id": "0c89c15d",
    "metadata": {},
    "source": [
-    "# Model Training & Inference"
+    "# Model Training & Inference\n",
+    "\n",
+    "[Model interface](https://github.com/microsoft/qlib/blob/main/qlib/model/base.py)"
    ]
   },
   {
@@ -975,10 +967,10 @@
     "###################################\n",
     "port_analysis_config = {\n",
     "    \"executor\": {\n",
     "        \"class\": \"SimulatorExecutor\",\n",
     "        \"module_path\": \"qlib.backtest.executor\",\n",
     "        \"kwargs\": {\n",
     "            \"time_per_step\": \"day\",\n",
     "            \"generate_portfolio_metrics\": True,\n",
     "        },\n",
     "    },\n",
@@ -1188,7 +1180,8 @@
  ],
  "metadata": {
   "jupytext": {
-   "formats": "ipynb,auto:percent"
+   "encoding": "# -*- coding: utf-8 -*-",
+   "formats": "ipynb,py:percent"
   },
   "kernelspec": {
    "display_name": "Python 3 (ipykernel)",