Skip to content

Commit

Permalink
Resolve issues while running automatic update of daily frequency data (from Yahoo Finance) for US region (microsoft#1358)
Browse files: browse the repository at this point in the history

* Update YahooNormalizeUS1dExtend (microsoft#1196)

* Prevent pandas read_csv errors while running update_data_to_bin for US region

* Fix parse_index error while running update_data_to_bin for US region

* Prevent pandas.read_csv errors on specific symbol names

* Reordering parameters for better rendering

* Remove the file-name prefix when checking feature_dir existence

* Add explanatory comments
  • Loading branch information
HyeongminMoon authored and qianyun210603 committed Mar 23, 2023
1 parent e807e16 commit 620c148
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 3 deletions.
2 changes: 1 addition & 1 deletion qlib/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,7 @@ def exists_qlib_data(qlib_dir):
return False

# check instruments
code_names = set(map(lambda x: x.name.lower(), features_dir.iterdir()))
code_names = set(map(lambda x: fname_to_code(x.name.lower()), features_dir.iterdir()))
_instrument = instruments_dir.joinpath("all.txt")
miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names)
if miss_code and any(map(lambda x: "sht" not in x, miss_code)):
Expand Down
15 changes: 14 additions & 1 deletion scripts/data_collector/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,20 @@ def __init__(

def _executor(self, file_path: Path):
file_path = Path(file_path)
df = pd.read_csv(file_path)

# Some symbol_field values, such as "TRUE" and "NA", are parsed as True (bool) and NaN (float) by pandas' default CSV parsing.
# Explicitly set the dtype and na_values for the symbol field so that such symbols are kept as strings.
default_na = pd._libs.parsers.STR_NA_VALUES
symbol_na = default_na.copy()
symbol_na.remove("NA")
columns = pd.read_csv(file_path, nrows=0).columns
df = pd.read_csv(
file_path,
dtype={self._symbol_field_name: str},
keep_default_na=False,
na_values={col: symbol_na if col == self._symbol_field_name else default_na for col in columns},
)

df = self._normalize_obj.normalize(df)
if df is not None and not df.empty:
if self._end_date is not None:
Expand Down
6 changes: 5 additions & 1 deletion scripts/data_collector/yahoo/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,10 @@ class YahooNormalizeUS1d(YahooNormalizeUS, YahooNormalize1d):
pass


class YahooNormalizeUS1dExtend(YahooNormalizeUS, YahooNormalize1dExtend):
    """Combination of ``YahooNormalizeUS`` and ``YahooNormalize1dExtend``.

    Declares no members of its own; all behavior is inherited from the two
    base classes in the order listed.
    """


class YahooNormalizeUS1min(YahooNormalizeUS, YahooNormalize1minOffline):
CALC_PAUSED_NUM = False

Expand Down Expand Up @@ -1196,7 +1200,7 @@ def update_data_to_bin(
importlib.import_module(f"data_collector.{_region}_index.collector"), "get_instruments"
)
for _index in index_list:
get_instruments(str(qlib_data_1d_dir), _index)
get_instruments(str(qlib_data_1d_dir), _index, market_index=f"{_region}_index")


if __name__ == "__main__":
Expand Down

0 comments on commit 620c148

Please sign in to comment.