Skip to content

Commit

Permalink
Store Unihan in dictionary structure
Browse files Browse the repository at this point in the history
This is a much more useful structure, and also unifies the file
structure with the augmentation file. I've opened a ticket with
unihan_etl asking to add dictionary structuring as an option:
cihai/unihan-etl#233.
  • Loading branch information
garfieldnate committed Dec 27, 2020
1 parent 988390d commit 6f66b8a
Showing 1 changed file with 14 additions and 6 deletions.
20 changes: 14 additions & 6 deletions uniunihan_db/build_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import requests
from unihan_etl.process import Packager as unihan_packager
from unihan_etl.process import export_json

from .lingua import japanese, mandarin

Expand Down Expand Up @@ -39,17 +40,26 @@


def unihan_download():
"""Download the famous Unihan database, from the Unicode Consortium,
"""Download the famous Unihan database from the Unicode Consortium,
and store it as a normalized JSON file"""

if UNIHAN_FILE.exists() and UNIHAN_FILE.stat().st_size > 0:
log.info(f"{UNIHAN_FILE.name} already exists; skipping download")
return

log.info(f"Downloading unihan to {UNIHAN_FILE}...")
log.info("Downloading unihan data...")
p = unihan_packager.from_cli(["-F", "json", "--destination", str(UNIHAN_FILE)])
p.download()
p.export()
# instruct packager to return data instead of writing to file
# https://github.com/cihai/unihan-etl/issues/233
p.options["format"] = "python"
unihan = p.export()

log.info("Converting unihan data to dictionary format...")
unihan_dict = {entry["char"]: entry for entry in unihan}

log.info(f"Writing unihan to {UNIHAN_FILE}...")
export_json(unihan_dict, UNIHAN_FILE)


def cjkvi_ids_download():
Expand Down Expand Up @@ -118,9 +128,7 @@ def expand_unihan():

log.info("Expanding Unihan data...")
new_data = {}
for entry in unihan:
key = entry["ucn"]
# new_data[key] = entry
for key, entry in unihan.items():
new_data[key] = {}
if on_list := entry.get("kJapaneseOn"):
kana_list = []
Expand Down

0 comments on commit 6f66b8a

Please sign in to comment.