Skip to content

Commit

Permalink
Use nicer slash syntax for making paths
Browse files Browse the repository at this point in the history
  • Loading branch information
garfieldnate committed Dec 27, 2020
1 parent 8338f59 commit 988390d
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 7 deletions.
14 changes: 7 additions & 7 deletions uniunihan_db/build_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@
from .lingua import japanese, mandarin

# Filesystem locations and download URLs used by the database build.
# NOTE(review): this span is a diff rendered without +/- markers. Each
# ``Path(parent, name)`` assignment is immediately followed by the
# ``parent / name`` form that replaces it; both build the same path, and the
# second assignment is the one left in effect.

# Two directory levels above this file.
PROJECT_DIR = Path(__file__).parents[1]
DATA_DIR = Path(PROJECT_DIR, "data")
DATA_DIR = PROJECT_DIR / "data"
# Import-time side effect: create the data directory if it does not exist yet.
DATA_DIR.mkdir(exist_ok=True)
LOG_FILE = Path(DATA_DIR, "log.txt")
LOG_FILE = DATA_DIR / "log.txt"

UNIHAN_FILE = Path(DATA_DIR, "unihan.json")
UNIHAN_AUGMENTATION_FILE = Path(DATA_DIR, "unihan_augmentation.json")
UNIHAN_FILE = DATA_DIR / "unihan.json"
UNIHAN_AUGMENTATION_FILE = DATA_DIR / "unihan_augmentation.json"

# Archive URL for the cjkvi-ids repository, with the local zip path and the
# directory name that share its "cjkvi-ids-master" stem.
CJKVI_IDS_URL = "https://github.com/cjkvi/cjkvi-ids/archive/master.zip"
CJKV_IDS_ZIP_FILE = Path(DATA_DIR, "cjkvi-ids-master.zip")
CJKV_IDS_DIR = Path(DATA_DIR, "cjkvi-ids-master")
CJKV_IDS_ZIP_FILE = DATA_DIR / "cjkvi-ids-master.zip"
CJKV_IDS_DIR = DATA_DIR / "cjkvi-ids-master"

# Source URL and local TSV path for the Jun Da character list.
JUN_DA_CHAR_FREQ_URL = (
"https://lingua.mtsu.edu/chinese-computing/statistics/char/list.php"
)
JUN_DA_CHAR_FREQ_FILE = Path(DATA_DIR, "jun_da_char.tsv")
JUN_DA_CHAR_FREQ_FILE = DATA_DIR / "jun_da_char.tsv"

logging.basicConfig(
level=os.environ.get("LOGLEVEL", "INFO"),
Expand Down
77 changes: 77 additions & 0 deletions uniunihan_db/feature_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import dataclasses
import json
from pathlib import Path

from .lingua import japanese, mandarin

PROJECT_DIR = Path(__file__).parents[1]

DATA_DIR = PROJECT_DIR / "data"


def _read_hsk(max_level: int) -> tuple[list[str], set[str]]:
    """Load the HSK word lists for levels 1 through ``max_level``.

    Each level is read from ``hsk/hsk-<level>.txt`` under ``DATA_DIR``, one
    word per line. Surrounding whitespace is stripped and blank lines are
    skipped so that no empty "word" ends up in the result.

    Args:
        max_level: Highest HSK level to include; must be between 1 and 6.

    Returns:
        A ``(word_list, char_set)`` tuple. ``word_list`` holds the words in
        file order, lower levels first; ``char_set`` holds every character
        occurring in those words.

    Raises:
        ValueError: If ``max_level`` is smaller than 1 or larger than 6.
    """
    if max_level < 1 or max_level > 6:
        raise ValueError("max HSK level must be between 1 and 6")
    char_set = set()
    word_list = []
    for level in range(1, max_level + 1):
        # The word lists contain Chinese text, so decode explicitly as UTF-8
        # instead of relying on the platform's locale encoding.
        with open(
            DATA_DIR / "hsk" / f"hsk-{level}.txt", encoding="utf-8"
        ) as f:
            for line in f:
                word = line.strip()
                if not word:
                    continue
                word_list.append(word)
                # Updating a set with a string adds its individual characters.
                char_set.update(word)

    return word_list, char_set


def _read_unihan():
    """Load and return the parsed contents of ``unihan.json`` in ``DATA_DIR``.

    Returns:
        Whatever object the JSON document decodes to.
    """
    # The file holds CJK text; decode explicitly as UTF-8 rather than with the
    # platform's locale encoding, which may not be able to represent it.
    with open(DATA_DIR / "unihan.json", encoding="utf-8") as f:
        return json.load(f)


def _get_pronunciation_feats(syl, prefix):
    """Turn a syllable dataclass into a flat dict of prefixed features.

    Every field of ``syl`` becomes one entry whose key is
    ``"<prefix>_<field name>"`` and whose value is the field's value as
    produced by ``dataclasses.asdict``.
    """
    feats = {}
    for field_name, field_value in dataclasses.asdict(syl).items():
        feats[f"{prefix}_{field_name}"] = field_value
    return feats


def get_feats(unihan_entry):
    """Build the feature dict for a single Unihan entry.

    The result always carries the entry's ``"char"``. When the entry has a
    non-empty ``"kJapaneseOn"`` list, its first reading is passed through
    ``japanese.alpha_to_alpha`` and ``japanese.parse_han_syllable``; a truthy
    parse result contributes ``jp_``-prefixed features. When
    ``entry["kMandarin"]["zh-Hans"]`` is present and truthy, it is parsed with
    ``mandarin.parse_syllable``; a truthy result contributes ``zh_``-prefixed
    features.
    """
    feats = {"char": unihan_entry["char"]}
    # get traditional variant

    # pronunciation: JP
    on_readings = unihan_entry.get("kJapaneseOn")
    if on_readings:
        # TODO: allow using other pronunciations?
        first_reading = japanese.alpha_to_alpha(on_readings[0])
        jp_syllable = japanese.parse_han_syllable(first_reading)
        if jp_syllable:
            feats.update(_get_pronunciation_feats(jp_syllable, "jp"))

    # pronunciation: ZH
    mandarin_readings = unihan_entry.get("kMandarin", {})
    simplified_reading = mandarin_readings.get("zh-Hans")
    if simplified_reading:
        # TODO: allow using Taiwan pronunciations
        zh_syllable = mandarin.parse_syllable(simplified_reading)
        if zh_syllable:
            feats.update(_get_pronunciation_feats(zh_syllable, "zh"))

    # list of top-level radicals
    # list of recursively added radicals
    return feats


# Work-in-progress script entry point: reads the HSK levels 1-6 word lists
# (keeping only the character set), loads the Unihan data, builds one feature
# dict per Unihan entry with get_feats, and prints feature dicts.
# NOTE(review): indentation is not visible in this view, so it cannot be
# determined here whether exit() runs inside the loop (stop after the first
# printed entry) or after it (making print(char_set) unreachable) -- confirm
# against the real file.
# NOTE(review): bare exit() is the helper injected by the ``site`` module for
# interactive use; sys.exit() is the documented choice for programs.
def main():
# TODO: allow choosing character set
# The HSK word list is discarded; only the character set is kept.
_, char_set = _read_hsk(6)
unihan = _read_unihan()
# All feature dicts are built up front, before anything is printed.
feature_sets = [get_feats(entry) for entry in unihan]
for s in feature_sets:
# collect all features
print(s)
# Debugging stop; see the review notes above the function.
exit()
# print(",".join(vector))
# print(word_list)
print(char_set)
# for char in char_list:
# pass


if __name__ == "__main__":
main()

0 comments on commit 988390d

Please sign in to comment.