Use nicer slash syntax for making paths

garfieldnate · Dec 27, 2020 · 988390d · 988390d
1 parent 8338f59
commit 988390d
Show file tree

Hide file tree

Showing 2 changed files with 84 additions and 7 deletions.
diff --git a/uniunihan_db/build_db.py b/uniunihan_db/build_db.py
@@ -11,21 +11,21 @@
 from .lingua import japanese, mandarin
 
 PROJECT_DIR = Path(__file__).parents[1]
-DATA_DIR = Path(PROJECT_DIR, "data")
+DATA_DIR = PROJECT_DIR / "data"
 DATA_DIR.mkdir(exist_ok=True)
-LOG_FILE = Path(DATA_DIR, "log.txt")
+LOG_FILE = DATA_DIR / "log.txt"
 
-UNIHAN_FILE = Path(DATA_DIR, "unihan.json")
-UNIHAN_AUGMENTATION_FILE = Path(DATA_DIR, "unihan_augmentation.json")
+UNIHAN_FILE = DATA_DIR / "unihan.json"
+UNIHAN_AUGMENTATION_FILE = DATA_DIR / "unihan_augmentation.json"
 
 CJKVI_IDS_URL = "https://github.com/cjkvi/cjkvi-ids/archive/master.zip"
-CJKV_IDS_ZIP_FILE = Path(DATA_DIR, "cjkvi-ids-master.zip")
-CJKV_IDS_DIR = Path(DATA_DIR, "cjkvi-ids-master")
+CJKV_IDS_ZIP_FILE = DATA_DIR / "cjkvi-ids-master.zip"
+CJKV_IDS_DIR = DATA_DIR / "cjkvi-ids-master"
 
 JUN_DA_CHAR_FREQ_URL = (
     "https://lingua.mtsu.edu/chinese-computing/statistics/char/list.php"
 )
-JUN_DA_CHAR_FREQ_FILE = Path(DATA_DIR, "jun_da_char.tsv")
+JUN_DA_CHAR_FREQ_FILE = DATA_DIR / "jun_da_char.tsv"
 
 logging.basicConfig(
     level=os.environ.get("LOGLEVEL", "INFO"),

diff --git a/uniunihan_db/feature_extraction.py b/uniunihan_db/feature_extraction.py
@@ -0,0 +1,77 @@
+import dataclasses
+import json
+from pathlib import Path
+
+from .lingua import japanese, mandarin
+
+# Two levels up from this file, i.e. the parent of the directory that
+# contains this module.
+PROJECT_DIR = Path(__file__).parents[1]
+
+# Directory the readers below load their input files from
+# (the hsk/ word lists and unihan.json).
+DATA_DIR = PROJECT_DIR / "data"
+
+
+def _read_hsk(max_level):
+    if max_level < 1 or max_level > 6:
+        raise ValueError("max HSK level must be between 1 and 6")
+    char_set = set()
+    word_list = []
+    for level in range(1, max_level + 1):
+        with open(DATA_DIR / "hsk" / f"hsk-{level}.txt") as f:
+            for line in f:
+                word = line.strip()
+                word_list.append(word)
+                char_set.update(word)
+
+    return word_list, char_set
+
+
+def _read_unihan():
+    with open(DATA_DIR / "unihan.json") as f:
+        unihan = json.load(f)
+    return unihan
+
+
+def _get_pronunciation_feats(syl, prefix):
+    syl = dataclasses.asdict(syl)
+    return {f"{prefix}_{k}": v for k, v in syl.items()}
+
+
+def get_feats(unihan_entry):
+    feats = {"char": unihan_entry["char"]}
+    # get traditional variant
+    # pronunciation: JP
+
+    if on_list := unihan_entry.get("kJapaneseOn"):
+        # TODO: allow using other pronunciations?
+        ime = japanese.alpha_to_alpha(on_list[0])
+        if han_syl := japanese.parse_han_syllable(ime):
+            feats |= _get_pronunciation_feats(han_syl, "jp")
+
+    # pronunciation: ZH
+    if s := unihan_entry.get("kMandarin", {}).get("zh-Hans"):
+        # TODO: allow using Taiwan pronunciations
+        if syl := mandarin.parse_syllable(s):
+            feats |= _get_pronunciation_feats(syl, "zh")
+
+    # list of top-level radicals
+    # list of recursively added radicals
+    return feats
+
+
+def main():
+    # TODO: allow choosing character set
+    _, char_set = _read_hsk(6)
+    unihan = _read_unihan()
+    feature_sets = [get_feats(entry) for entry in unihan]
+    for s in feature_sets:
+        # collect all features
+        print(s)
+        exit()
+        # print(",".join(vector))
+    # print(word_list)
+    print(char_set)
+    # for char in char_list:
+    #     pass
+
+
+if __name__ == "__main__":
+    main()