-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Deposit working tools, initial read list and .csv
- Loading branch information
0 parents
commit b75724f
Showing
8 changed files
with
43,099 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,306 @@ | ||
# name: dek_csv.py | ||
# author: nbehrnd@yahoo.com | ||
# license: MIT, 2020 | ||
# date: 2020-05-31 (YYYY-MM-DD) | ||
# edit: | ||
# | ||
""" Consolidation of dek_quick_csv.py's dek2anki.csv relational table. | ||
The content of file dek2anki.csv, written by script dek_quick_csv.py,
is extended by this script with tags in a third column.  Eventually,
the lines in this relational table follow a pattern of
    Aufstand; <img src="DEK_VS_steno_svg_-_Aufstand.svg">; DEK_b auf st
to relate a key (here, "Aufstand") with the address of the .svg file
(second column), and tags about this entry (third column).  To allow a
parallel use of either this project's Anki deck or others, each entry
is tagged by "DEK_b".  To ease self study, additional tags may be set,
too.  At present, this approach is based on the comparison of strings
in the file name,
+ to indicate entries contrasting symbolizations like the illustration
  of Automaten_ABER_Automatten.svg.
+ to indicate the _possible_ occurrence of a symbolization about groups
  of vowels, or whole syllables.  This identification, and the
  discernment of symbolizations, e.g., of "mp" but not "mpf", "dr" but
  not "ndr", is at an experimental stage.  At present, not all 100
  kuerzel, nor all vowel symbolizations are known to the set of rules
  here.  It is also known that this simple approach yields "false
  positives", suggesting the presence of a special symbolization where
  there is none (e.g., Bausparer does not use the symbolization of
  "aus").  It is the intent to improve the attribution gradually.
Again, because Anki expects a .csv in UTF-8 and because of the use of
special characters like umlauts, the script's action, launched on the
CLI by
    python dek_csv.py
is restricted to Python 3. """
|
||
import os | ||
import shutil | ||
import sys | ||
|
||
from hyphen import Hyphenator | ||
h_de = Hyphenator('de_DE') | ||
|
||
|
||
def check_python():
    """Refuse to run under Python 2.

    Python 3 returns silently.  Python 2 prints a notice and terminates
    the script via ``sys.exit()``.  Any other major version only prints a
    reminder; execution then continues.
    """
    major = sys.version_info[0]
    if major == 3:
        return
    if major == 2:
        print("\nThe script works with Python 3, only.\n")
        sys.exit()
    print("\nBe sure to call the script with Python 3, only.\n")
|
||
|
||
def only_check_presence_workshop():
    """Exit unless the current directory contains a folder 'dek_workshop'.

    Only the presence of the folder is probed; its content is not touched.
    """
    found = any(
        str(entry) == str("dek_workshop") and os.path.isdir(entry)
        for entry in os.listdir(".")
    )
    if not found:
        print("Folder 'dek_workshop' is missing. Exit.")
        sys.exit()
|
||
|
||
def remove_from_list():
    """Drop unsuitable entries from dek2anki.csv and append a tag column.

    Reads 'dek_workshop/dek2anki.csv' (UTF-8), skips every line whose key
    (the text before the first ';') is black-listed or longer than 70
    characters, lets analysator() compute the tags for the remaining keys
    and rewrites the file in place with '; <tags>' appended to each line.

    The key is handed to analysator() by the module-level name `check`,
    and the tags are read back from the module-level name `tag_line`.

    Exits the script via sys.exit() if the file cannot be read.
    """
    global check  # analysator() reads the current key from this global

    # The black list about files to exclude from the relational table
    # intentionally uses the shorter, easier to maintain list of keys
    # instead of the lengthier file names.
    black_list = frozenset([
        '2_Grundlinien', '2_Grundlinien_Linienstärke_sechs',
        '2_Grundlinien_groß', '3_Grundlinien_Linienstärke_sechs',
        'Grundlinien', 'SETZKASTEN_NUR_Buchstaben', 'SETZKASTEN_NUR_Kürzel',
        'SETZKASTEN_mit_vielfältigen_Buchstabenformen', '1_ABER_2',
        'a_b_br_d_e_f_g_gr_gl_h_j_k_l_ll_m_n_o_p_r_t_tr_v_w_ö',
        'a_b_br_d_e_f_g_gr_gl_h_j_k_l_ll_m_n_o_p_r_t_tr_v_w_ö_v2',
        'a_b_br_d_e_f_g_gr_gl_h_i_j_k_l_ll_m_n_o_p_r_t_tr_v_w_ö',
        'a_b_br_d_e_f_g_gr_gl_h_i_j_k_l_ll_m_n_o_p_r_t_tr_v_w_ö_v2',
        'a_b_br_d_e_f_g_gr_h_k_l_ll', 'a_b_br_d_f', 'ä,_ö_ü,_ei,_u,_e,_i,_o',
        'auf,_hat,_das,_für', 'Baumast_Bau-Mast', 'Baumast_Baum-Ast',
        'b_be-_r_er_f_für',
        'b_br_d_f_g_gr_h_j_k_l_m_n_p_r_rs_s_ss_t_Aufstrich_t_tr_v_w',
        'b_br_d_f_g_gr_h_j_k_l_m_n_p_r_rs_s_ss_t_tr_v_w',
        'b_br_d_f_g_gr_h_j_k_l_m_n_p_r_s_t_tr_v_w',
        'be-_das_dem_den_der_deutsch_die_er_er-_es',
        'betr_betreffend_betreffs_betrifft', 'br_cr_gr_tr_kr_rr', 'gl', 'll',
        'm_n_o_p_r_t_tr_w_ö', 'n_v2', 'pr_wr_schw_zw', 'r_n_d', 'schl',
        'sch_schm_schn_schw',
        'sch_schm_schn_schw_sp_z_zw_heit_ung_schr_spr_str_zr',
        'sie_APOSTROPH_s', 'Vokale_und_Diphthonge_v1a',
        # The comma after '..._v1a1' was missing; the two neighbouring
        # literals were silently concatenated into one useless entry.
        'Vokale_und_Diphthonge_v1a1',
        'Vokale_und_Diphthonge_v1b1', 'Vokale_und_Diphthonge_v1c',
        'Vokale_und_Diphthonge_v1c1', 'Vokale_und_Diphthonge_v2a',
        'Vokale_und_Diphthonge_v2a1', 'Vokale_und_Diphthonge_v2c',
        'Vokale_und_Diphthonge_v2c1', 'Vokale_und_Diphthonge_v3a',
        'Vokale_und_Diphthonge_v4a', 'Vokale_und_Diphthonge_v4c',
        'Vokale_und_Diphthonge_v6c', 'Vokale_und_Diphthonge_v7c',
        'Vokale_und_Diphthonge_v8c', 'Vokale_und_Diphthonge_v9c',
        'Vokale_und_Diphthonge_v10c', 'Vokale_und_Diphthonge_v11c',
        'Vokale_und_Diphthonge_v12c', 'Vokale_und_Diphthonge_v13c', 'z_zr_zw',
    ])

    # Address the file by path instead of changing the working directory.
    csv_path = os.path.join("dek_workshop", "dek2anki.csv")

    # The keys contain umlauts and Anki expects UTF-8, hence the explicit
    # encoding instead of the platform default.
    try:
        with open(csv_path, mode="r", encoding="utf-8") as source:
            old_register = source.readlines()
    except IOError:
        print("\nFile 'dek2anki.csv' is inaccessible.")
        print("Maybe a run of dek_quick_csv.py solves this issue.\n")
        sys.exit()

    new_register = []
    for line in old_register:
        entry = line.strip()  # remove, e.g. line feed
        if not entry:
            continue  # a blank line would yield a key-less junk row
        check = entry.split(";")[0]  # identify the easier to use key
        if check in black_list or len(check) > 70:
            continue
        analysator()  # deposits the tags in the global `tag_line`
        new_register.append("; ".join([entry, tag_line]))

    with open(csv_path, mode="w", encoding="utf-8") as newfile:
        newfile.writelines("{}\n".format(entry) for entry in new_register)
|
||
|
||
def folder_cleaner():
    """Remove obsolete files from 'dek_workshop'.

    A copy of 'dek_workshop/dek2anki.csv' is first deposited in the
    current working directory.  Then every regular file in 'dek_workshop'
    is deleted unless its name equals
    'DEK_VS_steno_svg_-_<key>.svg' for a key listed in the table; this
    includes the table inside 'dek_workshop' itself.  Sub-directories are
    left alone.
    """
    workshop = "dek_workshop"
    csv_path = os.path.join(workshop, "dek2anki.csv")
    stem = "DEK_VS_steno_svg_-_"

    # Rescue the table first: the clean-up below deletes the original.
    shutil.copy(csv_path, os.getcwd())

    # UTF-8 is stated explicitly because the keys contain umlauts.
    with open(csv_path, mode="r", encoding="utf-8") as source:
        keep = {
            "".join([stem, line.strip().split(";")[0], ".svg"])
            for line in source
        }

    for name in os.listdir(workshop):
        path = os.path.join(workshop, name)
        # os.remove() cannot delete directories and would abort the run.
        if name in keep or os.path.isdir(path):
            continue
        os.remove(path)
|
||
|
||
def remove_from_folder():
    """Ask whether files no longer listed shall be removed from dek_workshop.

    The answer is read from standard input; surrounding white space and
    letter case are ignored, and both the initial letter and the full
    word are accepted.  Only '[y]es' triggers folder_cleaner() and
    returns; every other answer -- including closed standard input --
    ends the script via sys.exit().
    """
    # The table written by this script is 'dek2anki.csv'.
    print("\nAn annotated relation table, 'dek2anki.csv', was written.")
    print("\nTo remove files no longer listed from 'dek_workshop', press")
    print("[y]es to delete them now.")
    print("[n]o to keep them.")
    print("[q]uit to leave the program altogether.")
    print("\nConfirm your choice with ENTER.")

    try:
        choice = input().strip().lower()
    except EOFError:
        # No interactive input available; never delete without consent.
        choice = "q"

    if choice in ("y", "yes"):
        print("Obsolete files will be deleted.")
        folder_cleaner()
    elif choice in ("n", "no"):
        print("All files will be retained. Exit.")
        sys.exit()
    elif choice in ("q", "quit"):
        print("Exit of the script.")
        sys.exit()
    else:
        print("Invalid input. The script closes.")
        sys.exit()
|
||
|
||
def analysator(key=None):
    """Provide meaningful tags for column #3 in file 'dek2anki.csv'.

    Args:
        key: the entry's key (first column of the table).  If omitted,
            the key is taken from the module-level name `check`, which is
            how remove_from_list() hands it over.

    Returns:
        The space separated tag string, always starting with "DEK_b".
        The same string is also stored in the module-level name
        `tag_line`, from which remove_from_list() reads it.

    All rules are plain string comparisons on the lower-cased key, except
    the "st" rule, which uses the syllables reported by the module-level
    hyphenator `h_de`.
    """
    global tag_line

    if key is None:
        key = check  # legacy hand-over by remove_from_list()
    key = str(key)
    test = key.lower()
    tags = ["DEK_b"]

    # rule contrasting illustrations (case-sensitive on the original key):
    if "ABER" in key:
        tags.append("Vergleich")

    # Identification of 17 non-ambiguous symbolizations -- a concept.
    #
    # It is plausible that these lists are incomplete.
    # It is complemented by later rules discerning e.g., 'st' from 'str'.
    grouped_consonants = [
        'br', 'cr', 'fr', 'gr', 'kr', 'mpf', 'ndr', 'pfr', 'rdr', 'schl',
        'schm', 'schn', 'schr', 'spr', 'str', 'wr', 'zw'
    ]

    # Incomplete list of 58, apparently easier to retrieve, kuerzel.
    # Again, there are some for which this simple string-based approach
    # is not working well enough (e.g., 'wo' vs. 'woll' or 'worden'; or
    # 'in' vs. 'meine', 'deine', 'hint', 'keine', 'seine' or 'sind';
    # or 'un' vs. 'unter'; or reserved symbolizations like 'dem' which
    # is not used in 'demokratisch') thus not yet considered here.
    # ('zurück' used to be listed twice, which emitted the tag twice.)
    kuerzel = [
        'also', 'ander', 'ant', 'auf', 'aus', 'besonder', 'bis', 'dar',
        'deine', 'dessen', 'deutsch', 'dies', 'doch', 'durch', 'fort', 'für',
        'gegen', 'heit', 'hint', 'ion', 'keine', 'konnt', 'lich', 'lung',
        'meine', 'mit', 'nichts', 'noch', 'nur', 'ohne', 'rung', 'schaft',
        'schon', 'seine', 'selbst', 'sich', 'sind', 'solch', 'soll', 'sonder',
        'über', 'unter', 'vielleicht', 'voll', 'vom', 'von', 'völl', 'wenn',
        'will', 'wird', 'woll', 'worden', 'wurd', 'zer', 'zum', 'zurück',
        'zusammen', 'zwischen'
    ]
    tags.extend(
        element for element in grouped_consonants + kuerzel
        if element in test
    )

    # specialty rules, complementing the simpler ones above:
    #
    # "ge" at the beginning of the word, but not as "gegen"
    #
    # Pro: Identifies, e.g. "Gebiet", excludes entries like "Gegend",
    #      or "gegenüber", and conjunctions to "ei" ("Geige").
    #
    # Con: Detection of plausible matches like "Angebot", "angeboren"
    #      is missed.  Neither pure string comparison, nor a syllable
    #      based approach so far prevent collisions with false-positives
    #      like "Türangel", or "Enge"; beside an open identification
    #      of "ng" != ["lung", "rung"].
    #
    # The slice test[2:3] is empty for the two-letter key "ge" instead of
    # raising IndexError; strings are compared by value, not identity.
    if (test.startswith("ge") and not test.startswith("gegen")
            and test[2:3] != "i"):
        tags.append("ge")

    # identification of "sch" as different from groups "schl", "schm",
    # "schn", "schr", and separate from kuerzel "schaft" and "deutsch"
    #
    # Only the first occurrence of "sch" is inspected.  A "sch" closing
    # the word (e.g., "Tisch") has no following letter and counts.
    start = test.find("sch")
    if start != -1:
        in_group = test[start + 3:start + 4] in ("l", "m", "n", "r")
        in_schaft = test.startswith("schaft", start)
        in_deutsch = start >= 4 and test[start - 4:start + 3] == "deutsch"
        if not (in_group or in_schaft or in_deutsch):
            tags.append("sch")

    # identification and discern of "st" from "str"
    #
    if "st" in test:
        syllables = h_de.syllables(test)
        if any(syllable.startswith("st") and not syllable.startswith("str")
               for syllable in syllables):
            tags.append("st")

    # identification and discern of "tr" from "str"
    #
    # Only the first occurrence of "tr" is inspected.  At position 0
    # there is no preceding letter; index -1 would wrongly wrap around to
    # the last character of the word (e.g., "Tross").
    start = test.find("tr")
    if start != -1 and (start == 0 or test[start - 1] != "s"):
        tags.append("tr")

    # identification of "un" beside "unter" as start of a word
    #
    if test.startswith("un") and not test.startswith("unter"):
        tags.append("un")

    tag_line = " ".join(tags)
    return tag_line
|
||
|
||
def main():
    """Join the functions.

    Runs, in order: the interpreter check, the probe for folder
    'dek_workshop', the consolidation of the relational table, and the
    interactive, optional clean-up of the folder.
    """
    check_python()
    only_check_presence_workshop()
    remove_from_list()
    remove_from_folder()


# Run only when launched as a script, so that importing the module does
# not rewrite the table or prompt for input.
if __name__ == "__main__":
    main()
Oops, something went wrong.