-
Notifications
You must be signed in to change notification settings - Fork 0
/
tools.py
178 lines (158 loc) · 7.03 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import log # set up logging
import json # importing sign table
import regex # splitting text into words
import unicodedata # dealing with diacritics
# get logger from main file
logger = log.logger
def _number_to_aegean(number, numeral_dict):
    """Return the numeral signs that write *number*, or "" if it cannot be written.

    *numeral_dict* maps the decimal string of a place value ("1".."9",
    "10".."90", ... "10000".."90000") to its sign. Zero digits contribute no
    sign. An empty string is returned when the number is outside 1..99,999 or
    when a required place value is missing from the table, so the caller can
    fall back to the original text instead of emitting a partial number.
    """
    # 99,999 is the maximum
    if not 0 < number < 100_000:
        return ""
    digits = str(number)
    signs = []
    for position, digit in enumerate(digits):
        if digit == "0":
            continue
        place_value = int(digit) * 10 ** (len(digits) - position - 1)
        sign = numeral_dict.get(str(place_value))
        if sign is None:
            return ""
        signs.append(sign)
    return "".join(signs)


def latin_to_linear_b(text, hyphens=False, debug=False):
    """Convert a Latin transliteration into Linear B signs.

    The sign table is read from ``sign-table.json`` in the current working
    directory. Three of its top-level keys are used: ``"text"`` (lower-case
    syllable -> sign), ``"ids"`` (sign number as a string -> sign) and
    ``"numerals"`` (place value as a string -> sign).

    Args:
        text: Transliterated text, syllables separated by hyphens,
            e.g. ``"Po-ti-ni-a"``.
        hyphens: When true, the hyphens between syllables are kept in the
            output; by default they are dropped.
        debug: When true, the syllable split of every word is logged.

    Returns:
        The converted string. Anything that cannot be converted (unknown
        syllables, unknown ``*NNN`` ids, numbers that cannot be written with
        the numeral table, punctuation, whitespace) is copied through
        unchanged.
    """
    # the table contains non-ASCII signs, so do not rely on the locale encoding
    with open("sign-table.json", "r", encoding="utf-8") as sign_file:
        sign_table = json.load(sign_file)
    ids_dict = sign_table["ids"]
    numeral_dict = sign_table["numerals"]
    syllabogram_dict = sign_table["text"]

    # split text into words and single separator characters
    # \pL = Letter (any) \pM = Mark (any)
    tokens = filter(None, regex.split(r"([^\p{L}\p{M}0-9_\-\*])", text))

    converted_parts = []
    # TODO convert ideograms!
    for word in tokens:
        # TODO if a word with funny characters then normalise
        for char in word:
            if not unicodedata.is_normalized("NFD", char):
                logger.debug(f"<{char}>")
                char_parts = unicodedata.decomposition(char).split(" ")
                logger.debug(char_parts)
                for part in char_parts:
                    # first part will be normalised letter/number, second will be combining diacritic
                    # decomposition() may return "" or a "<tag>" entry, neither
                    # of which is a hex code point, so never let this raise
                    try:
                        logger.debug(chr(int(part, 16)))
                    except ValueError:
                        logger.debug(part)
                    # this just prints, need to keep hold and recombine after conversion

        syllabograms = regex.split("(-)", word)
        if debug:
            logger.debug(syllabograms)

        for syllabogram in syllabograms:
            lowered = syllabogram.lower()
            if lowered in syllabogram_dict:
                converted = syllabogram_dict[lowered]
            elif syllabogram.startswith("*"):
                # search for number in syllabogram, ignores e.g. VAS
                id_match = regex.search(r"\d+", syllabogram)
                if id_match:
                    converted = ids_dict.get(id_match[0], syllabogram)
                else:
                    converted = syllabogram
            elif syllabogram.isnumeric():
                try:
                    converted = _number_to_aegean(int(syllabogram), numeral_dict)
                except ValueError:
                    # isnumeric() accepts characters that int() rejects
                    converted = ""
                # keep the digits when the number cannot be written, rather
                # than silently dropping it from the output
                converted = converted or syllabogram
            elif syllabogram == "-" and not hyphens:
                # don't add hyphens to output when not requested
                converted = ""
            else:
                converted = syllabogram
            converted_parts.append(converted)

    return "".join(converted_parts)
def linear_b_to_latin(text):
    """Convert Linear B signs into a hyphenated Latin transliteration.

    The sign table is read from ``sign-table.json`` in the current working
    directory. Three of its top-level keys are used: ``"text"`` (syllable ->
    sign), ``"ids"`` (sign number -> sign) and ``"numerals"`` (place value as
    a string -> sign).

    Args:
        text: String containing Linear B signs.

    Returns:
        The transliteration. Syllabic signs become their syllable followed by
        a hyphen (the hyphen at the end of each word is removed), signs known
        only by number become ``*NNN``, and a run of adjacent numeral signs in
        U+10107..U+10133 becomes the decimal sum of their values. Every other
        character, including a numeral sign missing from the table, is copied
        through unchanged.
    """
    # the table contains non-ASCII signs, so do not rely on the locale encoding
    with open("sign-table.json", "r", encoding="utf-8") as sign_file:
        sign_table = json.load(sign_file)

    # invert the tables once so each sign is a single dict lookup; when two
    # keys share a sign the later key wins, matching a scan without `break`
    sound_by_sign = {sign: sound for sound, sign in sign_table["text"].items()}
    id_by_sign = {sign: sign_id for sign_id, sign in sign_table["ids"].items()}
    value_by_numeral = {sign: value for value, sign in sign_table["numerals"].items()}

    first_numeral, last_numeral = "\U00010107", "\U00010133"

    # split text into words and single separator characters
    # \pL = Letter (any) \pM = Mark (any)
    tokens = filter(None, regex.split(r"([^\p{L}\p{M}0-9_\-\*])", text))

    pieces = []
    # Sum of the numeral signs seen since the last non-numeral character.
    # Numeral signs are not letters, so the split above yields each one as a
    # separate token; the sum therefore has to be carried across tokens.
    pending_total = None

    for word in tokens:
        for char in word:
            if first_numeral <= char <= last_numeral and char in value_by_numeral:
                pending_total = (pending_total or 0) + int(value_by_numeral[char])
                continue

            # any other character ends the current number
            if pending_total is not None:
                pieces.append(str(pending_total))
                pending_total = None

            if char in sound_by_sign:
                # try syllabic sounds
                pieces.append(sound_by_sign[char] + "-")
            elif char in id_by_sign:
                # try chars by id
                pieces.append("*" + id_by_sign[char])
            else:
                # not in sign table so punctuation, keep
                pieces.append(char)

        # remove trailing -, e.g. di-pa-
        if pending_total is None and pieces and pieces[-1].endswith("-"):
            pieces[-1] = pieces[-1][:-1]

    # the text may end in a number
    if pending_total is not None:
        pieces.append(str(pending_total))

    return "".join(pieces)
# prepare syllabograms with numerals
# from Del Freo-Perna 2019 page 133
# each entry is (sound, normalised); normalised is the equivalent used for
# stem creation e.g. ra2 => ra
# TODO generate a-de-ra2 from stem a-de-r-
_NUMERAL_SYLLABOGRAM_SOUNDS = {
    "a2": ("a", "a"),
    "a3": ("ai", "a"),
    "pu2": ("pu", "pu"),
    "ra2": ("rra", "ra"),  # TODO should rrai be possible too?
    "ro2": ("rro", "ro"),
    "ra3": ("rai", "ra"),
    "ta2": ("sta", "ta"),
}


def numeral_syllabograms_to_sound(word):
    """Replace numbered syllabograms in a hyphenated word by their sounds.

    Args:
        word: Transliterated word with syllables separated by hyphens,
            e.g. ``"a-de-ra2"``. Matching is case-sensitive.

    Returns:
        A dict with two strings: ``"sound"``, where each numbered syllabogram
        is replaced by its sound value, and ``"normalised"``, where it is
        replaced by the plain syllable without the numeral. Syllables that are
        not in the table, and all hyphens, are kept as they are.
    """
    sound_parts = []
    normalised_parts = []
    # a plain split/join on the literal hyphen keeps every hyphen in place,
    # including leading, trailing and doubled ones
    for syllabogram in word.split("-"):
        sound, normalised = _NUMERAL_SYLLABOGRAM_SOUNDS.get(
            syllabogram, (syllabogram, syllabogram)
        )
        sound_parts.append(sound)
        normalised_parts.append(normalised)
    return {
        "sound": "-".join(sound_parts),
        "normalised": "-".join(normalised_parts),
    }
if __name__ == "__main__":
    # Demo run: for each converter, log what is being converted and print the
    # result. Entries are (log message, converter, sample input), in run order.
    demonstrations = (
        ("Converting 𐀐𐀩𐀪𐀡 to Latin letters:", linear_b_to_latin, "𐀐𐀩𐀪𐀡"),
        ("Converting Po-ti-ni-a to Linear B:", latin_to_linear_b, "Po-ti-ni-a"),
        (
            "Converting syllabograms with numerals in them to just sounds:",
            numeral_syllabograms_to_sound,
            "a-de-ra2",
        ),
    )
    for message, convert, sample in demonstrations:
        logger.info(message)
        print(convert(sample))