-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvertPronDict.py
124 lines (101 loc) · 6.69 KB
/
convertPronDict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import codecs
import pickle
import sys
from WordTranscriptionAndChanges import *
# Positional command-line arguments:
#   1. path of the original pronunciation dictionary (read further down in this script)
#   2. path of the tab-separated phone mapping file
#   3. path the intermediate pronunciation dictionary is written to
pronDict_original_filepath = sys.argv[1]
phone_mapping_filepath = sys.argv[2]
pronDict_intermediate_filepath = sys.argv[3]

# Read the phone_mapping file and create a dict object out of it.
# The phone_mapping dictionary has a simple structure of:
#   phone_mapping[original_segment] = intermediate_segment
phone_mapping = {}
with codecs.open(phone_mapping_filepath, 'r', 'utf-8') as phone_mapping_file:
    print("Will read the phone mapping file.")
    phone_lines = phone_mapping_file.read().splitlines()
    if not phone_lines:
        # Fail with an explicit message instead of an IndexError on phone_lines[0].
        print(f"Error, the phone mapping file `{phone_mapping_filepath}` is empty."
              f"\nMake sure all your lines are tab-separated with two columns.")
        raise ValueError('There is an error, check the log file.')
    # Drop a byte-order mark if the file was saved with one.
    phone_lines[0] = phone_lines[0].replace('\ufeff', "")
    for phone_line in phone_lines:
        # Each line must be exactly: original_phone <TAB> intermediate_phone
        phone_line = phone_line.split('\t')
        if len(phone_line) != 2:
            print(f"Error, the line `{phone_line}` in the phone mapping file is NOT tab-separated with two columns."
                  f"\nMake sure all your lines are tab-separated with two columns.")
            raise ValueError('There is an error, check the log file.')
        original_phone, intermediate_phone = phone_line
        # An original phone may be mapped only once; a repeated key is rejected
        # rather than silently overwritten.
        if original_phone in phone_mapping:
            print(f"Error, the phone {original_phone} appears multiple times in the phone mapping file."
                  f"\nMake sure all the phones from the original language are unique.")
            raise ValueError('There is an error, check the log file.')
        phone_mapping[original_phone] = intermediate_phone
# Read the original pronunciation dictionary that has the original transcriptions.
# Create a new dictionary with intermediate transcriptions by applying the custom phone mapping.
# Store information on every word's transcriptions in a dict called `wordTranscriptions`.
wordTranscriptions = {}
# The wordTranscriptions dict is made up of words whose transcriptions did or did not change.
# Each word maps to its own WordTranscription object.
# NOTE(review): the WordTranscription class comes from WordTranscriptionAndChanges and is not visible here;
# the description below is the original author's and should be confirmed against that module.
# A WordTranscription object contains the word's original transcriptions, intermediate transcriptions, and the list of segment changes
# e.g. wordTranscriptions = { "ըսաւ": obj0, "ասաց": obj1, "բառ": obj2, "րա": obj3, "ռա": obj4 }
# The word ըսաւ has an unchanged transcription form ["ə","s","ɑ","v"]
# The word ասաց has a single alteration from ["ɑ","s","ɑ","t͡sʰ"] to ["ɑ","s","ɑ","tʃ"] at index 3
# The word բառ has a single intermediate form with two alterations, from ["p", "ʏ", "r"] to ["p", "u", "ɾ"] at indexes 1,2
# The word րա has two intermediate forms, with zero or one alteration:
# from ["ɾ","ɑ"] to ["ɾ","ɑ"] with zero changes, or from ["ɻ","ɑ"] to ["j","ɑ"] with a change at index 0
# The word ռա has two identical intermediate forms, from ["ɾ","ɑ"] or ["r","ɑ"] to ["ɾ","ɑ"] at index 0
def convert_transcription_to_intermediate(original_transcription):
    """Convert an original transcription into its intermediate transcription.

    Each segment that is a key of the module-level ``phone_mapping`` dict is
    replaced by the value it maps to; every other segment is copied through
    unchanged. The index of every segment found in ``phone_mapping`` is
    recorded, whether or not the mapped value differs from the original.

    Args:
        original_transcription: sequence of segments of the original
            transcription.

    Returns:
        A ``TranscriptionChange`` built from the original transcription, the
        intermediate transcription, and the list of recorded indexes.
    """
    # Positions of the segments that have an entry in the phone mapping.
    mapped_positions = [
        position
        for position, segment in enumerate(original_transcription)
        if segment in phone_mapping
    ]
    # Mapped segments take their mapped value; the rest stay as they are.
    converted_segments = [
        phone_mapping.get(segment, segment)
        for segment in original_transcription
    ]
    return TranscriptionChange(original_transcription, converted_segments, mapped_positions)
print("Will read the original pronunciation dictionary and create an intermediate pronunciation dictionary")
# Read the original pronunciation dictionary and create a modified dictionary with English IPA symbols
print("Useful debugging info for conversion is printed into the convertPronDict.message.log")
# The printing code was taken from https://stackoverflow.com/a/2513511
# While the dictionary is processed, every print() goes to the log file instead of the console.
old_stdout = sys.stdout
log_file = codecs.open("convertPronDict.message.log", "w", 'utf-8')
sys.stdout = log_file
try:
    with codecs.open(pronDict_original_filepath, 'r', 'utf-8') as pronDict_original_file:
        with codecs.open(pronDict_intermediate_filepath, 'w', 'utf-8') as pronDict_intermediate_file:
            dictionary_lines = pronDict_original_file.read().splitlines()
            if not dictionary_lines:
                # Fail with an explicit message instead of an IndexError on dictionary_lines[0].
                print(f"\tError, the original pronunciation dictionary file `{pronDict_original_filepath}` is empty."
                      f"\nMake sure all your lines are tab-separated with two columns.")
                raise ValueError('There is an error, check the log file.')
            # Drop a byte-order mark if the file was saved with one.
            dictionary_lines[0] = dictionary_lines[0].replace('\ufeff', "")
            for dictionary_line in dictionary_lines:
                print(f"\tProcessing the following line from the original pronunciation dictionary: {dictionary_line}")
                # Each line must be exactly: word <TAB> space-separated transcription
                dictionary_line = dictionary_line.split('\t')
                # Validate the dictionary line itself (not a leftover variable from
                # the phone mapping loop), so malformed lines are reported here.
                if len(dictionary_line) != 2:
                    print(f"\tError, the line `{dictionary_line}` in the original pronunciation dictionary file is NOT tab-separated with two columns."
                          f"\nMake sure all your lines are tab-separated with two columns.")
                    raise ValueError('There is an error, check the log file.')
                word, original_transcription_string = dictionary_line[0].strip(), dictionary_line[1].strip()
                original_transcription = original_transcription_string.split(' ')
                transcriptionChange = convert_transcription_to_intermediate(original_transcription)
                intermediate_transcription_string = ' '.join(transcriptionChange.intermediate_transcription)
                # A word may occur on several lines (several pronunciations); all of
                # them are collected on the same WordTranscription object.
                if word not in wordTranscriptions:
                    print(f"\t\tThe word {word} is newly encountered")
                    newWordTranscription = WordTranscription(word)
                    print(f"\t\tIts original transcription {original_transcription_string} is converted to {intermediate_transcription_string}.")
                    newWordTranscription.add_transcription(transcriptionChange)
                    wordTranscriptions[word] = newWordTranscription
                else:
                    print(f"\t\tThe word {word} is previously encountered")
                    print(f"\t\tIts original transcription {original_transcription_string} is converted to {intermediate_transcription_string}.")
                    wordTranscriptions[word].add_transcription(transcriptionChange)
                pronDict_intermediate_file.write(f"{word}\t{intermediate_transcription_string}\n")
    print("The word transcriptions are as follows:")
    for wordTranscription in wordTranscriptions.values():
        print(str(wordTranscription))
finally:
    # Always hand stdout back to the console and flush/close the log, even
    # when a malformed line aborts the conversion.
    sys.stdout = old_stdout
    log_file.close()
# Persist the per-word bookkeeping; announce it only once it has been written.
with open('wordTranscriptions.pkl', 'wb') as f:
    pickle.dump(wordTranscriptions, f)
print("The pickle object `wordTranscriptions.pkl` is created. It is a Python dictionary that keeps track of the changes.")