-
Notifications
You must be signed in to change notification settings - Fork 1
/
06_get_frequency_ratios.py
27 lines (22 loc) · 1.21 KB
/
06_get_frequency_ratios.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Compare per-word relative frequencies between a Wikipedia table and a
# Common Voice table, both stored as "freq_abs,freq,word" rows.
#
# Every Wikipedia word gets a score, the scored list is written to
# OUTPUT_PATH (highest score first), and three summary counts are printed.

WIKIPEDIA_PATH = "01_frequencies_wikipedia.csv"
COMMON_VOICE_PATH = "05_frequencies_common_voice.csv"
OUTPUT_PATH = "07_common_voice_word_representation.csv"


def _load_frequencies(path):
    """Read a ``freq_abs,freq,word`` table into a ``{word: freq}`` dict.

    Args:
        path: Path of the comma-separated file to read.

    Returns:
        A dict mapping each word to its relative frequency as a float.
        If a word appears on several rows, the last row wins.

    Raises:
        ValueError: If a non-blank row has fewer than three columns or
            its second column is not a number.
    """
    frequencies = {}
    # Explicit UTF-8 so non-ASCII words decode the same on every platform
    # instead of depending on the locale's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # The word is the last column, so splitting at most twice keeps
            # any comma that is part of the word itself.
            _freq_abs, freq, word = line.split(",", 2)
            frequencies[word] = float(freq)
    return frequencies


def _additions_ratio(target, current):
    """Return how many copies of a word must be added, per existing token.

    If a word has relative frequency ``current`` in a dataset of N tokens,
    adding k copies gives ``(current * N + k) / (N + k)``.  Setting that
    equal to ``target`` and solving yields
    ``k / N = (current - target) / (target - 1)``.

    A positive result means the word is underrepresented; a negative one
    means it is already more frequent than ``target``.

    Args:
        target: Relative frequency to reach (the Wikipedia value).
        current: Relative frequency in the dataset being compared.

    Returns:
        The ratio ``k / N`` as a float.  When ``target`` is exactly 1 the
        formula is undefined; ``0.0`` is returned if ``current`` already
        reaches it and ``inf`` otherwise.
    """
    denominator = target - 1
    if denominator == 0:
        # No finite number of additions can bring a frequency up to 1.
        return 0.0 if current >= target else float("inf")
    return (current - target) / denominator


def main():
    """Score every Wikipedia word, write the report and print the counts."""
    wikipedia = _load_frequencies(WIKIPEDIA_PATH)
    commonvoice = _load_frequencies(COMMON_VOICE_PATH)

    # Words missing from Common Voice count as frequency 0.
    ratios = [
        (word, _additions_ratio(target, commonvoice.get(word, 0)))
        for word, target in wikipedia.items()
    ]
    # Most underrepresented words first.
    ratios.sort(key=lambda pair: pair[1], reverse=True)

    # Output columns: ratio, Wikipedia frequency, Common Voice frequency, word.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for word, ratio in ratios:
            f.write(
                "{:.10f},{:.10f},{:.10f},{}\n".format(
                    ratio, wikipedia[word], commonvoice.get(word, 0), word
                )
            )

    overrepresented = sum(1 for _word, ratio in ratios if ratio < 0)
    notrepresented = sum(1 for word, _ratio in ratios if word not in commonvoice)
    extra = sum(1 for word in commonvoice if word not in wikipedia)

    print("OVERREPRESENTED: {}".format(overrepresented))
    print("NOT REPRESENTED: {}".format(notrepresented))
    print("NOT IN WIKIPEDIA: {}".format(extra))


if __name__ == "__main__":
    main()