-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_ngram_model.py
130 lines (104 loc) · 3.85 KB
/
build_ngram_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
import nltk
import math
# Module-level accumulator: open_files() appends one "<s> ... </s>" string per
# input line to this list, and the n-gram counting code below reads from it.
sentences = []
def open_files():
    """Load the training text into the module-level ``sentences`` list.

    Every line of ``dickens_training.txt`` is lower-cased, has its newline
    removed and is wrapped as ``"<s> " + line + " </s>"`` before being
    appended to ``sentences``.

    Returns:
        The module-level ``sentences`` list itself (not a copy).
    """
    with open("dickens_training.txt", "r") as corpus:
        for raw_line in corpus:
            body = raw_line.replace("\n", "").lower()
            sentences.append("<s> " + body + " </s>")
    return sentences
# Read the corpus when the script runs. open_files() fills the module-level
# `sentences` list and returns that same list, so `formatted_text` is just
# another name for `sentences`.
formatted_text = open_files()
# Unigrams
def unigrams(sentences):
    """Count how often each token occurs across all sentences.

    Tokens are obtained with ``sentence.split(" ")``, i.e. splitting on single
    spaces only, so consecutive spaces produce empty-string tokens that are
    counted like any other token.

    Args:
        sentences: iterable of strings, one per sentence.

    Returns:
        A ``(counts, total)`` tuple: ``counts`` maps each token to its number
        of occurrences (keys in first-seen order) and ``total`` is the number
        of tokens seen overall.
    """
    uni_dictionary = {}
    uni_word_total = 0
    for sentence in sentences:
        for word in sentence.split(" "):
            # One update covers both the first sighting and repeats.
            uni_dictionary[word] = uni_dictionary.get(word, 0) + 1
            uni_word_total += 1
    return uni_dictionary, uni_word_total
def reformat_list(sentences):
    """Flatten a list of sentence strings into one list of tokens.

    Each sentence is split on single spaces and the pieces are concatenated
    in order, so the result runs straight across sentence boundaries.

    Args:
        sentences: iterable of strings.

    Returns:
        A new list containing every token of every sentence, in order.
    """
    tokens = []
    for line in sentences:
        tokens.extend(line.split(" "))
    return tokens
# Flatten every sentence into a single token stream and pair up adjacent
# tokens. Because the stream runs across sentence boundaries, pairs that start
# with "</s>" also appear; bigramifier() skips those.
# NOTE(review): nltk.bigrams is expected to return a one-shot iterator, so
# `bigrams` can only be consumed once - confirm against the installed NLTK.
new_sentences = reformat_list(sentences)
bigrams = nltk.bigrams(new_sentences)
def bigramifier(bigrams):
    """Tally bigram counts, skipping pairs that start with ``"</s>"``.

    Args:
        bigrams: iterable of ``(first, second)`` token pairs.

    Returns:
        A ``(table, types, tokens)`` tuple where ``table[first][second]`` is
        the number of times the pair was seen, ``types`` is the number of
        distinct pairs and ``tokens`` is the number of pairs counted in total.
    """
    bi_dictionary = {}
    bi_tally = 0
    bi_everything = 0
    for first, second in bigrams:
        # A pair beginning with the end marker straddles two sentences.
        if first == "</s>":
            continue
        bi_everything += 1
        followers = bi_dictionary.setdefault(first, {})
        if second not in followers:
            bi_tally += 1
        followers[second] = followers.get(second, 0) + 1
    return bi_dictionary, bi_tally, bi_everything
# Build the trigram stream from the same flattened token list; it is consumed
# further down by trigraminator().
trigrams = nltk.trigrams(new_sentences)
# Tally the bigrams now: nested count table, number of distinct bigrams, and
# total number of bigrams counted.
(bi_result, bi_types, bi_tokens) = bigramifier(bigrams)
def trigraminator(trigrams):
    """Tally trigram counts, keyed by the first two words joined with a space.

    A trigram is skipped when its first word is ``"</s>"``, its second word is
    ``"</s>"`` or ``"<s>"``, or its third word is ``"<s>"``.

    Args:
        trigrams: iterable of ``(first, second, third)`` token triples.

    Returns:
        A ``(table, types, tokens)`` tuple where
        ``table[first + " " + second][third]`` is the number of times the
        triple was seen, ``types`` is the number of distinct triples and
        ``tokens`` is the number of triples counted in total.
    """
    tri_dictionary = {}
    tri_tally = 0
    tri_everything = 0
    for first, second, third in trigrams:
        if first == "</s>" or second in ("</s>", "<s>") or third == "<s>":
            continue
        tri_everything += 1
        # Build the two-word history key only for trigrams that are kept.
        history = first + " " + second
        followers = tri_dictionary.setdefault(history, {})
        if third not in followers:
            tri_tally += 1
        followers[third] = followers.get(third, 0) + 1
    return tri_dictionary, tri_tally, tri_everything
# Write the model file: a "\data\" header with type/token totals per n-gram
# order, then one section per order. Every entry line has the form
# "<count> <probability> <log10 probability> <n-gram>".
# The `with` block guarantees the file is flushed and closed even on error.
with open("dickens_model.txt", "w") as output:
    output.write("\\data\\ \n")
    (uni_result, uni_total) = unigrams(sentences)
    uni_types = len(uni_result)
    output.write("ngram 1: types= " + str(uni_types) + " tokens= " + str(uni_total) + "\n")
    output.write("ngram 2: types= " + str(bi_types) + " tokens= " + str(bi_tokens) + "\n")
    (tri_result, tri_types, tri_tokens) = trigraminator(trigrams)
    output.write("ngram 3: types= " + str(tri_types) + " tokens= " + str(tri_tokens) + "\n")

    # Unigram probability: count / total number of tokens.
    output.write("\\1-grams \n")
    for key, count in uni_result.items():
        prob = float(count) / uni_total
        log_prob = math.log10(prob)
        output.write(str(count) + " " + str(prob) + " " + str(log_prob) + " " + str(key) + "\n")

    # Bigram probability: count(w1 w2) / unigram count(w1).
    output.write("\\2-grams \n")
    for key, followers in bi_result.items():
        for key2, bicount in followers.items():
            biprob = float(bicount) / uni_result[key]
            bilog_prob = math.log10(biprob)
            bikeys = str(key + " " + key2)
            output.write(str(bicount) + " " + str(biprob) + " " + str(bilog_prob) + " " + bikeys + "\n")

    # Trigram probability: count(w1 w2 w3) / bigram count(w1 w2).
    output.write("\\3-grams \n")
    for key, followers in tri_result.items():
        # The key is "w1 w2" joined by exactly one space, and tokens come from
        # split(" ") so they never contain a space themselves. Split on that
        # single space rather than on arbitrary whitespace, so empty tokens
        # and tokens containing tabs map back to the right bigram entry.
        first_word, second_word = key.split(" ", 1)
        history_count = bi_result[first_word][second_word]
        for key2, tricount in followers.items():
            triprob = float(tricount) / history_count
            trilog_prob = math.log10(triprob)
            trikeys = str(key + " " + key2)
            output.write(str(tricount) + " " + str(triprob) + " " + str(trilog_prob) + " " + trikeys + "\n")