-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_from_ngram.py
145 lines (125 loc) · 4.64 KB
/
generate_from_ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
'''
William Bowers
generate_from_ngram.py
takes in a language model (output from build_ngram_model.py) and generates uni/bi/trigram sentences
3/25/2019
'''
import sys, random, time
# Command-line arguments: path of the n-gram model to read, then the path of
# the text file that the generated sentences are written to.
input_file, output_file = sys.argv[1], sys.argv[2]

# Load the whole model into memory as a list of lines (split on '\n' only).
with open(input_file, 'r') as text_file:
    lines = text_file.read().split('\n')

# Locate the section marker lines of the model; the n-gram entries of each
# order lie between its own marker and the next one.  (For dickens_model.txt
# the original author recorded these indices as 5, 43,885, 613,523 and
# 2,217,785.)
unigram_start, bigram_start, trigram_start, end = (
    lines.index(marker)
    for marker in ('\\1-grams:', '\\2-grams:', '\\3-grams:', '\\end\\')
)
#UNIGRAMS
'''
Process:
- start with <s>
- generate random float to use as a target number
- iterate through the list of n-grams, adding their probabilities together until the random target number is met.
- add that word to sentence until </s> is generated
- make sure it starts on the start tag <s>
- another start tag <s> cannot exist until there is an end tag </s>
'''
# generating unigram based on random float
def generate_unigram(random):
    """Pick the unigram whose cumulative probability first reaches `random`.

    Walks the unigram section of the module-level `lines` list in file order,
    summing the value in field 1 of each space-separated entry, and returns
    field 3 of the first entry at which the running sum is >= `random`.

    Args:
        random: target value to reach (callers pass `random.random()`).
            NOTE: this parameter shadows the `random` module inside the
            function body.

    Returns:
        The selected word, or None when every entry has been consumed and the
        running sum still falls short of the target.
    """
    probability_sum = 0.0
    # Scan all the way up to the '\2-grams:' marker and skip lines that are
    # not entries, instead of assuming that exactly one blank separator line
    # precedes the marker (the old `bigram_start - 1` bound dropped the last
    # unigram when no separator was present and crashed on stray blank lines).
    for line in lines[unigram_start + 1:bigram_start]:
        words = line.split(' ')
        if len(words) < 4:
            continue
        probability_sum += float(words[1])
        if probability_sum >= random:
            return words[3]
    # All unigrams were iterated over without satisfying the target.
    return None
# stringing together unigrams to make a sentence
def unigram_sentence():
    """Build one sentence by repeatedly sampling independent unigrams.

    Words are drawn with `generate_unigram(random.random())` until '</s>' is
    drawn.  A drawn '<s>' is discarded so that a second start tag never
    appears inside the sentence.  The loop only ends once '</s>' is drawn, so
    the model must contain a '</s>' unigram.

    Returns:
        str: '<s> ' followed by every kept word, each followed by a single
        space, ending with '</s> ' (the trailing space is intentional and
        matches the historical output format).
    """
    tokens = ['<s>']
    while tokens[-1] != '</s>':
        current_word = generate_unigram(random.random())
        # None means the cumulative probabilities never reached the random
        # target (e.g. they sum to slightly less than 1.0).  Draw again
        # instead of crashing on `None + ' '`.
        if current_word is None or current_word == '<s>':
            continue
        tokens.append(current_word)
    return ' '.join(tokens) + ' '
#BIGRAMS
'''
Process:
- start with <s>
- generate random float to use as a target number
- iterate through the list of n-grams, adding the probabilities of the 2nd word (given <s>) in each bigram together until the random target number is met.
- when target is met, add word to sentence, then repeat with bigrams that start with that generated word
- make sure it starts on the start tag <s>
- another start tag <s> cannot exist until there is an end tag </s>
'''
#how do i make the condition read that <s> is given
# generates the second word of bigram, given the first
def generate_bigram(random, reduced_lines):
    """Select the second word of a bigram by cumulative probability.

    Args:
        random: target value the running probability sum has to reach
            (shadows the `random` module inside this function).
        reduced_lines: bigram entries to choose from, visited in order.  Each
            entry is split on single spaces; field 1 is parsed as the
            probability and field 4 is the word that gets returned.

    Returns:
        Field 4 of the first entry at which the running sum is >= `random`,
        or None if the entries run out before that happens.
    """
    cumulative = 0.0
    chosen = None
    for entry in reduced_lines:
        fields = entry.split(' ')
        cumulative += float(fields[1])
        if cumulative >= random:
            chosen = fields[4]
            break
    return chosen
# generate sentence
def bigram_sentence():
    """Build one sentence where each word is sampled given the previous word.

    Starts from '<s>' and repeatedly asks `generate_bigram` for a successor of
    the most recent word until '</s>' is produced.  A sampled '<s>' or a None
    result (random target not reached) is discarded and the draw is repeated.

    Returns:
        list[str]: the tokens of the sentence, starting with '<s>' and ending
        with '</s>'.
    """
    # Index the bigram section once, keyed by first word, instead of
    # re-slicing and re-splitting the entire section for every generated
    # word.  File order is preserved inside each group, so sampling behaves
    # the same for a given random target.  Lines too short to be a bigram
    # entry (e.g. the blank separator before the next marker) are skipped.
    continuations = {}
    for line in lines[bigram_start + 1:trigram_start]:
        fields = line.split(' ')
        if len(fields) > 4:
            continuations.setdefault(fields[3], []).append(line)

    sentence = ['<s>']
    while sentence[-1] != '</s>':
        reduced_lines = continuations.get(sentence[-1])
        if not reduced_lines:
            # Dead end: no bigram starts with the last word.  Close the
            # sentence instead of retrying forever on an empty candidate set.
            sentence.append('</s>')
            continue
        next_word = generate_bigram(random.random(), reduced_lines)
        if next_word is not None and next_word != '<s>':
            sentence.append(next_word)
    return sentence
#TRIGRAMS
'''
Process:
- same as bigram
- use the bigram generator to find the first word after the <s>
'''
def generate_trigram(random, reduced_lines):
    """Select the third word of a trigram by cumulative probability.

    Args:
        random: target value the running probability sum has to reach
            (shadows the `random` module inside this function).
        reduced_lines: trigram entries to choose from, visited in order.  Each
            entry is split on single spaces; field 1 is parsed as the
            probability and field 5 is the word that gets returned.

    Returns:
        Field 5 of the first entry at which the running sum is >= `random`,
        or None if the entries run out before that happens.
    """
    total = 0.0
    # Entries are split lazily, one at a time, as the scan advances.
    for columns in (row.split(' ') for row in reduced_lines):
        total += float(columns[1])
        if total >= random:
            return columns[5]
    # The target was never reached.
    return None
def trigram_sentence():
    """Build one sentence where each word is sampled given the previous two.

    The first word after '<s>' has no two-word history, so it is drawn with
    `generate_bigram` from the bigrams that start with '<s>'.  Every later
    word is drawn with `generate_trigram` from the trigrams whose first two
    words equal the last two tokens of the sentence.  A sampled '<s>' or a
    None result (random target not reached) is discarded and the draw is
    repeated.

    Returns:
        list[str]: the tokens of the sentence, starting with '<s>' and ending
        with '</s>'.
    """
    sentence = ['<s>']

    # --- first word: bigram continuation of '<s>' ---
    # Lines too short to be a bigram entry (e.g. a blank separator) are
    # skipped rather than indexed blindly.
    opening_lines = []
    for line in lines[bigram_start + 1:trigram_start]:
        fields = line.split(' ')
        if len(fields) > 4 and fields[3] == '<s>':
            opening_lines.append(line)
    if not opening_lines:
        # Nothing can follow '<s>'; return an empty sentence instead of
        # retrying forever on an empty candidate set.
        sentence.append('</s>')
        return sentence
    while len(sentence) < 2:
        next_word = generate_bigram(random.random(), opening_lines)
        if next_word is not None and next_word != '<s>':
            sentence.append(next_word)
    if sentence[-1] == '</s>':
        # The sentence ended immediately; no need to index the trigrams.
        return sentence

    # --- remaining words: trigram continuation of the last two tokens ---
    # Index the trigram section once, keyed by its first two words, instead
    # of splitting every line twice for each generated word.  File order is
    # preserved inside each group.
    continuations = {}
    for line in lines[trigram_start + 1:end]:
        fields = line.split(' ')
        if len(fields) > 5:
            continuations.setdefault((fields[3], fields[4]), []).append(line)

    while sentence[-1] != '</s>':
        reduced_lines = continuations.get((sentence[-2], sentence[-1]))
        if not reduced_lines:
            # Dead end: no trigram extends the current two-word history.
            sentence.append('</s>')
            continue
        next_word = generate_trigram(random.random(), reduced_lines)
        if next_word is not None and next_word != '<s>':
            sentence.append(next_word)
    return sentence
# call functions and write outputs to file
def writer():
    """Write five unigram, five bigram and five trigram sentences to `output_file`.

    Each group is preceded by its heading line, one sentence is written per
    line, and after each group the seconds elapsed since the start of the
    run (cumulative, not per group) are printed to stdout.
    """
    with open(output_file, 'w') as output:
        begin = time.time()
        # (text written before the group, sentence producer, timing label)
        sections = (
            ('\\1 grams:\n', unigram_sentence, 'Unigrams:'),
            ('\n\\2 grams:\n', lambda: ' '.join(bigram_sentence()), 'Bigrams:'),
            ('\n\\3 grams:\n', lambda: ' '.join(trigram_sentence()), 'Trigrams:'),
        )
        for heading, make_sentence, label in sections:
            output.write(heading)
            for _ in range(5):
                output.write(make_sentence() + '\n')
            print(label, time.time() - begin)


writer()