import random
import wikipedia as wiki
import argparse
from decimal import Decimal


class markovModel(object):
"""
Author: Vishakh Gopu
This class implements a Markov model of order two that
can be trained on text extracted from wikipedia or from file.
The path to the training text ,path to testing text ,
whether to train from cache, can all be specified as command line
arguments. The model assumes text is produced character by character.
"""
    def __init__(self, wiki_page, corpus_path, passage_path, from_cache, chars_words):
        """
        Initializes the necessary paths and class fields to hold
        the corpus text and the test passage. Also deals with loading
        from file versus from a wikipedia page.
        """
        # The class fields
        self.words = chars_words
        self.wiki_page = wiki_page
        self.corpus_path = corpus_path
        self.passage_path = passage_path
        # From cache (file) or freshly downloaded from wikipedia
        if from_cache == 1:
            # Download the wikipedia text and save it to corpus_path first
            self.wiki_init()
        self.corpus = self.load_corpus()
        self.test_passage = self.load_test_sample()
        self.model = self.train()
    def wiki_init(self):
        """
        Creates the training text from a wikipedia page and saves it to
        file. It extracts the given page and every linked page.
        """
        phil = wiki.page(self.wiki_page)
        content = phil.content
        links = phil.links
        # Get text from all linked pages
        for current_link in links:
            try:
                current_page = wiki.page(current_link)
                content += current_page.content
            except Exception:
                # Skip pages that fail to load (disambiguation, missing, etc.)
                continue
        content = content.encode('utf-8').strip()
        f = open(self.corpus_path, 'w')
        f.write(content)
        f.close()
    def load_test_sample(self):
        """
        Loads the passage used to test the model and splits it
        into a list of characters or words.
        """
        f = open(self.passage_path, 'r')
        test_sample = f.read()
        f.close()
        if self.words == 0:
            test_sample = list(test_sample)
        elif self.words == 1:
            test_sample = test_sample.split()
        return test_sample
    def load_corpus(self):
        """
        Loads the training text from file and splits it into
        a list of characters or words.
        """
        f = open(self.corpus_path, 'r')
        corpus = f.read()
        f.close()
        # Split into characters
        if self.words == 0:
            corpus = list(corpus)
        # Split into words
        elif self.words == 1:
            corpus = corpus.split()
        return corpus
    def train(self):
        """
        Trains the model on the training text (corpus). Builds a
        dictionary that maps each pair of consecutive tokens to the
        list of tokens that follow that pair in the corpus, which
        encodes the transition behaviour of the order-two model.
        """
        corpus = self.corpus
        model = {}
        for i in range(len(corpus) - 2):
            first = corpus[i]
            second = corpus[i + 1]
            third = corpus[i + 2]
            if (first, second) in model:
                model[(first, second)].append(third)
            else:
                model[(first, second)] = [third]
        return model
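    # For example, training on the character sequence "abcabd" yields
    # {('a', 'b'): ['c', 'd'], ('b', 'c'): ['a'], ('c', 'a'): ['b']}:
    # successors that occur more often appear more times in the list, so
    # random.choice picks them proportionally more often during generation.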
    def generate_sequence(self):
        """
        Generates a random text of 200 tokens and prints it.
        """
        model = self.model
        key1 = random.choice(model.keys())
        first = key1[0]
        second = key1[1]
        markov_text = ""
        for i in range(200):
            key = (first, second)
            if key not in model:
                # Dead end: the pair only occurs at the very end of the
                # corpus, so restart from a random pair.
                key = random.choice(model.keys())
                first, second = key
            possibilities = model[key]
            next_token = random.choice(possibilities)
            if self.words == 1:
                markov_text += " " + next_token
            elif self.words == 0:
                markov_text += next_token
            first = second
            second = next_token
        print markov_text
    def get_bigram_probs(self):
        """
        Get the frequency of each bigram in the corpus.
        """
        corpus = self.corpus
        bigram_probs = {}
        for i in range(len(corpus) - 1):
            first = corpus[i]
            second = corpus[i + 1]
            key = (first, second)
            if key in bigram_probs:
                bigram_probs[key] += 1
            else:
                bigram_probs[key] = 1
        bigram_probs["unk"] = 0
        return bigram_probs
    def get_trigram_probs(self):
        """
        Get the frequency of each trigram in the corpus.
        """
        corpus = self.corpus
        trigram_probs = {}
        for i in range(len(corpus) - 2):
            first = corpus[i]
            second = corpus[i + 1]
            third = corpus[i + 2]
            key = (first, second, third)
            if key in trigram_probs:
                trigram_probs[key] += 1
            else:
                trigram_probs[key] = 1
        trigram_probs["unk"] = 0
        return trigram_probs
    def probability_of_passage(self):
        """
        Estimates the probability of the test passage being generated
        by the model by multiplying together the probability of each
        trigram in the passage.
        """
        bigram = self.get_bigram_probs()
        trigram = self.get_trigram_probs()
        passage = self.test_passage
        chain_probability = Decimal(1)
        for i in range(len(passage) - 2):
            first = passage[i]
            second = passage[i + 1]
            third = passage[i + 2]
            bigram_key = (first, second)
            trigram_key = (first, second, third)
            # Deal with unknown n-grams
            if trigram_key not in trigram:
                trigram["unk"] += 1
                trigram_key = "unk"
            if bigram_key not in bigram:
                bigram["unk"] += 1
                bigram_key = "unk"
            count_bigram = bigram[bigram_key]
            count_trigram = trigram[trigram_key]
            trigram_prob = Decimal(count_trigram) / Decimal(count_bigram)
            chain_probability = chain_probability * trigram_prob
        print chain_probability
def main():
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus_path', type=str, default="corpus.txt",
                        help='path to the corpus text content', nargs='?')
    parser.add_argument('passage_path', type=str, default="passage.txt",
                        help='path to the test passage', nargs='?')
    parser.add_argument('from_cache', type=int, default=0,
                        help='0: from file, 1: from wikipedia', nargs='?')
    parser.add_argument('wiki_page', type=str, default="Philosophy",
                        help='the wikipedia page to extract from', nargs='?')
    parser.add_argument('word_or_char', type=int, default=1,
                        help='generate words: 1, characters: 0', nargs='?')
    args = parser.parse_args()
    ### TOP LEVEL ###
    markov = markovModel(args.wiki_page, args.corpus_path,
                         args.passage_path, args.from_cache, args.word_or_char)
    print "\nHERE IS A RANDOMLY GENERATED PASSAGE\n"
    markov.generate_sequence()
    if args.word_or_char == 0:
        print "\nHERE IS THE PROBABILITY OF GIVEN PASSAGE"
        markov.probability_of_passage()


if __name__ == "__main__":
    main()
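
# The class can also be used directly from Python, e.g. (a sketch; assumes
# the default corpus.txt and passage.txt files exist):
#   markov = markovModel("Philosophy", "corpus.txt", "passage.txt", 0, 0)
#   markov.generate_sequence()
#   markov.probability_of_passage()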