import random
import wikipedia as wiki
import argparse
from decimal import Decimal


class markovModel(object):
"""
Author: Vishakh Gopu
This class implements a Markov model of order two that
can be trained on text extracted from wikipedia or from file.
The path to the training text ,path to testing text ,
whether to train from cache, can all be specified as command line
arguments. The model assumes text is produced character by character.
"""
    def __init__(self, wiki_page, corpus_path, passage_path, from_cache, chars_words):
        """
        Initializes the necessary paths and class fields to hold
        the corpus text and the test passage. Also deals with loading
        from file versus from a wikipedia page.
        """
        # The class fields
        self.words = chars_words
        self.wiki_page = wiki_page
        self.corpus_path = corpus_path
        self.passage_path = passage_path
        # From cache (file) or freshly downloaded from wikipedia
        if from_cache == 1:
            # Download the wikipedia text and save it to corpus_path first
            self.wiki_init()
        self.corpus = self.load_corpus()
        self.test_passage = self.load_test_sample()
        self.model = self.train()
    def wiki_init(self):
        """
        Creates the training text from a wikipedia page and saves it to
        file. It extracts the given page and every linked page.
        """
        phil = wiki.page(self.wiki_page)
        content = phil.content
        links = phil.links
        # Get text from all linked pages
        for current_link in links:
            try:
                current_page = wiki.page(current_link)
                content += current_page.content
            except Exception:
                # Skip pages that fail to load (disambiguation, missing, etc.)
                continue
        content = content.encode('utf-8').strip()
        f = open(self.corpus_path, 'w')
        f.write(content)
        f.close()
    def load_test_sample(self):
        """
        Loads the passage used to test the model and splits it
        into a list of characters or words.
        """
        f = open(self.passage_path, 'r')
        test_sample = f.read()
        f.close()
        if self.words == 0:
            test_sample = list(test_sample)
        elif self.words == 1:
            test_sample = test_sample.split()
        return test_sample
    def load_corpus(self):
        """
        Loads the training text from file and splits it into
        a list of characters or words.
        """
        f = open(self.corpus_path, 'r')
        corpus = f.read()
        f.close()
        # Split into characters
        if self.words == 0:
            corpus = list(corpus)
        # Split into words
        elif self.words == 1:
            corpus = corpus.split()
        return corpus
    def train(self):
        """
        Trains the model on the training text (corpus). Builds a
        dictionary that maps each pair of consecutive tokens to the
        list of tokens that follow that pair in the corpus, which
        encodes the transition behaviour of the order-two model.
        """
        corpus = self.corpus
        model = {}
        for i in range(len(corpus) - 2):
            first = corpus[i]
            second = corpus[i + 1]
            third = corpus[i + 2]
            if (first, second) in model:
                model[(first, second)].append(third)
            else:
                model[(first, second)] = [third]
        return model
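    # For example, training on the character sequence "abcabd" yields
    # {('a', 'b'): ['c', 'd'], ('b', 'c'): ['a'], ('c', 'a'): ['b']}:
    # successors that occur more often appear more times in the list, so
    # random.choice picks them proportionally more often during generation.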
    def generate_sequence(self):
        """
        Generates a random text of 200 tokens and prints it.
        """
        model = self.model
        key1 = random.choice(model.keys())
        first = key1[0]
        second = key1[1]
        markov_text = ""
        for i in range(200):
            key = (first, second)
            if key not in model:
                # Dead end: the pair only occurs at the very end of the
                # corpus, so restart from a random pair.
                key = random.choice(model.keys())
                first, second = key
            possibilities = model[key]
            next_token = random.choice(possibilities)
            if self.words == 1:
                markov_text += " " + next_token
            elif self.words == 0:
                markov_text += next_token
            first = second
            second = next_token
        print markov_text
    def get_bigram_probs(self):
        """
        Get the frequency of each bigram in the corpus.
        """
        corpus = self.corpus
        bigram_probs = {}
        for i in range(len(corpus) - 1):
            first = corpus[i]
            second = corpus[i + 1]
            key = (first, second)
            if key in bigram_probs:
                bigram_probs[key] += 1
            else:
                bigram_probs[key] = 1
        bigram_probs["unk"] = 0
        return bigram_probs
    def get_trigram_probs(self):
        """
        Get the frequency of each trigram in the corpus.
        """
        corpus = self.corpus
        trigram_probs = {}
        for i in range(len(corpus) - 2):
            first = corpus[i]
            second = corpus[i + 1]
            third = corpus[i + 2]
            key = (first, second, third)
            if key in trigram_probs:
                trigram_probs[key] += 1
            else:
                trigram_probs[key] = 1
        trigram_probs["unk"] = 0
        return trigram_probs
    def probability_of_passage(self):
        """
        Estimates the probability of the test passage being generated
        by the model by multiplying together the probability of each
        trigram in the passage.
        """
        bigram = self.get_bigram_probs()
        trigram = self.get_trigram_probs()
        passage = self.test_passage
        chain_probability = Decimal(1)
        for i in range(len(passage) - 2):
            first = passage[i]
            second = passage[i + 1]
            third = passage[i + 2]
            bigram_key = (first, second)
            trigram_key = (first, second, third)
            # Deal with unknown n-grams
            if trigram_key not in trigram:
                trigram["unk"] += 1
                trigram_key = "unk"
            if bigram_key not in bigram:
                bigram["unk"] += 1
                bigram_key = "unk"
            count_bigram = bigram[bigram_key]
            count_trigram = trigram[trigram_key]
            trigram_prob = Decimal(count_trigram) / Decimal(count_bigram)
            chain_probability = chain_probability * trigram_prob
        print chain_probability
def main():
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus_path', type=str, default="corpus.txt",
                        help='path to the corpus text content', nargs='?')
    parser.add_argument('passage_path', type=str, default="passage.txt",
                        help='path to the test passage', nargs='?')
    parser.add_argument('from_cache', type=int, default=0,
                        help='0: from file, 1: from wikipedia', nargs='?')
    parser.add_argument('wiki_page', type=str, default="Philosophy",
                        help='the wikipedia page to extract from', nargs='?')
    parser.add_argument('word_or_char', type=int, default=1,
                        help='generate words: 1, characters: 0', nargs='?')
    args = parser.parse_args()
    ### TOP LEVEL ###
    markov = markovModel(args.wiki_page, args.corpus_path,
                         args.passage_path, args.from_cache, args.word_or_char)
    print "\nHERE IS A RANDOMLY GENERATED PASSAGE\n"
    markov.generate_sequence()
    if args.word_or_char == 0:
        print "\nHERE IS THE PROBABILITY OF GIVEN PASSAGE"
        markov.probability_of_passage()


if __name__ == "__main__":
    main()
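
# The class can also be used directly from Python, e.g. (a sketch; assumes
# the default corpus.txt and passage.txt files exist):
#   markov = markovModel("Philosophy", "corpus.txt", "passage.txt", 0, 0)
#   markov.generate_sequence()
#   markov.probability_of_passage()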