forked from pub12/MarkovText
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarkov.py
85 lines (65 loc) · 2.26 KB
/
markov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import ast
import pprint
import random
import re

import nltk
class Markov(object):
    """Word-level Markov chain text generator.

    The transition table ``self.table`` is a dict that maps a tuple of
    ``order`` consecutive words to the list of words observed right after
    that tuple (duplicates are kept, so frequent followers are picked more
    often). The special key ``'#BEGIN#'`` maps to a list of sentence
    openings, each a list holding the first ``order`` words of a sentence.
    """

    # Table key under which sentence openings are stored.
    _BEGIN_KEY = '#BEGIN#'

    def __init__(self, order=2, dictFile="", maxWordInSentence=20):
        """Create an empty model, optionally pre-loading a saved table.

        Args:
            order: Number of consecutive words that form a table key.
            dictFile: Path of a table previously written by ``outputDict``;
                an empty string (the default) loads nothing.
            maxWordInSentence: Maximum number of words ``genText`` appends
                after the opening words.
        """
        self.table = {}
        self.inputLineCount = 0
        self.inputWordCount = 0
        self.setOrder(order)
        self.setMaxWordInSentence(maxWordInSentence)
        if dictFile:
            self.loadDictionary(dictFile)

    def setOrder(self, order=2):
        """Set the number of words used as a table key."""
        self.order = order

    def loadDictionary(self, dictFile):
        """Replace the current table with the one stored in *dictFile*.

        The file must contain a Python literal, as produced by
        ``outputDict``.

        Raises:
            OSError: If the file cannot be opened.
            ValueError, SyntaxError: If the content is not a plain literal.
        """
        with open(dictFile, 'r') as inf:
            # SECURITY: this used to be eval(), which executes any code
            # found in the file. literal_eval only accepts literals, which
            # is all that outputDict (pprint) ever writes.
            self.table = ast.literal_eval(inf.read())

    def readFile(self, filename, fileEncoding="utf-8"):
        """Train the model on the whole content of a text file.

        Args:
            filename: Path of the text file to read.
            fileEncoding: Encoding used to decode the file.
        """
        with open(filename, "r", encoding=fileEncoding) as file:
            strLine = " ".join(file)
        self.processSection(strLine)

    def processSection(self, line):
        """Split *line* into sentences and add them to the table.

        Sentences come from ``nltk.sent_tokenize`` and words from
        ``str.split``. Each sentence increments ``inputLineCount`` by one
        and ``inputWordCount`` by its number of words.
        """
        order = self.order
        for sentence in nltk.sent_tokenize(line):
            self.inputLineCount += 1
            tokens = sentence.split()
            # Count every word; the opening words used to be skipped.
            self.inputWordCount += len(tokens)

            # Remember how this sentence starts so genText can seed from it.
            self.table.setdefault(self._BEGIN_KEY, []).append(tokens[:order])

            # Slide a window of `order` words over the sentence and record
            # the word that follows each window.
            for end in range(order, len(tokens)):
                state = tuple(tokens[end - order:end])
                self.table.setdefault(state, []).append(tokens[end])

    def setMaxWordInSentence(self, maxWordInSentence):
        """Set how many words genText may append after the opening words."""
        self.maxWordInSentence = maxWordInSentence

    def genText(self):
        """Generate one random sentence from the table.

        Starts from a randomly chosen sentence opening, then repeatedly
        appends a random follower of the last ``order`` words. Stops after
        ``maxWordInSentence`` appended words or when the current words have
        no recorded follower. The table is not modified.

        Returns:
            The generated words joined by single spaces.

        Raises:
            KeyError: If the table holds no sentence openings.
        """
        key = list(random.choice(self.table[self._BEGIN_KEY]))
        words = list(key)
        for _ in range(self.maxWordInSentence):
            # Plain lookup: setdefault() used to insert "" for every dead
            # end, polluting the table that is later saved or trained on.
            followers = self.table.get(tuple(key))
            if not followers:
                break
            nextWord = random.choice(followers)
            words.append(nextWord)
            # Drop the oldest word and push the new one onto the key.
            key = key[1:] + [nextWord]
        return " ".join(words)

    def getLineCount(self):
        """Return the number of sentences processed so far."""
        return self.inputLineCount

    def getWordCount(self):
        """Return the number of words processed so far."""
        return self.inputWordCount

    def outputDict(self, filename):
        """Pretty-print the table to *filename*, overwriting the file."""
        # The context manager guarantees the file is flushed and closed.
        with open(filename, 'w') as markovDictFile:
            pprint.pprint(self.table, markovDictFile)