-
Notifications
You must be signed in to change notification settings - Fork 14
/
markov.py
70 lines (57 loc) · 2.05 KB
/
markov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from collections import defaultdict
from itertools import chain
from twython import Twython
import random
import re
import sys
import config
twitter = Twython(
config.TWITTER_CONSUMER_KEY,
config.TWITTER_CONSUMER_SECRET,
config.TWITTER_ACCESS_TOKEN,
config.TWITTER_ACCESS_SECRET
)
class MarkovChain(object):
def __init__(self, documents, **kwargs):
self.word_cache = defaultdict(list)
self.words = self.documents_to_words(documents)
self.word_size = len(self.words)
self.wordbase = self.wordbase()
def documents_to_words(self, documents):
"""Returns a list of words used in a given list of documents."""
words = []
for document in documents:
if document:
words.append(self.tokenize(document))
return list(chain.from_iterable(words))
def tokenize(self, document):
# don't want empty spaces
words = [w.strip() for w in document.split() if w.strip() != '']
return words
def yield_trigrams(self):
if len(self.words) < 3:
return
for i in range(len(self.words) - 3):
yield (self.words[i], self.words[i+1], self.words[i+2])
def wordbase(self):
for w1, w2, w3 in self.yield_trigrams():
self.word_cache[(w1, w2)].append(w3)
def generate_tweet(self, min_chars=100, max_chars=140):
seed = random.randint(0, len(self.words) - 3)
w1, w2 = self.words[seed], self.words[seed + 1]
tweet = ' '
# loop until it's a sentence
while tweet[-2] not in '.!?':
tweet += (w1 + ' ')
w1, w2 = w2, random.choice(self.word_cache[(w1, w2)])
# if it's too short or too long, try again
if len(tweet) < min_chars or len(tweet) > max_chars:
tweet = self.generate_tweet()
return tweet.strip()
def main():
with open(sys.argv[1]) as f:
text = [line for line in f]
tweet = MarkovChain(text).generate_tweet()
twitter.update_status(status=tweet)
if __name__ == '__main__':
main()