# processTweetsNew.py
# crawl the tweets: filter those matching the keyword list, writing the raw
# matching tweet text to a per-day file (the labMT vectors around keywords
# are computed from these files in a later step)
# output with daily resolution
#
# NOTES
# uses the new 15-minute compressed format
#
# USAGE
# gzip -cd tweets.gz | python processTweetsNew.py 2014-01-01
#
# this reads tweets from stdin, matches them against the keyword list
# hardcoded in __main__ below, and saves the matching raw tweet text
# to rawtweets/2014-01-01.txt
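#
# NOTE: the open() call in __main__ writes into rawtweets/ without creating
# it, so that directory is assumed to already exist, e.g. via
#   mkdir -p rawtweets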
# stdlib pieces used by this filtering pass (the labMT scoring happens later)
from json import loads
import re
import sys
def tweetreader(tweettext, keyWords, g):
    """Write the tweet text to g, with newlines flattened to spaces,
    if any keyword appears as a whole word (case-insensitive)."""
    for keyword in keyWords:
        if re.search(r"\b%s\b" % re.escape(keyword), tweettext, flags=re.IGNORECASE):
            g.write(tweettext.replace('\n', ' '))
            g.write('\n')
            break  # write each tweet once, even when several keywords match
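# illustration of the word-boundary match above (a sketch, not part of the
# pipeline): r"\bclinton\b" with re.IGNORECASE matches "Clinton leads" but
# not "clintonites", since \b only matches at a word/non-word transition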
def gzipper(keyWords, outfile):
    """Read one JSON tweet per line from stdin, filtering by keyword."""
    for line in sys.stdin:
        try:
            tweet = loads(line)
        except ValueError:
            print("failed to load a tweet")
            continue  # skip malformed lines rather than reusing the last tweet
        text = tweet.get('text')
        if text:
            tweetreader(text, keyWords, outfile)
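# each stdin line is expected to hold one JSON-encoded tweet, e.g. (a
# minimal made-up record; real tweets carry many more fields):
#   {"id": 1, "created_at": "Wed Jan 01 00:00:00 +0000 2014", "text": "Clinton rallies in Iowa"}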
if __name__ == '__main__':
    # load the things
    outfile = sys.argv[1]
    # keyWords = ['climate']  # alternate keyword list, kept for reference
    keyWords = ['clinton', 'sanders', 'omalley', 'webb', 'chaffee', 'cruz',
                'rand paul', 'rubio', 'carson', 'fiorina', 'huckabee', 'santorum']
    with open('rawtweets/{0}.txt'.format(outfile), 'w') as g:
        gzipper(keyWords, g)
    print("complete")