twitter_utils.py
import time
from tweepy import API
from tweepy import OAuthHandler
from tweepy import TweepError
from tweepy import parsers
import file_utils as fx
import date_utils as dx
import apis


class TwitterManager(object):
    def __init__(self, **kwargs):
        """Set up Twitter authentication and build the API client.

        Credentials are read from apis.twitter_api() and the resulting
        tweepy.API instance is stored on self.api.
        """
        api = apis.twitter_api()
        auth = OAuthHandler(api['consumer_key'], api['consumer_secret'])
        auth.set_access_token(api['access_token'], api['access_secret'])
        self.api = API(auth, wait_on_rate_limit=True,
                       wait_on_rate_limit_notify=True,
                       parser=parsers.JSONParser())
        self.initial = True
        self.search = True
        self.max_id = ''
        # self.since_id = ''
        self.attributes = kwargs
    def get_tweet_id(self, query, count, date):
        """Return the id of the most recent tweet matching 'query' created
        before 'date'. That id can then be used as a starting point
        (since_id / max_id) for subsequent searches. Returns False when no
        matching tweet is found.
        """
        tweet = self.get_query(query, count, until=date)
        if len(tweet['statuses']) > 0:
            return tweet['statuses'][0]['id']
        return False
    def get_query(self, query, max_tweets, **kwargs):
        """Run a single search for 'query', requesting up to 'max_tweets'
        results. Extra keyword arguments (e.g. since_id, max_id, until) are
        passed through to tweepy. Because the API object was built with a
        JSONParser, the result is a dict containing a 'statuses' list rather
        than tweepy.models.Status objects.
        """
        return self.api.search(q=query, count=max_tweets, **kwargs)
    def get_search_page(self, query, max_tweets, since_id, max_id):
        # Inspired by https://galeascience.wordpress.com/2016/03/18/collecting-twitter-data-with-python/
        searched_tweets = []
        while len(searched_tweets) < max_tweets:
            remaining_tweets = max_tweets - len(searched_tweets)
            try:
                tweets = self.get_query(query, remaining_tweets,
                                        since_id=str(since_id),
                                        max_id=str(max_id - 1))
                print('found', len(tweets['statuses']), 'tweets')
                if len(tweets['statuses']) == 0:
                    print('no tweets found')
                    self.search = False
                    self.initial = True
                    break
                searched_tweets.extend(tweets['statuses'])
                # Remember the oldest tweet id returned so the next page can
                # continue backwards in time from it.
                self.max_id = tweets['statuses'][-1]['id']
                return searched_tweets
            except TweepError as e:
                print(e)
                print('exception raised, waiting 15 minutes')
                time.sleep(15 * 60)
                break
    def get_search_tweets(self, keywords, max_tweets, days_ago):
        """Continuously search for tweets created over the last 'days_ago'
        days, one day at a time, for each phrase in 'keywords'.

        'max_tweets' is the number of tweets requested per search call (the
        API maximum is 100). Results are written to data/<query>/ in batches
        of 1000 tweets. A geocode can be used to restrict the search area,
        e.g. '39.8,-95.583068847656,2500km' covers nearly all American
        states and a large portion of Canada.
        """
        # Loop over search phrases, creating a new folder for each.
        for query in keywords:
            self.search = True
            print('Search phrase =', query)
            fx.create_folder('data/' + query)
            start = dx.get_date(0, date_object=True)
            end = dx.get_date(days_ago, date_object=True)
            date_range = dx.date_range_day(start, end, 1)
            for day in date_range:
                tweets_tosave = []
                # Reset the search flag for each day; get_search_page sets it
                # to False once a day's results are exhausted.
                self.search = True
                while self.search is True:
                    if self.initial is True:
                        start = dx.get_date(1)
                        end = day
                        self.max_id = self.get_tweet_id(query, 1, end)
                        since_id = self.get_tweet_id(query, 1, start)
                        self.initial = False
                        if since_id is False:
                            # No tweet found before the start date; continue
                            # and let the search decide when to stop.
                            pass
                    tweets = self.get_search_page(query, max_tweets, since_id, self.max_id)
                    if tweets:
                        tweets_tosave.extend(tweets)
                        if len(tweets_tosave) >= 1000:
                            fname = dx.format_date_str(
                                tweets_tosave[-1]['created_at'],
                                '%a %b %d %H:%M:%S %z %Y',
                                '%d-%m-%y_%H-%M-%S') + '_' + str(tweets_tosave[-1]['id'])
                            fx.save_json('data/' + query + '/' + fname, tweets_tosave)
                            tweets_tosave = []
                # Flush any remaining tweets for this day that did not reach
                # the 1000-tweet batch size.
                if tweets_tosave:
                    fname = dx.format_date_str(
                        tweets_tosave[-1]['created_at'],
                        '%a %b %d %H:%M:%S %z %Y',
                        '%d-%m-%y_%H-%M-%S') + '_' + str(tweets_tosave[-1]['id'])
                    fx.save_json('data/' + query + '/' + fname, tweets_tosave)
    def get_user_details(self, username):
        try:
            print('user id: ' + str(username))
            userobj = self.api.get_user(username)
            return userobj
        except TweepError as e:
            print(e)
            # 63: user suspended
            # 50: user not found
            if e.api_code == 63 or e.api_code == 50:
                return e
    def get_user_timeline(self, username):
        try:
            print('reading: ' + str(username))
            status = self.api.user_timeline(username)
            return status
        except TweepError as e:
            # Suspended (63) or missing (50) accounts are reported as True so
            # callers can skip them; any other error returns False.
            if e.api_code == 63 or e.api_code == 50:
                return True
            print('error: ' + str(e))
            return False
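

# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal example of how this module might be driven. It assumes that
# apis.twitter_api() returns valid credentials and that the file_utils and
# date_utils helpers imported above are available; the keyword list and the
# numeric arguments below are placeholders, not values taken from this
# project.
if __name__ == '__main__':
    manager = TwitterManager()

    # Collect up to 100 tweets per request for each phrase, going back 7 days.
    manager.get_search_tweets(keywords=['python', 'tweepy'],
                              max_tweets=100,
                              days_ago=7)

    # Look up a single account and fetch its recent timeline.
    user = manager.get_user_details('twitter')
    timeline = manager.get_user_timeline('twitter')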