databt.py
'''
This program pulls comments from Reddit for further analysis using natural
language processing.
Author: Sombiri Enwemeka
'''
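# Example invocation (these are the argparse defaults defined in main()):
#   python databt.py --subreddit the_donald -n 10 -o ../commentData.json -k ../apiKeys.json
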
import sys
import os
import praw
import json
import argparse
import time
from datetime import timedelta
from unicodedata import normalize
from operator import itemgetter
from google.cloud.gapic.language.v1beta2 import enums
from google.cloud.gapic.language.v1beta2 import language_service_client
from google.cloud.proto.language.v1beta2 import language_service_pb2
from google.gax import errors
sys.path.append(os.path.join(os.getcwd(), '..'))


def norm(text):
'''
Cleans up encoding for input text
Input: Text to normalize
Return: Normalized text
'''
if text is None:
return None
return normalize('NFKD', text).encode('ascii', 'ignore')
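# Example (Python 2 semantics): norm(u'caf\xe9') returns the byte string
# 'cafe' -- NFKD decomposes the accented character, then the ASCII encode
# with 'ignore' drops the combining mark.

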
def test_redditor(submission):
    '''
    Safely get the submission author's name if it exists.
    Input: PRAW submission object
    Return: Normalized submission author name, or None if unavailable
    '''
if submission is None:
return None
if submission.author is None:
return None
return norm(submission.author.name)


def get_entity_sentiment(text):
    '''
    Detects entities in the provided text and the sentiment expressed
    about each of them.
    Input: Body of text to analyze
    Return: List of entities sorted by salience (relevance to the text body)
    '''
language_client = language_service_client.LanguageServiceClient()
document = language_service_pb2.Document()
document.content = text.encode('utf-8')
document.type = enums.Document.Type.PLAIN_TEXT
    # Narrow Python builds (sys.maxunicode == 65535) index text by UTF-16
    # code unit, so request UTF-16 offsets there; otherwise use UTF-32
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16
result = language_client.analyze_entity_sentiment(document, encoding)
e_list = []
for entity in result.entities:
        # Keep only entities with a non-negligible sentiment magnitude
        if entity.sentiment.magnitude > 0.001:
e_list.append({
"type": entity.type,
"name": entity.name,
"salience": entity.salience,
"sent_score": entity.sentiment.score,
"sent_mag": entity.sentiment.magnitude})
return sorted(e_list, key=itemgetter('salience'), reverse=True)
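# Illustrative shape of the returned list (values are hypothetical):
#   [{'type': 1, 'name': 'reddit', 'salience': 0.42,
#     'sent_score': -0.3, 'sent_mag': 0.6}, ...]

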
def get_reddit_comments(args):
    '''
    Uses PRAW to access Reddit and download comments and associated data.
    Checks whether the comment file already exists and contains the
    requested number of submissions, to avoid unnecessary queries.
    Input: Argparse args this program was called with
    Return: Comment data dictionary
    Note: This function writes the downloaded comments to a JSON file
    '''
    # Check whether to update comment data
    subm_data = []
    num_entries = 0
    if os.path.isfile(args.o):
        with open(args.o, "r") as f:
            subm_data = json.loads(f.read())
        # The last two entries are the validation footer appended below:
        # the total entry count and the requested submission limit
        if len(subm_data) >= 2 \
                and subm_data[-1] == args.n \
                and isinstance(subm_data[-2], int):
            return subm_data
api_keys = None
try:
with open(args.k, 'r') as fp:
api_keys = json.load(fp)
    except IOError as oops:
        print "Error loading Reddit authentication\r\n%s" % oops
        sys.exit(1)
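    # Expected layout of the keys file, inferred from the indices used
    # below (this structure is an assumption, not a documented format):
    #   {"reddit": ["<client_id>", "<client_secret>", "<user_agent>"]}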
# Get all comment text from top n subreddit posts
reddit = praw.Reddit(
client_id=api_keys['reddit'][0],
client_secret=api_keys['reddit'][1],
user_agent=api_keys['reddit'][2])
subreddit = reddit.subreddit(args.subreddit)
# Put the reddit data into a dictionary
    for submission in subreddit.top(time_filter='all', limit=args.n):
        # Remove "MoreComments" placeholders so comments.list() yields
        # only fully loaded comments
        submission.comments.replace_more(limit=0)
sub_comments = []
num_entries += 1
aggregate = ''
for comment in submission.comments.list():
c_entry = {
"num_reports": comment.num_reports,
"score": int(comment.score),
"body": norm(comment.body),
"ups": int(comment.ups),
"downs": int(comment.downs),
"depth": int(comment.depth)
}
aggregate += c_entry['body'] + '\r\n'
sub_comments.append(c_entry)
num_entries += 1
subm_data.append({
"title": norm(submission.title),
"score": int(submission.score),
"upvote_ratio": submission.upvote_ratio,
"url": submission.url,
"author": test_redditor(submission),
"num_comments": int(submission.num_comments),
"selftext": norm(submission.selftext),
"comments": sub_comments,
"aggregate": norm(submission.title)
+ '\r\n' + norm(submission.selftext)
+ '\r\n' + aggregate
})
    # Append the validation footer used by the cache check above, then
    # write the comment data to file
    subm_data.append(num_entries)
    subm_data.append(args.n)
with open(args.o, "w") as f:
f.write(json.dumps(subm_data))
return subm_data
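

# The JSON written by get_reddit_comments is a list of submission
# dictionaries followed by a two-entry footer (total entry count,
# requested submission limit) that its cache check relies on.

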
def main():
'''
Main script execution
'''
parser = argparse.ArgumentParser(description='Analyze reddit comments')
parser.add_argument(
'--subreddit',
default='the_donald',
help='Subreddit to examine')
    parser.add_argument(
        '-n',
        type=int,
        default=10,
        help='Number of posts to examine')
parser.add_argument(
'-o',
default='../commentData.json',
help='Output file name')
parser.add_argument(
'-k',
default='../apiKeys.json',
help='Path to API keys file')
args = parser.parse_args()
    # Point the Google client library at the service account credentials
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../redditNLP.json'
start_time = time.time()
subm_data = get_reddit_comments(args)
# Add natural language processing results
nlp_calls = 0
outages = 0
for subm in subm_data:
while True:
try:
                # Skip the validation footer entries appended by
                # get_reddit_comments
                if isinstance(subm, int):
                    break
                x = time.time()
                subm['ents_self'] = get_entity_sentiment(subm['selftext'])
                nlp_calls += 1
                print "Called NLP Service. %s total calls. " \
                    "Call took %0.3fs." % (nlp_calls, time.time()-x)
                x = time.time()
                subm['ents_agg'] = get_entity_sentiment(subm['aggregate'])
                nlp_calls += 1
                print "Called NLP Service. %s total calls. " \
                    "Call took %0.3fs." % (nlp_calls, time.time()-x)
            except errors.RetryError:
                outages += 1
                # Roughly compensate for successful calls that will be
                # re-counted when this submission is retried
                nlp_calls -= 1
print "The network is acting up again, let's give it a moment"
print "%s outages so far." % outages
time.sleep(30)
continue
break
# Write complete dataset to file
with open(args.o, "w") as f:
f.write(json.dumps(subm_data))
# Output some run information
print "Made %s NLP calls " % nlp_calls
print "Runtime: %s" % str(timedelta(seconds=time.time()-start_time))
print "There were %s network outages during this run" % outages
del os.environ['GOOGLE_APPLICATION_CREDENTIALS']
exit()


if __name__ == "__main__":
    main()