scrape_comments.py
import pandas as pd
import requests
import json
import time
import datetime


def api_call(url, pause=1.0, max_tries=3):
    """
    Make an API request to 'url' and unpack the response. In case of a bad
    response, make several additional attempts, waiting 'pause' seconds between
    requests in order to avoid API rate limits. Gives up after 'max_tries'
    failed attempts and returns an empty list. If successful, returns a list of
    dictionaries representing submissions/comments.
    """
    tries = 0
    while True:
        try:
            r = requests.get(url)
            # A failed request, malformed JSON, or a response missing the
            # "data" field each count as one failed attempt.
            return json.loads(r.text)["data"]
        except (requests.RequestException, ValueError, KeyError):
            tries += 1
            if tries == max_tries:
                return []
            time.sleep(pause)
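

# A minimal sketch of calling api_call on its own, assuming the Pushshift
# search endpoint below is still reachable; the subreddit and limit are
# illustrative:
#
#     posts = api_call(
#         "https://api.pushshift.io/reddit/search/submission/"
#         "?subreddit=wallstreetbets&limit=5"
#     )
#     titles = [p["title"] for p in posts]  # each element is a dict of fields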


def scrape_comments(
    start_date, end_date, subreddit, title, min_comments=100, pause=1.0
):
    """
    Scrape Reddit comments from threads whose title matches 'title' in
    'subreddit', made on weekdays (M-F) between 'start_date' and 'end_date',
    using the Pushshift Reddit API. Matching threads are ignored if their
    number of comments is smaller than 'min_comments'. The maximum allowed
    number of comments (1000) is retrieved per API call, and the calls are made
    'pause' seconds apart to avoid API rate limits. Returns a pandas dataframe
    containing the date, time (Unix timestamp), and body of the retrieved
    comments.
    """
    dates = pd.date_range(start=start_date, end=end_date, freq="B")
    rows = []
    for date in dates:
        # Initialize 'start' and 'end' timestamps to midnight ET on 'date' and
        # midnight ET on the next day. These specify the timeframe in which to
        # search for matching submissions/comments. Localizing each midnight
        # separately keeps the window correct across DST changes.
        start = int(date.tz_localize("America/New_York").timestamp())
        end = int(
            (date + pd.Timedelta(days=1)).tz_localize("America/New_York").timestamp()
        )
        # Find matching submissions
        url = (
            "https://api.pushshift.io/reddit/search/submission/"
            "?subreddit=%s&title=%s&after=%s&before=%s&limit=1000"
            % (subreddit, title, str(start), str(end))
        )
        submissions = api_call(url)
        time.sleep(pause)
        for submission in submissions:
            if submission["num_comments"] < min_comments:
                continue
            print("Title:", submission["title"])
            print("Total Comments:", submission["num_comments"])
            print("Downloading Comments...")
            comments = []
            # Retrieve comments from the submission, 1000 at a time. Comments
            # are returned chronologically by the API. After each batch, move
            # the cursor 'after' to the timestamp of the last retrieved
            # comment, so the next request only returns comments made after the
            # ones we already have. Using a per-submission cursor (rather than
            # advancing 'start' itself) lets a second matching submission on
            # the same day start over from midnight. Continue until we have all
            # comments made on 'date' (in ET).
            after = start
            while after < end:
                url = (
                    "https://api.pushshift.io/reddit/search/comment/"
                    "?link_id=%s&after=%s&limit=1000" % (submission["id"], after)
                )
                new_comments = api_call(url)
                if len(new_comments) == 0:
                    break
                for comment in new_comments:
                    if comment["created_utc"] < end:
                        comments.append(comment)
                after = new_comments[-1]["created_utc"]
                print(len(comments), end="...", flush=True)
                time.sleep(pause)
            # Append all of the comments retrieved from the current submission
            # to the main list of rows.
            for comment in comments:
                rows.append([date, comment["created_utc"], comment["body"]])
            print("\n")
    # Convert the list of rows into a dataframe; passing the column names here
    # also handles the case where no comments were retrieved.
    df = pd.DataFrame(rows, columns=["Date", "Time", "Body"])
    return df
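

# A quick smoke test of scrape_comments over a single business day, assuming
# the Pushshift API is available; the date and query below are illustrative:
#
#     sample = scrape_comments(
#         datetime.date(2020, 9, 28),
#         datetime.date(2020, 9, 28),
#         "wallstreetbets",
#         "daily+discussion+thread",
#     )
#     print(sample.head())  # columns: Date, Time (Unix timestamp), Body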


if __name__ == "__main__":
    # Scrape comments from the r/wallstreetbets Daily Discussion threads made
    # between September 30, 2019 and September 30, 2020
    df = scrape_comments(
        datetime.date(2019, 9, 30),
        datetime.date(2020, 9, 30),
        "wallstreetbets",
        "daily+discussion+thread",
    )
    df.to_csv("comments_2019-09-30_2020-09-30_new.csv")
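    # If tz-aware datetimes are more convenient than Unix timestamps for later
    # analysis, the "Time" column can be converted with pandas (a sketch; the
    # column names match the ones assigned in scrape_comments):
    #
    #     df["Time"] = pd.to_datetime(df["Time"], unit="s", utc=True)
    #     df["Time"] = df["Time"].dt.tz_convert("America/New_York")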