Script_scrape_reddit.py
"""Python script to scrape comments from a subreddit community.
Earl K. Brown, ekbrown byu edu
Note: You'll need to pip install 'praw' and 'pandas' before running this script.
"""
import praw
import pandas as pd
import datetime as dt
# SUPPLY YOUR APP'S INFO
reddit = praw.Reddit(client_id='HERE',      # 14-character code
                     client_secret='HERE',  # 27-character code
                     user_agent='HERE',     # your app's name
                     username='HERE',       # your reddit username
                     password='HERE')       # your reddit password
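# Note: the client ID and secret come from a "script"-type app registered under
# your Reddit account's app preferences (https://www.reddit.com/prefs/apps);
# the user_agent is simply a short string identifying your app to Reddit.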
# SPECIFY THE SUBREDDIT COMMUNITY YOU WANT TO SCRAPE
subreddit_to_search = 'linguistics'
# SPECIFY THE MAXIMUM NUMBER OF TOP SUBMISSIONS TO RETRIEVE
max_comments = 500
# SPECIFY THE PATH TO THE CSV FILE TO SAVE THE DATA TO
outfile = '/Users/ekb5/Downloads/reddit_comments.csv'
### SAVE THIS SCRIPT AND RUN IT FROM THE COMMAND LINE ###
subreddit = reddit.subreddit(subreddit_to_search)
# .top() returns a lazy listing of the subreddit's top submissions (up to the given limit)
top_subs = subreddit.top(limit=max_comments)

# Collect the metadata of each submission into a dictionary of lists
topics_dict = {"title": [],
               "score": [],
               "id": [],
               "url": [],
               "comms_num": [],
               "created": [],
               "body": []}

for i in top_subs:
    topics_dict["title"].append(i.title)
    topics_dict["score"].append(i.score)
    topics_dict["id"].append(i.id)
    topics_dict["url"].append(i.url)
    topics_dict["comms_num"].append(i.num_comments)
    topics_dict["created"].append(i.created)
    topics_dict["body"].append(i.selftext)

topics_data = pd.DataFrame(topics_dict)

# Convert the Unix epoch in the "created" column into a human-readable timestamp
def get_date(created):
    return dt.datetime.fromtimestamp(created)

_timestamp = topics_data["created"].apply(get_date)
topics_data = topics_data.assign(timestamp=_timestamp)

topics_data.to_csv(outfile, index=False)
print(f"All done!\nYou should now have a file named {outfile} ready to be imported into spreadsheet software.")