-
Notifications
You must be signed in to change notification settings - Fork 60
/
get_urls.py
34 lines (26 loc) · 932 Bytes
/
get_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import praw
import psaw
import tqdm
import datetime
# Pushshift client used to search Reddit submissions.
api = psaw.PushshiftAPI()

# All posts until the end of 2017.
# The cutoff is built as a timezone-aware UTC datetime: a naive datetime is
# interpreted in the machine's local time zone by .timestamp(), which would
# shift the cutoff by the local UTC offset depending on where this runs.
end_time = int(
    datetime.datetime(2018, 1, 1, tzinfo=datetime.timezone.utc).timestamp()
)

# Only the 'url' and 'score' fields are requested, since those are the only
# attributes the download loop reads. Self posts and over-18 posts are
# excluded, and a minimum score is requested server-side.
query = api.search_submissions(
    before=end_time,
    filter=['url', 'score'],
    sort='desc',
    score='>2',
    is_self=False,
    over_18=False,
)
# Download links from submissions: write one URL per line to urls.txt while
# a progress bar counts the URLs written.
# The file is opened explicitly as UTF-8 so that URLs containing non-ASCII
# characters do not depend on the platform's default text encoding.
with tqdm.tqdm() as pbar, open('urls.txt', 'w', encoding='utf-8') as fh:
    for subm in query:
        # weird issue with psaw/pushshift that breaks score=">2",
        # so the score threshold is re-checked here on the client side
        if subm.score < 3:
            continue
        pbar.update(1)
        fh.write(subm.url + '\n')
        # Flush after every URL; presumably so an interrupted run keeps
        # everything fetched so far -- confirm before batching writes.
        fh.flush()