-
Notifications
You must be signed in to change notification settings - Fork 1
/
xkcd_archive.py
112 lines (90 loc) · 3.3 KB
/
xkcd_archive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#! python3
# @author Stephane Duguay // binarez
# Derived from downloadXkcd.py - Downloads every single XKCD comic.
# https://codereview.stackexchange.com/questions/178239/archives-xkcd-comics
"""
Webscraper that downloads xkcd comics.
Checks whether a comic has already been downloaded, for increased efficiency on rerun.
Two run modes: Full and Update
Full mode goes through every comic.
Update mode quits when it reaches the first comic that is already downloaded.
Derived from original project: https://automatetheboringstuff.com/chapter11/
@author: david.antonini // toonarmycaptain
"""
import os
import threading
import time
from urllib.parse import urljoin

import bs4
import requests
# Introduce the script and explain the two traversal modes on offer.
print('This script searches xkcd.com and downloads each comic.')
print('There are two mode options:\n'
      'Update mode: Or "refresh mode", checks until it finds '
      'a previously downloaded comic.\n'
      'Full mode: Checks for every comic, downloads undownloaded comics.\n'
      )

# Re-prompt until the answer parses as the integer 0 (Update) or 1 (Full).
# Non-numeric input raises ValueError and any other number falls through;
# both cases simply show the prompt again.
while True:
    try:
        print('Please select mode:\n'
              'Enter 0 for Update mode, or 1 for Full mode')
        run_mode_selection = input('Mode: ')
        _mode_number = int(run_mode_selection)
    except ValueError:
        continue
    if _mode_number in (0, 1):
        # True selects Full mode, False selects Update mode.
        full_mode = _mode_number == 1
        break

start = time.time()  # timestamp taken once a mode has been chosen
os.makedirs('xkcd', exist_ok=True)  # store comics in ./xkcd
def download_image(session, url, filename):
    """Download the image at *url* into ``xkcd/<filename>``.

    The destination is created in exclusive mode before any request is
    made, so an image that is already on disk is detected without network
    traffic.  If the download then fails or is interrupted, the partially
    written file is deleted so that a later run does not mistake it for a
    completed download.

    Args:
        session: Object exposing a ``requests.Session``-style ``get(url)``.
        url: Address of the image to fetch.
        filename: Name of the file to create inside the ``xkcd`` directory.

    Raises:
        FileExistsError: If ``xkcd/<filename>`` already exists; the
            existing file is left untouched.
        Exception: Anything raised while fetching or writing is re-raised
            unchanged after the partial file has been removed.
    """
    path = os.path.join('xkcd', filename)
    # Opened outside the try block on purpose: a FileExistsError here must
    # never trigger the clean-up below, which would delete a good download.
    image_file = open(path, 'xb')
    try:
        with image_file:
            print('Downloading image ' + filename)
            res = session.get(url)
            res.raise_for_status()
            for chunk in res.iter_content(100000):
                image_file.write(chunk)
    except BaseException:
        # The file is already closed here (the with-block has exited), so
        # it can be removed on every platform before re-raising.
        os.remove(path)
        raise
def write_alttext(alt_text, alt_text_filename):
    """Store a comic's alt-text as ``xkcd/<alt_text_filename>``.

    The text is encoded with ``str.encode()`` defaults and written in
    binary mode.  The file is created exclusively, so an existing file is
    never overwritten.

    Args:
        alt_text: The text to save.
        alt_text_filename: Name of the file to create inside ``xkcd``.

    Raises:
        FileExistsError: If the target file already exists.
    """
    destination = os.path.join('xkcd', alt_text_filename)
    with open(destination, mode='xb') as out:
        # Announce only after the exclusive create has succeeded.
        print('Writing alt-text ' + alt_text_filename)
        payload = alt_text.encode()
        out.write(payload)
# Fetch the archive page and collect its comic links (anchors that carry a
# title attribute inside the middleContainer div).
url = 'https://xkcd.com/archive/'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'lxml')
all_comics = soup.select("div[id=middleContainer] > a[title]")

with requests.Session() as session:
    for comic in all_comics:
        # The href is expected to look like "/1234/"; drop the first and
        # last characters and parse what is left as the comic number.
        comic_num = int(comic.get('href')[1:-1])

        # Use https, matching the archive request above.
        try:
            page = session.get('https://xkcd.com/' + str(comic_num))
            page.raise_for_status()
        except requests.exceptions.HTTPError as err:
            # Report the failure instead of skipping the comic silently.
            print('--- Could not fetch comic page ' + str(comic_num)
                  + ': ' + str(err))
            continue
        page_soup = bs4.BeautifulSoup(page.text, 'lxml')

        comic_image = page_soup.select_one('#comic img[src]')
        if comic_image is None:
            print('Could not find comic image #' + str(comic_num))
            continue

        # Resolve the src against the page URL so protocol-relative
        # ("//host/x.png"), host-relative ("/x.png") and absolute values
        # all become valid URLs.  Blindly prefixing "https:" produced
        # malformed URLs for anything but the protocol-relative form.
        comic_url = urljoin(page.url, comic_image['src'])
        img_filename = '.'.join([
            'xkcd',
            str(comic_num).zfill(4),
            comic.get('title'),
            os.path.basename(comic_url),
        ])

        alt_text = comic_image.get('title')
        if alt_text is None:
            print('--- Missing alt-text ' + str(comic_num))
        else:
            try:
                write_alttext(
                    alt_text,
                    os.path.splitext(img_filename)[0] + '.alt-text.txt',
                )
            except FileExistsError:
                # Alt-text file already exists; nothing to do.
                pass

        try:
            download_image(session, comic_url, img_filename)
        except requests.exceptions.MissingSchema:
            print('--- Missing comic ' + str(comic_num))
        except FileExistsError:
            # Update mode stops at the first comic that is already on
            # disk; Full mode just moves on to the next one.
            if not full_mode:
                break

print('Done.')