-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapy_urlglob.py
86 lines (75 loc) · 2.96 KB
/
scrapy_urlglob.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""Scrapy spider middleware to expand start urls using curl-like url globbing"""
import re
class ExpandStartUrlsMiddleware(object):
    """Spider middleware that expands globbed start URLs into concrete requests.

    Every start request whose URL contains curl-style glob patterns
    (``{a,b}``, ``[1-10]``, ``[a-z]``) is replaced by one request per
    expanded URL.  The middleware is on by default and can be turned off
    with the ``URLGLOB_ENABLED`` setting.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``URLGLOB_ENABLED`` setting."""
        return cls(crawler.settings.getbool('URLGLOB_ENABLED', True))

    def __init__(self, enabled=True):
        """Store whether URL expansion is active.

        :param enabled: when false, start requests pass through untouched.
        """
        self.enabled = enabled

    def process_start_requests(self, start_requests, spider):
        """Yield the start requests, expanding globbed URLs when enabled.

        :param start_requests: iterable of request objects exposing ``url``
            and ``replace``.
        :param spider: the spider the requests belong to (unused here).
        :returns: generator of requests; one per expanded URL when enabled,
            otherwise the original requests unchanged.
        """
        for request in start_requests:
            if not self.enabled:
                yield request
                continue
            for url in expand_url(request.url):
                # The URL must be passed by keyword: replace() already
                # supplies the current ``url`` as a keyword argument, so a
                # positional URL would collide with it in the constructor.
                yield request.replace(url=url)
# Glob patterns, tried in this order of precedence on every expansion step.
_BRACE_GLOB = re.compile(r'{([^}]+)}')
_NUMERIC_GLOB = re.compile(r'\[(\d+)-(\d+)(?::(\d+))?\]')
_ALPHA_GLOB = re.compile(r'\[([A-Za-z]+)-([A-Za-z]+)(?::(\d+))?\]')


def _splice(match, replacement):
    """Return the searched string with the matched span swapped for replacement."""
    return match.string[:match.start()] + replacement + match.string[match.end():]


def expand_url(url):
    """Expand url using curl-like url globbing.

    See https://curl.haxx.se/docs/manpage.html#URL for expansion rules.

    Supported patterns:

    * ``{a,b,c}`` -- one URL per comma-separated alternative;
    * ``[N-M]`` / ``[N-M:S]`` -- inclusive numeric range with optional step;
      results are zero-padded to the width of ``N`` as written;
    * ``[x-y]`` / ``[x-y:S]`` -- inclusive single-letter range with optional
      step, spanning at most 26 characters.

    Brace patterns are expanded first, then numeric ranges, then alpha
    ranges; each produced URL is expanded again until no pattern is left.
    A url without any pattern is yielded unchanged.

    :param url: the url, possibly containing glob patterns.
    :returns: generator of fully expanded urls.
    :raises ValueError: (on iteration) for a reversed range, a zero step,
        multi-letter alpha endpoints, or an alpha range wider than 26.

    >>> for url in expand_url('https://example.com/{foo,bar}?page=[1-3]'):
    ...     print(url)
    https://example.com/foo?page=1
    https://example.com/foo?page=2
    https://example.com/foo?page=3
    https://example.com/bar?page=1
    https://example.com/bar?page=2
    https://example.com/bar?page=3
    >>> for url in expand_url('https://example.com?page=[01-10:3]'):
    ...     print(url)
    https://example.com?page=01
    https://example.com?page=04
    https://example.com?page=07
    https://example.com?page=10
    >>> for url in expand_url('https://[a-z:10].example.com'):
    ...     print(url)
    https://a.example.com
    https://k.example.com
    https://u.example.com
    """
    m = _BRACE_GLOB.search(url)
    if m:
        # expand {braces}
        for alternative in m.group(1).split(','):
            for x in expand_url(_splice(m, alternative)):
                yield x
        return

    m = _NUMERIC_GLOB.search(url)
    if m:
        # expand [brackets] numeric range
        first = int(m.group(1))
        last = int(m.group(2))
        step = int(m.group(3)) if m.group(3) else 1
        # Validate against the inclusive bounds themselves; comparing with
        # last + 1 would let a reversed range such as [5-4] slip through
        # and silently produce no urls.
        if first > last or step < 1:
            raise ValueError('Bad numeric range sequence in "%s"' % m.string)
        width = len(m.group(1))  # preserves leading zeros, e.g. [01-10]
        for n in range(first, last + 1, step):
            for x in expand_url(_splice(m, str(n).zfill(width))):
                yield x
        return

    m = _ALPHA_GLOB.search(url)
    if m:
        # expand [brackets] alpha range
        low, high = m.group(1), m.group(2)
        step = int(m.group(3)) if m.group(3) else 1
        # Endpoints must be single letters (ord() rejects longer strings),
        # in ascending order, and no further apart than 'A'..'Z'.
        if (len(low) != 1 or len(high) != 1 or step < 1
                or ord(low) > ord(high)
                or ord(high) - ord(low) > ord('Z') - ord('A')):
            raise ValueError('Bad alpha range sequence in "%s"' % m.string)
        for code in range(ord(low), ord(high) + 1, step):
            for x in expand_url(_splice(m, chr(code))):
                yield x
        return

    # Nothing left to expand.
    yield url