-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathcrawl_moviepostersdb.py
38 lines (30 loc) · 1.09 KB
/
crawl_moviepostersdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from harvestman.apps.spider import HarvestMan
from harvestman.lib.common.macros import *
import re
class MyCustomCrawler(HarvestMan):
    """A custom crawler that filters which URLs get saved to disk.

    The filtering is done by :meth:`save_this_url`, which is meant to be
    registered as a callback for the 'save_url_data' event.
    """

    # Matches image paths of the form ``/t_<part>_<part>.jpg``.  The dot
    # before the extension is escaped so that only a literal '.' matches
    # (an unescaped '.' would accept any character, e.g. ``..._abcXjpg``).
    _POSTER_IMAGE_RE = re.compile(r'/t_[^_]+_[^_.]*\.jpg')

    def save_this_url(self, event, *args, **kwargs):
        """Decide whether the URL carried by ``event`` should be saved.

        Parameters:
            event: event object; only its ``url`` attribute is read.  The
                URL object must provide ``is_document()``, ``is_cgi()``
                and ``is_image()``, and ``str(url)`` must give the URL
                text.
            *args, **kwargs: accepted for callback compatibility, unused.

        Returns:
            True when the URL contains '/poster' and is either a document,
            a CGI URL, or an image whose path matches
            ``_POSTER_IMAGE_RE``; False otherwise.
            NOTE(review): presumably HarvestMan skips saving when this
            callback returns False -- confirm against the HarvestMan
            event documentation.
        """
        # Get the url object and its textual form
        url = event.url
        ustr = str(url)

        # URLs outside the '/poster' area are never accepted
        if '/poster' not in ustr:
            return False

        # Documents and CGI URLs are accepted as-is
        if url.is_document() or url.is_cgi():
            return True

        # Images are accepted only if their path matches the pattern
        if url.is_image() and self._POSTER_IMAGE_RE.search(ustr):
            return True

        # Single-argument form prints the same text under Python 2 and 3
        # as the original ``print 'rejecting ', ustr`` statement.
        print('rejecting  ' + ustr)
        return False
# Script entry point: build, configure and launch the custom crawler
if __name__ == "__main__":
    spider = MyCustomCrawler()
    spider.initialize()

    # Override the user agent on the crawler's configuration object
    spider.get_config().USER_AGENT = 'Firefox/v3.5'

    # Hook our filter into the 'save_url_data' event, which is fired
    # just before a URL is saved to disk
    spider.register('save_url_data', spider.save_this_url)

    # Start crawling
    spider.main()