# fetcher.py (forked from petterw/crawler)
from celery.decorators import task
from celery.task.sets import subtask
from BeautifulSoup import BeautifulSoup
import urlparse
import urllib2


@task
def fetch_document(url, useragent, return_html=False):
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', useragent)]
        response = opener.open(url)
        html = response.read()
        # hand link extraction off to a separate task
        links = subtask(extract_urls).apply_async([(url, html)])
        # avoid filling memory with useless html if we don't want it
        if return_html:
            return (url, html, links, len(html))
        return (url, "", links, len(html))
    except Exception:
        # fetch failed; hand back an empty placeholder result
        return (url, "", FakeAsyncResult(result=set()), 0)
@task
def fetch_robots_txt(url, useragent):
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', useragent)]
        response = opener.open(url)
        robots_txt = response.read()
        return (url, robots_txt)
    except Exception:
        return (url, "")
@task
def extract_urls(doc_tuple):
    urls = []
    try:
        soup = BeautifulSoup(doc_tuple[1])
        for tag in soup.findAll('a', href=True):
            urls.append(tag['href'])
        # return unique (absolute url, referrer) pairs found on the page
        return set([(cleanup_url(urlparse.urljoin(doc_tuple[0], url)), doc_tuple[0])
                    for url in urls])
    except Exception:
        # html too broken to parse
        return set()
def cleanup_url(url):
    # strip the fragment; quick fix for a problem urlparse has with utf-8,
    # e.g. "http://example.com/page#frag" -> "http://example.com/page"
    return url.split("#".encode("utf-8"))[0].encode("utf-8")
class FakeAsyncResult:
    """Limited placeholder for AsyncResult from celery, for when we don't
    want to do anything but the consumer of the result still expects
    an AsyncResult with its usual methods.
    """
    def __init__(self, result=None, ready=True):
        # avoid a shared mutable default; fall back to an empty list
        self.result = [] if result is None else result
        self.isready = ready

    def wait(self):
        # mirror AsyncResult.wait() by handing back the stored result
        return self.result

    def ready(self):
        return self.isready
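
# --- Usage sketch ---
# A minimal illustration of how a consumer might drive these tasks,
# assuming a running celery worker and a configured broker; the start
# url and user agent below are hypothetical.
if __name__ == "__main__":
    result = fetch_document.delay("http://example.com/", "crawler-test/0.1")
    url, html, links_async, size = result.wait()
    # links_async is an AsyncResult for the extract_urls subtask
    # (or a FakeAsyncResult if the fetch failed)
    for link, referrer in links_async.wait():
        print link, "found on", referrer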