-
Notifications
You must be signed in to change notification settings - Fork 2
/
requestsdemo.py
69 lines (55 loc) · 2.47 KB
/
requestsdemo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import requests
import time
from BeautifulSoup import BeautifulSoup
class BSCrawler(object):
    """Synchronous crawler for the EuroPython 2015 speaker pages.

    The crawler downloads the speakers index, follows every speaker link
    found there and prints one dictionary per speaker.  All requests go
    through a single ``requests.Session`` and are performed one after the
    other (each request blocks until it completes or fails).
    """

    # Root of the site; relative links found in the pages are appended to it.
    base_url = 'https://ep2015.europython.eu'
    # How many times a failed request is retried before giving up.
    max_retries = 5
    # Seconds to wait between two attempts of the same request.
    retry_delay = 5
    # Seconds after which a request is aborted instead of hanging forever.
    request_timeout = 30

    def __init__(self):
        """Create the HTTP session shared by every request of this crawler."""
        self.session = requests.Session()

    def start_crawl(self):
        """Fetch the speakers index and print the details of every speaker.

        Nothing is printed when the index cannot be downloaded.  List items
        without a link are skipped.
        """
        # Speakers url is https://ep2015.europython.eu/en/speakers/
        response = self.make_request(self.base_url + '/en/speakers/')
        if not response:
            return

        # Let's parse the received html
        soup = BeautifulSoup(response.content)
        # All the links are in a div with class=cms
        conference_container = soup.find('div', attrs={'class': 'cms'})
        if conference_container is None:
            # Layout differs from what we expect: report it instead of
            # crashing with an AttributeError on None.
            print("Could not find the speakers list. Giving up")
            return

        # Inside that div, the speaker links are inside li tags
        for speaker_link in conference_container.findAll('li'):
            anchor = speaker_link.a
            if anchor is None or not anchor.get('href'):
                # An <li> without a usable link cannot lead to a speaker.
                continue
            print(self.get_speaker(anchor['href']))

    def make_request(self, url, retried=0):
        """GET ``url`` through the session, retrying on network errors.

        :param url: absolute URL to download.
        :param retried: number of retries already spent for this URL; the
            request is retried until this reaches ``max_retries``.
        :returns: the response returned by ``session.get``, or ``None``
            once the retries are exhausted.
        """
        while True:
            try:
                # Doing synchronous request, blocking every time we do a new
                # request.  The timeout keeps a stalled server from blocking
                # the crawl indefinitely.
                return self.session.get(url, timeout=self.request_timeout)
            except (requests.exceptions.HTTPError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout) as e:
                if retried >= self.max_retries:
                    print("Got %s. Giving up" % (e,))
                    return None
                retried += 1
                print("Got %s. Waiting %s seconds and retrying for the %s time..."
                      % (e, self.retry_delay, retried))
                time.sleep(self.retry_delay)

    def get_speaker(self, url):
        """Get speaker details.

        :param url: site-relative URL of the speaker page (it is appended
            to ``base_url``).
        :returns: ``None`` when the page cannot be downloaded, otherwise a
            dict with the keys ``name``, ``avatar``, ``url`` and ``talks``
            (a list of ``{'name': ..., 'url': ...}`` dicts) plus one entry
            per ``dt``/``dd`` pair found in the ``dl-horizontal`` lists.
            ``name`` and ``avatar`` are ``None`` when the page does not
            contain them.
        """
        response = self.make_request(self.base_url + url)
        if not response:
            return None

        soup = BeautifulSoup(response.content)
        item = {'name': None, 'avatar': None, 'url': url, 'talks': []}

        name_section = soup.find('section', attrs={'class': 'profile-name'})
        if name_section is not None and name_section.h1 is not None:
            item['name'] = name_section.h1.text

        avatar = soup.find('img', attrs={'class': 'avatar'})
        if avatar is not None and avatar.get('src'):
            item['avatar'] = self.base_url + avatar['src']

        # Collect every talk link of the talks box.  Iterating the first
        # <li> tag directly would only walk the children of that single
        # item, missing any further talk.
        talks_box = soup.find('div', attrs={'class': 'speaker-talks well'})
        if talks_box is not None:
            for talk in talks_box.findAll('a'):
                href = talk.get('href')
                if href:
                    item['talks'].append(
                        {'name': talk.text, 'url': self.base_url + href})

        # Extra profile fields are stored under the text of their <dt>;
        # a <dt> whose text equals one of the keys above overwrites it.
        for dl in soup.findAll('dl', attrs={'class': 'dl-horizontal'}):
            for dt in dl.findChildren('dt'):
                dd = dt.findNext('dd')
                if dd is not None:
                    item[dt.text] = dd.text
        return item
if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script, so that
    # importing the module does not trigger network requests.
    BSCrawler().start_crawl()