scraper.py
###############################################################################
#
# Scrape the event sheffield site so we can reuse the data in other apps.
#
###############################################################################
import lxml.html
import urllib2
import urlparse
import datetime


def get_coords(root):
    # Find the Google Maps link on the details page; the element following
    # the link's parent holds a text blob, and the coordinates sit at fixed
    # word positions within it.
    try:
        goog = root.cssselect('a[target="_googlemapNgr"]')[0]
        location = goog.getparent().getnext().text_content().replace(u'\xa0', ' ').split()
        coords = location[5], location[7]
        return coords
    except Exception:
        return None


def scrape_table(root):
    # For each row in the results table
    for tr in root.cssselect("table[id='thedmsBrowseEvents'] tr"):
        # Grab the data elements
        tds = tr.cssselect("td")
        # The first row in the results table only contains headings, so skip it
        if len(tds) == 0:
            continue
        # A real data row
        details_url = urlparse.urljoin(base_url, tds[1].getchildren()[0].attrib['href'])
        details_title = ""
        details_text = ""
        venue_id = ""
        coords = None
        try:
            # Extract the venue id from the details URL's query string: it
            # looks like a really useful identifier for co-referencing once
            # we have other data sets.
            parsed_details_url = urlparse.urlparse(details_url)
            details_url_qs = parsed_details_url.query
            parsed_qs = urlparse.parse_qs(details_url_qs)
            venue_id = parsed_qs['venue'][0]
            # Fetch the details page and extract the coordinates, title and
            # full description.
            details_html = urllib2.urlopen(details_url).read()
            details_root = lxml.html.fromstring(details_html)
            coords = get_coords(details_root)
            details_top_panel = details_root.cssselect("div[id='thedmsTopPanel']")[0]
            details_title = details_top_panel.cssselect("img")[0].attrib['title']
            details_panel = details_root.cssselect("div[id='thedmsDetailsPanel']")[0]
            details_text = details_panel.text_content()
        except Exception:
            print "problem fetching ", details_url
        data = {
            'event_date': tds[0].text_content(),
            'event_name': tds[1].text_content(),
            'event_venue': lxml.html.tostring(tds[2]).strip().replace('</td>', '').replace('<td>', '').replace('<br>', ' ').strip(),
            'event_time': tds[3].text_content(),
            'event_contact': tds[4].text_content(),
            'details_title': details_title,
            'details_url': details_url,
            'details_text': details_text,
            'venue': venue_id,
            'coords': coords,
        }
        # Output the data for debugging
        print data
        # print details_url
        # Finally, push the data into the sqlite store
        # scraperwiki.sqlite.save(unique_keys=['details_url'], data=data)
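        # A minimal sketch of the save step, assuming the classic scraperwiki
        # library is installed (it provides the scraperwiki.sqlite.save call
        # used in the commented-out line above); the import is guarded so the
        # scraper still runs without it.
        try:
            import scraperwiki
            scraperwiki.sqlite.save(unique_keys=['details_url'], data=data)
        except ImportError:
            pass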


# scrape_and_look_for_next_link: calls scrape_table on the page, then hunts
# for a 'next' link; if one is found, its URL is returned so the caller can
# fetch the following page.
def scrape_and_look_for_next_link(url):
    html = urllib2.urlopen(url).read()
    # print html
    root = lxml.html.fromstring(html)
    scrape_table(root)
    next_links = root.cssselect("div.thedmsBrowsePaging")[0]
    a_links = next_links.cssselect("a")
    next_link = None
    # After page 1 there are 2 anchors that move backwards and forwards.
    if len(a_links) == 1:
        next_link = a_links[0]
    elif len(a_links) > 1:
        next_link = a_links[1]
    # print next_link
    if next_link is not None:
        next_url = urlparse.urljoin(base_url, next_link.attrib.get('href'))
        print next_url
        return next_url
    else:
        print "No next link available"
        return None


# Set up the base URL and the date window for the search
base_url = "http://www.welcometosheffield.co.uk"

from_date = datetime.datetime.today().strftime('%d/%m/%Y')
to_date = (datetime.datetime.today() + datetime.timedelta(days=31)).strftime('%d/%m/%Y')
print "from ", from_date, "to", to_date

# esurl = "http://www.welcometosheffield.co.uk/dms-connect/search?dms=12&startdate="+from_date+"&enddate="+to_date
esurl = "http://www.welcometosheffield.co.uk/dms-connect/search?dms=12"
print esurl

# Walk the paginated results until no next page is found
url = esurl
while url:
    url = scrape_and_look_for_next_link(url)