# coding=utf-8
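# Scrape the members of the South African National Assembly from
# parliament.gov.za, mapping parties and provinces to Wikidata item IDs,
# and save the results to the ScraperWiki SQLite store.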
import scraperwiki
import lxml.html
import sqlite3
import re
import urllib2
import time
BASE_URL = 'https://www.parliament.gov.za/group-details/2'
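
# Fetch the members list page, identifying the scraper via its User-Agent.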
opener = urllib2.build_opener()
opener.addheaders = [('User-Agent', 'mySociety Scraper')]
response = opener.open(BASE_URL)
html = response.read()
ssRoot = lxml.html.fromstring(html)
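
# Wikidata item IDs for each party, keyed by the party code used on the site.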
PARTY_MAP = {
    'ACDP': 'Q268613',
    'AGANG SA': 'Q4969082',
    'AIC': 'Q4689795',
    'ANC': 'Q83162',
    'APC': 'Q384266',
    'COPE': 'Q1125988',
    'DA': 'Q761877',
    'EFF': 'Q15613585',
    'FF PLUS': 'Q510163',
    'IFP': 'Q654444',
    'NFP': 'Q6972795',
    'PAC': 'Q775460',
    'UDM': 'Q1788070',
}
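
# Wikidata item IDs for each province, used as members' districts.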
DISTRICT_MAP = {
    'Eastern Cape': 'Q130840',
    'Free State': 'Q160284',
    'Gauteng': 'Q133083',
    'KwaZulu-Natal': 'Q81725',
    'Limpopo': 'Q134907',
    'Mpumalanga': 'Q132410',
    'North West': 'Q165956',
    'Northern Cape': 'Q132418',
    'Western Cape': 'Q127167',
}
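
# Every member appears as a link in the body of the list page.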
linksList = ssRoot.cssselect('div.page-content li a')
parsedMembers = []
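
# Visit each /person-details/<id> link and build one record per member.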
personPattern = re.compile(r'^/person-details/([0-9]+)$')
for link in linksList:
    href = link.attrib['href']
    match = personPattern.match(href)
    if match:
        memberData = {}
        memberData['id'] = match.group(1)
        memberData['url'] = 'https://www.parliament.gov.za/person-details/' + memberData['id']
        # Fetch the member's detail page, re-using the opener built above.
        response = opener.open(memberData['url'])
        html = response.read()
        memberRoot = lxml.html.fromstring(html)
        # The page header holds the honorific followed by the name.
        nameString = memberRoot.cssselect('div.page-header h4')[0].text.strip()
        nameRegex = re.search('(.+?) (.+)', nameString)
        memberData['honorific'] = nameRegex.group(1)
        memberData['name'] = nameRegex.group(2)
        # The page body mixes role, party and list membership into prose, so
        # parse the raw HTML with regexes below rather than walking the DOM.
        # Members sit via either the national list or a provincial list.
        nationalRegex = re.compile(r'On the <strong>national list</strong>\.')
        provinceRegex = re.compile(r'On the <strong>provincial list</strong> for the province of <strong>(.+?)</strong>\.')
        provinceMatch = provinceRegex.search(html)
        if nationalRegex.search(html):
            memberData['type'] = 'national'
        elif provinceMatch:
            memberData['type'] = 'provincial'
            memberData['district'] = provinceMatch.group(1)
            if memberData['district'] in DISTRICT_MAP:
                memberData['district_id'] = DISTRICT_MAP[memberData['district']]
            else:
                print '(!) Missing district ID for {}'.format(memberData['district'])
        else:
            memberData['type'] = 'unknown'
            print '(!) Unknown member type!'
        # Party membership: capture both the party code and its display name.
        partyRegex = re.search(r'Member of the <strong><a href="/party-details/(.+?)">(.+?)</a></strong>', html)
        memberData['party_code'] = partyRegex.group(1)
        memberData['party_name'] = partyRegex.group(2)
        if memberData['party_code'] in PARTY_MAP:
            memberData['party_id'] = PARTY_MAP[memberData['party_code']]
        else:
            print '(!) Missing party ID for {}'.format(memberData['party_code'])
        print memberData['name']
        parsedMembers.append(memberData)
        # Pause between requests to be polite to the parliament site.
        time.sleep(0.5)
print 'Counted {} Members'.format(len(parsedMembers))
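
# Rebuild the table from scratch so members who have left don't linger.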
try:
    scraperwiki.sqlite.execute('DELETE FROM data')
except sqlite3.OperationalError:
    # The table doesn't exist yet on the very first run.
    pass
scraperwiki.sqlite.save(
    unique_keys=['id'],
    data=parsedMembers)