#!/usr/bin/env python3
# German cities parser
# Fetches the list of German cities from Wikipedia together with basic
# information about each of them: name, state, district, population, area
# and coordinates.
# Copyright Andrey Zhidenkov, 2019-2020 (c)
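#
# Typical usage (a sketch, assuming the parselab package is installed and
# CACHE_PATH points to a writable cache directory used by FileCache below):
#   ./update.py > cities.json         # parse the full city list
#   ./update.py --url /wiki/Berlin    # parse a single page ('/wiki/Berlin' is illustrative)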
import os
import re
import sys
import json
import argparse
from lxml.html import fromstring
from parselab.parsing import BasicParser
from parselab.network import NetworkManager
from parselab.cache import FileCache

URL_ROOT = 'https://en.m.wikipedia.org'


class App(BasicParser):

    def __init__(self):
        parser = argparse.ArgumentParser(description='Parse German cities from Wikipedia')
        parser.add_argument('--url', help='Process only this URL', type=str, required=False)
        self.args = parser.parse_args()
        self.cache = FileCache(namespace='germany-cities', path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()
        # Collected city records, one dict per city (an instance attribute, so
        # it is not shared between instances as a mutable class attribute).
        self.data = []

    def get_url(self, url):
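        # List pages link cities with relative hrefs; make them absolute.
        # Illustrative example: '/wiki/Berlin' -> 'https://en.m.wikipedia.org/wiki/Berlin'.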
        if url.startswith('https://'):
            return url
        return '%s%s' % (URL_ROOT, url)

    def get_city_info(self, url):
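        # Returns a dict shaped roughly like this (values are illustrative,
        # not real data); the 'state' key is added later in run():
        #   {'name': 'Aachen', 'district': 'Aachen', 'population': '245000',
        #    'area': '160.85', 'coords': {'lat': '50.776', 'lon': '6.084'}}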
        def get_td(th):
            # Value cell immediately to the right of the header cell.
            td = th.xpath('./following-sibling::td[1]')
            return td[0].text_content().strip()

        def get_population(th):
            # The figure sits in the first cell of the row below the header.
            td = th.getparent().xpath('./following-sibling::tr//td[1]')
            return td[0].text_content().replace(',', '')

        def get_area(th):
            # Strip the unit ('\xa0' is a non-breaking space before 'km').
            td = th.getparent().xpath('./following-sibling::tr//td[1]')
            return td[0].text_content().split('\xa0km')[0]

        info = {}
        page = self.get_page(self.get_url(url))
        if self.args.url:
            print(self.cache.get_cached_filename(url), file=sys.stderr)
        html = fromstring(page)
        th1 = html.xpath('.//table[contains(@class, "geography")]//tbody//tr//th[1]//div[@style="display:inline"]')
        info['name'] = th1[0].text_content().strip()
        # Coordinates are rendered as 'lat; lon' inside a span with class "geo".
        geo = html.xpath('.//span[@class="geo"]')
        if geo:  # xpath() returns a (possibly empty) list, never None
            coords = geo[0].text_content().split('; ')
            info['coords'] = {'lat': coords[0], 'lon': coords[1]}
        for th in html.xpath('.//table[contains(@class, "geography")][1]//tr//th'):
            title = th.text_content().strip()
            if title == 'District':
                info['district'] = get_td(th)
            elif title.startswith('Population'):
                info['population'] = get_population(th)
            elif title in ('Area', 'Area[1]') and 'area' not in info:
                # We only need the first occurrence of 'Area';
                # FIXME: the 'Area[1]' variant is only needed for Berlin
                info['area'] = get_area(th)
        return info

    def get_state(self, li):
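        # List entries are assumed to look like 'Aachen (North Rhine-Westphalia)';
        # the state is the trailing parenthesised part.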
        return re.search(r'\(([^()]+)\)$', li.text_content())[1]

    def run(self):
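        # Two modes: with --url, parse a single city page and dump its JSON;
        # otherwise walk the whole Wikipedia list and print one sorted JSON array.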
        if self.args.url is not None:
            info = self.get_city_info(self.args.url)
            print(json.dumps(info))
            sys.exit(0)
        page = self.get_page('https://en.m.wikipedia.org/wiki/List_of_cities_and_towns_in_Germany')
        html = fromstring(page)
        for a1 in html.xpath('.//table//tbody//tr//ul//li//a[1]'):
            # Skip red links - pages that don't exist yet
            if a1.get('class') == 'new':
                continue
            info = self.get_city_info(a1.get('href'))
            if info is not None:
                # Some city pages don't mention their state, so we take it
                # from the list page entry instead.
                info['state'] = self.get_state(a1.getparent())
                print(info['name'], file=sys.stderr)
                self.data.append(info)
            else:
                print("Couldn't get info", file=sys.stderr)
        # Different states may contain cities with the same name, so sorting
        # by the city name alone is not enough for a stable order.
        output = sorted(self.data, key=lambda k: '%s|%s' % (k['name'], k['state']))
        print(json.dumps(output, ensure_ascii=False, sort_keys=True))


if __name__ == '__main__':
    app = App()
    app.run()