forked from mmznrSTAT/wikidataZH_2019
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_imports.py
273 lines (225 loc) · 9.32 KB
/
data_imports.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
import pandas as pd
import requests
import zipfile
import os
import shutil
from SPARQLWrapper import SPARQLWrapper, JSON, XML
import logging
def import_kantonZH_api():
"""
Description: import data from Kanton Zürich API
Format: pandas dataframe
return: pd.Dataframe['date','id','name','population','wikidata_id']
"""
URL = "https://www.web.statistik.zh.ch:8443/gp/GP?type=EXPORT&indikatoren=133&raumtyp=1&export=csv"
r = requests.get(url=URL)
try:
os.mkdir('data')
except OSError as ex:
print(ex)
open('data/dataKt.zip', 'wb').write(r.content)
zfile = zipfile.ZipFile('data/dataKt.zip')
with zipfile.ZipFile('data/dataKt.zip', 'r') as f:
names = f.namelist()
df = pd.read_csv(zfile.open(names[0]), sep=';')
df = df.iloc[:, :-1]
df = pd.melt(df, id_vars=[df.columns[0],df.columns[1]], value_vars=df.columns[2:])
df = df.rename(columns={"value": "population", "variable":"date"})
df['date'] = df['date'].astype('str')+ '-12-31'
zfile.close()
shutil.rmtree('data')
logging.info('Import Kanton data: extracted {0} entries successful'.format(df['BFS_NR'].count()))
return df
def import_stadtZH_api():
"""
Description: import data from city Zürich API
Format: pandas dataframe
return: pd.Dataframe['BFS_NR','GEBIET_NAME','date','population']
"""
_population_resource_id = "570f006e-2f2a-4b1f-9233-c4916c753475"
_mapping_resource_id = "0090f2ed-1df9-4953-9561-5d413fd74758"
def _query_opendata_zurich(resource_id, parse_record):
result = requests.get(
f"https://data.stadt-zuerich.ch/api/3/action/datastore_search?limit=1000000&resource_id={resource_id}")
records = result.json()["result"]["records"]
parsed_rows = []
for record in records:
parsed_rows.append(parse_record(record))
return parsed_rows
def _parse_population(record):
return {"id": record["QuarSort"], "name": record["QuarLang"], "date": f"{record['StichtagDatJahr']}-12-31",
"population": record["AnzBestWir"], }
def _parse_mapping(record):
return {"id": record["QNr"], "wikidata_id": record["DataItemNr"]}
def _apply_wikidata_mapping(mapping_rows, population_rows):
mapping_dict = {row["id"]: row["wikidata_id"] for row in mapping_rows}
for population_row in population_rows:
population_row["wikidata_id"] = mapping_dict[population_row["id"]]
population_rows = _query_opendata_zurich(_population_resource_id, _parse_population)
mapping_rows = _query_opendata_zurich(_mapping_resource_id, _parse_mapping)
_apply_wikidata_mapping(mapping_rows, population_rows)
return pd.DataFrame(population_rows)
def import_swisstopowikidata_kantonZH():
"""
Description: returns BFS-ID and Wikidate-ID
Format: pandas dataframe
return: pd.Dataframe['Name','bfs','wikidata_id']
"""
endpoint_url = "https://ld.geo.admin.ch/query"
query = """
PREFIX schema: <http://schema.org/>
PREFIX gn: <http://www.geonames.org/ontology#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX st: <https://ld.geo.admin.ch/def/>
select ?Municipality ?Name ?bfs ?wikidata_id
where{
?Municipality gn:featureCode gn:A.ADM3 .
?Municipality schema:name ?Name .
?Municipality gn:population?Population .
?Municipality st:bfsNumber ?bfs .
?Municipality dct:issued ?date .
?Municipality schema:validUntil ?validuntil .
?Municipality gn:parentADM1 ?InCanton .
?InCanton schema:name ?CantonName .
FILTER (now()<=?validuntil)
FILTER (?CantonName = 'Zürich')
{SELECT DISTINCT (xsd:integer(?bfsWD) AS ?bfs)?wikidata_id WHERE {
SERVICE <https://query.wikidata.org/bigdata/namespace/wdq/sparql>
{
?wikidata_id wdt:P771 ?bfsWD .
?wikidata_id wdt:P31 wd:Q70208 .
#OPTIONAL { ?wikidata_id wdt:P18 ?Image. } .
}}}}
"""
def get_results(endpoint_url, query):
sparql = SPARQLWrapper(
endpoint_url,
agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
return sparql.query().convert()
results = get_results(endpoint_url, query)
result = results["results"]["bindings"]
mydata = []
for p in result:
mon = {'Name': p['Name']['value'],
'bfs': p['bfs']['value'],
'wikidata_id': p['wikidata_id']['value'].replace('http://www.wikidata.org/entity/', '')
}
mydata.append(mon)
df = pd.DataFrame(mydata)
df['bfs'] = df['bfs'].astype(int)
return df
def import_wikidata_kantonZH():
"""
Description: wikidata SPARQL Query to get population of Kanton Zürich
Format: pandas dataframe
return: pd.Dataframe['bfs_id','date','population','qualifier','refurl','refpublisher']
"""
# Q70208 Gemeinde
# Q11943 Kanton Zürich
# P1082 Einwohner
# P585 Zeitmpunkt
query = """
SELECT ?bfs_id ?wikidata_id ?date ?population ?qualifier (COALESCE(?refurl, "NA") as ?refurl) (COALESCE(?refpublisher, "NA") as ?refpublisher)
WHERE
{
?wikidata_id wdt:P771 ?bfs_id.
?wikidata_id wdt:P31 wd:Q70208.
?wikidata_id wdt:P131 wd:Q11943.
?wikidata_id p:P1082 ?myvalue.
?myvalue pq:P585 ?date.
?myvalue ps:P1082 ?population.
?myvalue wikibase:rank ?qualifier.
?myvalue prov:wasDerivedFrom ?refnode.
OPTIONAL{?refnode pr:P854 ?refurl.}
OPTIONAL{?refnode pr:P123 ?refpublisher.}
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
order by ?bfs_id ?date
"""
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
result = results['results']['bindings']
population = []
for p in result:
mon = {
'wikidata_id': p['wikidata_id']['value'].replace('http://www.wikidata.org/entity/', ''),
'bfs_id' : p['bfs_id']['value'],
'date': p['date']['value'],
'population': p['population']['value'],
'qualifier': p['qualifier']['value'].replace('http://wikiba.se/ontology#', ''),
'refurl' : p['refurl']['value'],
'refpublisher' : p['refpublisher']['value'],
}
population.append(mon)
pop = pd.DataFrame(population)
pop['date'] = pop.date.str.slice(0, 10)
pop['bfs_id'] = pop['bfs_id'].astype(int)
logging.info('Import Wikidata: extracted {0} entries successful'.format(pop['date'].count()))
return pop
def import_wikidata_stadtZH():
"""
Description: wikidata SPARQL Query to get population of city Zürich
Format: pandas dataframe
return: pd.Dataframe['date','popultion','qualifier','wikidata_id']
"""
# Q19644586 Quartier
# P1082 Einwohner
# P585 Zeitmpunkt
query = """
SELECT ?wikidata_id ?date ?population ?qualifier
WHERE
{
?wikidata_id wdt:P31 wd:Q19644586.
?wikidata_id p:P1082 ?myvalue.
?myvalue pq:P585 ?date.
?myvalue ps:P1082 ?population.
?myvalue wikibase:rank ?qualifier.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
result = results['results']['bindings']
population = []
for p in result:
mon = {
'wikidata_id': p['wikidata_id']['value'].replace('http://www.wikidata.org/entity/', ''),
'date': p['date']['value'].replace('-01-01', '-12-31'),
'population': p['population']['value'],
'qualifier': p['qualifier']['value'].replace('http://wikiba.se/ontology#', ''),
}
population.append(mon)
pop = pd.DataFrame(population)
pop['date'] = pop.date.str.slice(0, 10)
logging.info('Import Wikidata: extracted {0} entries successful'.format(pop['date'].count()))
return pop
def main():
# API
print("Stadt API")
data_frame = import_stadtZH_api().head()
print(data_frame)
print("Kanton API ")
data_frame = import_kantonZH_api().head()
print(data_frame)
# Linked Data
print("Kanton geo.admin.data")
data_frame = import_swisstopowikidata_kantonZH().head()
print(data_frame)
print("Kanton Wikidata")
data_frame = import_wikidata_kantonZH().head()
print(data_frame)
print("Stadt Wikidata")
data_frame = import_wikidata_stadtZH().head()
print(data_frame)
if __name__ == "__main__":
main()