-
Notifications
You must be signed in to change notification settings - Fork 1
/
metadata.py
203 lines (171 loc) · 8.24 KB
/
metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import ast
import os
import time
from subprocess import Popen

import esy.osm.pbf
import pandas as pd
import requests
from shapely.geometry import Polygon
from tqdm import tqdm

import osm_tags
def read_metadata(country_codes):
    """Load and concatenate the per-country POI metadata CSVs.

    Parameters
    ----------
    country_codes : iterable of str
        ISO country codes; for each code the file
        ``country_metadata/{code}-metadata.csv.gz`` must already exist
        (produced by ``download_and_preprocess_metadata``).

    Returns
    -------
    pandas.DataFrame
        Rows of all requested countries, with the ``poi_groups`` column
        parsed from its CSV string representation back into tuples.
    """
    # ast.literal_eval instead of eval: the column only ever contains tuple
    # literals written by download_and_preprocess_metadata, and literal_eval
    # cannot execute arbitrary code smuggled into the CSV.
    return_df = pd.concat([
        pd.read_csv(f"country_metadata/{c}-metadata.csv.gz",
                    converters={"poi_groups": ast.literal_eval})
        for c in country_codes
    ])
    return return_df
def download_and_preprocess_metadata(country_codes,
                                     redownload: bool,
                                     reprocess: bool,
                                     cleanup: bool = False) -> None:
    """Build ``country_metadata/{code}-metadata.csv.gz`` for every code.

    For each country code this (1) downloads the full OSM ``.pbf`` extract
    from the URL listed in ``other_data/country_code_mapping.csv``, (2) shells
    out to osmosis to keep only the tags declared by
    ``osm_tags.get_osm_tag_mapping()``, and (3) converts the filtered file
    into a gzipped CSV with columns id/name/longitude/latitude/poi_groups.

    Parameters
    ----------
    country_codes : iterable of str
        ISO codes; each must appear in the mapping CSV's ``iso`` column.
    redownload : bool
        Re-download the unfiltered extract even when it already exists.
    reprocess : bool
        Re-run the osmosis filter and CSV conversion even when the outputs
        already exist.
    cleanup : bool, optional
        When True, delete the large unfiltered ``.pbf`` after processing.
    """
    # load index: iso code -> download URL
    # NOTE(review): `squeeze=True` was removed from read_csv in pandas 2.0;
    # newer pandas needs `.squeeze("columns")` instead — confirm the pinned
    # pandas version.
    mapping = pd.read_csv("other_data/country_code_mapping.csv",
                          squeeze=True,
                          index_col="iso").to_dict()
    # create country metadata dir
    dir_name = "country_metadata"
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    osm_mapping = osm_tags.get_osm_tag_mapping()
    # other data-structures that are needed later on:
    # osm_dict maps a full "key=value" tag to its route-report group name
    osm_dict = osm_mapping.set_index("osm_tag")["route-report-group"].to_dict()
    # set of all tags we care about, for fast intersection in get_poi_group
    all_osm_tags = set(osm_mapping["osm_tag"])
    # convert the osm-tags that are requested into a string that osmosis can
    # read: group "key=value" tags by key into "key=v1,v2,..." filter chunks
    osm_mapping[["primary",
                 "secondary"]] = osm_mapping["osm_tag"].str.split("=",
                                                                  expand=True)
    osm_mapping = osm_mapping.groupby("primary")["secondary"].apply(
        lambda x: ",".join(x.drop_duplicates())).reset_index()
    osm_mapping[
        "merged"] = osm_mapping["primary"] + "=" + osm_mapping["secondary"]
    osmosis_string = " ".join(osm_mapping["merged"])

    def get_poi_group(row):
        """Map one row of OSM tag values to its tuple of POI group names.

        NOTE: relies on the closure variable ``index`` (the tag-column names
        of the ``poi`` frame), which is only assigned further down in the
        loop, before ``poi.apply`` calls this — late-binding closure.
        """
        # TODO can this function be faster?
        # find osm_tags that we are interested in
        tags = set([i + "=" + j for i, j in zip(index, row)])
        osm_tags_in_common = list(tags.intersection(all_osm_tags))
        # translate osm_tag to group and drop dupes
        poi_groups = list(
            set([osm_dict[osm_tag] for osm_tag in osm_tags_in_common]))
        # sanity checks (NOTE(review): asserts are stripped under `python -O`)
        assert len(poi_groups) > 0, "could not find poi group"
        # we assume there is only one matching (see above)
        return tuple(poi_groups)

    for code in tqdm(country_codes,
                     desc="Checking country metadata",
                     leave=False):
        print(f"working on {code}")
        fname = f"{dir_name}/{code}-metadata-unfiltered.osm.pbf"
        filter_fname = f"{dir_name}/{code}-metadata.osm.pbf"
        filter_fname_csv = f"{dir_name}/{code}-metadata.csv.gz"
        # get URL for country code
        url = mapping[code]
        # ------------------------------------------------------------
        # 1) Download complete file
        # ------------------------------------------------------------
        # download file
        # progress bar adopted from https://stackoverflow.com/a/62113293/8832008
        # if next file not there or redownload
        # filtered thing is always present in both versions:
        # the filtered .pbf and its CSV are written together in the same run,
        # so either both exist or neither does
        assert (not os.path.exists(filter_fname)
                and not os.path.exists(filter_fname_csv)) or (
                    os.path.exists(filter_fname)
                    and os.path.exists(filter_fname_csv))
        # download when: forced; or a reprocess needs the missing raw file;
        # or nothing for this country exists yet
        if redownload or (not os.path.exists(fname) and reprocess) or (
                not os.path.exists(fname) and
                (not os.path.exists(filter_fname)
                 and not os.path.exists(filter_fname_csv))):
            resp = requests.get(url, stream=True)
            total = int(resp.headers.get('content-length', 0))
            with open(fname, 'wb') as file, tqdm(desc=f"Downloading {code}",
                                                 total=total,
                                                 unit='iB',
                                                 unit_scale=True,
                                                 unit_divisor=1024,
                                                 leave=False) as bar:
                for data in resp.iter_content(chunk_size=1024):
                    size = file.write(data)
                    bar.update(size)
        # ------------------------------------------------------------
        # 2) Filter osm file by our tags and save using country code
        # ------------------------------------------------------------
        # TODO maybe try to skip osmosis and do filtering with pandas
        if not os.path.exists(filter_fname) or reprocess:
            # TODO add progress bar for this
            # first, we look for ways and then the nodes that define the way/area using --used-node
            # next, we look for nodes that match our filter
            # third, we merge both and save
            # (the remove new line part is only there so that we can use new
            # lines. the shell does not like new lines)
            osmosis_string_temp = f"""osmosis
                --read-pbf-fast country_metadata/{code}-metadata-unfiltered.osm.pbf workers=8
                --tf accept-ways {osmosis_string}
                --tf reject-relations
                --used-node
                --read-pbf-fast country_metadata/{code}-metadata-unfiltered.osm.pbf workers=8
                --tf accept-nodes {osmosis_string}
                --tf reject-relations
                --tf reject-ways
                --merge
                --write-pbf country_metadata/{code}-metadata.osm.pbf""".replace("\n", "")
            # launch osmosis process and wait until its done
            # NOTE(review): shell=True with an interpolated command string;
            # inputs come from local config files, but consider an argv list.
            Popen(osmosis_string_temp, shell=True).wait()
            # ------------------------------------------------------------
            # 3) Convert to CSV and get poi-groups
            # ------------------------------------------------------------
            # read poi file and process --> then save as csv
            # ------------------------------------------------------------
            osm = esy.osm.pbf.File(filter_fname)
            # assumption that does not hold: nodes without tags are waypoints for ways
            # first get waypoint IDs (first pass over the file)
            waypoint_ids = []
            for item in osm:
                if isinstance(item, esy.osm.pbf.Way):
                    waypoint_ids += item.refs
            waypoint_ids = set(waypoint_ids)
            # then extract waypoints (second pass)
            waypoints = dict()
            for item in osm:
                if item.id in waypoint_ids and isinstance(
                        item, esy.osm.pbf.Node):
                    # ID -> (lon,lat)
                    waypoints[item.id] = item.lonlat
            tags = []
            lonlat = []
            ids = []
            # get nodes and waypoints (third pass builds the POI rows)
            for item in osm:
                # skip waypoints
                if item.id in waypoint_ids:
                    continue
                # if we have nodes simply add them to list
                if isinstance(item, esy.osm.pbf.Node):
                    tags.append(item.tags)
                    lonlat.append(item.lonlat)
                    ids.append(item.id)
                # if we have ways then get way points and compute center
                if isinstance(item, esy.osm.pbf.Way):
                    # polygon needs at least 3 points
                    if len(item.refs) >= 3:
                        center = Polygon([
                            waypoints[waypointid] for waypointid in item.refs
                        ]).centroid
                        lonlat.append((center.x, center.y))
                    else:
                        # just taking the first waypoint if there is less than 3
                        lonlat.append(waypoints[item.refs[0]])
                    tags.append(item.tags)
                    ids.append(item.id)
            poi = pd.DataFrame.from_dict(tags)
            # `index` feeds the get_poi_group closure defined above
            index = list(poi.columns)
            poi = poi.fillna("")
            poi["poi_groups"] = poi.apply(get_poi_group, axis=1)
            poi["lonlat"] = lonlat
            poi["id"] = ids
            poi["longitude"] = poi["lonlat"].str[0]
            poi["latitude"] = poi["lonlat"].str[1]
            poi[["id", "name", "longitude", "latitude",
                 "poi_groups"]].to_csv(filter_fname_csv,
                                       compression="gzip",
                                       index=False)
            # optionally drop the large unfiltered download; the download
            # branch above guarantees fname exists whenever we processed
            if cleanup:
                os.remove(fname)