-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgtfs2geojson.py
293 lines (234 loc) · 8.46 KB
/
gtfs2geojson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/env python
"""
gtfs2geojson
Converts GTFS data into GeoJSON format
Copyright 2014-2015 Michael Farrell <http://micolous.id.au>
License: 3-clause BSD, see COPYING
"""
import geojson, argparse, csv, zipfile
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from datetime import timedelta
from decimal import Decimal
def gtfs_stops(gtfs, output_f):
	"""
	Convert every stop in the feed's stops.txt into a GeoJSON Point feature,
	carrying across all non-empty columns as feature properties.
	:param gtfs file: Input GTFS ZIP.
	:param output_f file: Output GeoJSON file stream.
	"""
	reader = csv.reader(swallow_windows_unicode(gtfs.open('stops.txt', 'r')))
	layer = geojson.FeatureCollection([])
	# assume WGS84 CRS
	layer.crs = geojson.crs.Named('urn:ogc:def:crs:OGC:1.3:CRS84')
	columns = reader.next()
	lat_idx = columns.index('stop_lat')
	lng_idx = columns.index('stop_lon')
	id_idx = columns.index('stop_id')
	for record in reader:
		latitude = Decimal(record[lat_idx])
		longitude = Decimal(record[lng_idx])
		# Every non-empty column except the raw coordinate columns becomes a
		# property on the feature.
		attributes = dict(
			(name, record[i])
			for i, name in enumerate(columns)
			if name not in ('stop_lat', 'stop_lon') and record[i] != ''
		)
		layer.features.append(geojson.Feature(
			geometry=geojson.Point(
				coordinates=(longitude, latitude)
			),
			properties=attributes,
			id=record[id_idx]
		))
	geojson.dump(layer, output_f)
def time_as_timedelta(time):
	"""
	Parse a GTFS HH:MM:SS time-of-day string into a timedelta.

	GTFS uses hour values of 24 and above to mean "past midnight, tomorrow",
	so the result may exceed one day.

	:returns: the parsed timedelta, or None when the string is not three
		colon-separated integers.
	"""
	pieces = time.split(':')
	try:
		hours, minutes, seconds = [int(piece) for piece in pieces]
	except ValueError:
		# wrong number of fields, or a non-numeric field
		return None
	return timedelta(hours=hours, minutes=minutes, seconds=seconds)
def gtfs_routes(gtfs, output_f):
	"""
	For each route, convert its most-used trip 'shape' into a GeoJSON
	LineString, and make all of the route's attributes available as feature
	properties (plus shape_id, shape_refs, optionally shape_length, and
	duration_sec).
	:param gtfs file: Input GTFS ZIP
	:param output_f file: Output GeoJSON file stream.
	"""
	# Pass 1: load stop_times.txt so we can work out, per trip, the departure
	# time at the first stop and the arrival time at the last stop.
	stoptimes_c = csv.reader(swallow_windows_unicode(gtfs.open('stop_times.txt', 'r')))
	header = stoptimes_c.next()
	trip_id_col = header.index('trip_id')
	arrtime_col = header.index('arrival_time')
	deptime_col = header.index('departure_time')
	stopseq_col = header.index('stop_sequence')
	# trip_id -> [earliest seq, latest seq, earliest seq dep time, latest seq arr time]
	trip_times = {}
	for row in stoptimes_c:
		if row[trip_id_col] not in trip_times:
			# earliest seq, latest seq, earliest seq dep time, latest seq dep time
			trip_times[row[trip_id_col]] = [None, None, None, None]
		arrtime = time_as_timedelta(row[arrtime_col])
		deptime = time_as_timedelta(row[deptime_col])
		if arrtime is None or deptime is None:
			# bad data, skip!
			# NOTE(review): a trip whose every row is bad keeps [None]*4 here,
			# which would make the duration subtraction below raise — confirm
			# feeds never do this.
			continue
		seq = int(row[stopseq_col])
		# Find if this is an earlier item in the sequence
		if trip_times[row[trip_id_col]][0] is None or trip_times[row[trip_id_col]][0] > seq:
			trip_times[row[trip_id_col]][0] = seq
			trip_times[row[trip_id_col]][2] = deptime
		# Find if this is an later item in the sequence
		if trip_times[row[trip_id_col]][1] is None or trip_times[row[trip_id_col]][1] < seq:
			trip_times[row[trip_id_col]][1] = seq
			trip_times[row[trip_id_col]][3] = arrtime
	# Pass 2: load the shapes into a map that we can lookup.
	# We should do all the geometry processing here so that we only have to do
	# this once-off.
	shapes_c = csv.reader(swallow_windows_unicode(gtfs.open('shapes.txt', 'r')))
	header = shapes_c.next()
	shape_id_col = header.index('shape_id')
	shape_lat_col = header.index('shape_pt_lat')
	shape_lng_col = header.index('shape_pt_lon')
	shape_seq_col = header.index('shape_pt_sequence')
	# shape_dist_traveled is an optional GTFS column; None when absent.
	shape_dist_col = header.index('shape_dist_traveled') if 'shape_dist_traveled' in header else None
	# shape_id -> {sequence ordinal -> (lng, lat)}, flattened to a point list below
	shapes = {}
	# shape_id -> greatest shape_dist_traveled seen (i.e. total length per GTFS)
	shape_lengths = {}
	for row in shapes_c:
		if row[shape_id_col] not in shapes:
			shapes[row[shape_id_col]] = {}
		shapes[row[shape_id_col]][int(row[shape_seq_col])] = (Decimal(row[shape_lng_col]), Decimal(row[shape_lat_col]))
		# Calculate length according to GTFS
		# This could also be calculated by the geometry, but we trust GTFS, right...
		# NOTE(review): Decimal('') raises if the column exists but a cell is
		# empty — the spec allows blank shape_dist_traveled values; verify
		# against real feeds.
		if shape_dist_col is not None:
			length = Decimal(row[shape_dist_col])
			if row[shape_id_col] not in shape_lengths or shape_lengths[row[shape_id_col]] < length:
				shape_lengths[row[shape_id_col]] = length
	# translate the shapes into a LineString for use by the GeoJSON module
	# (sort each shape's points by their sequence ordinal)
	for shape_id in shapes.iterkeys():
		shape_keys = shapes[shape_id].keys()
		shape_keys.sort()
		shape = []
		for ordinal in shape_keys:
			shape.append(shapes[shape_id][ordinal])
		shapes[shape_id] = shape
	# Pass 3: make a matching dict between routes and shapes
	trips = {}      # route_id -> most-referenced shape_id (filled below)
	trips_ref = {}  # route_id -> {shape_id -> reference count}
	route_time = {} # route_id -> trip_times entry of the route's FIRST trip seen
	trips_c = csv.reader(swallow_windows_unicode(gtfs.open('trips.txt', 'r')))
	header = trips_c.next()
	route_id_col = header.index('route_id')
	shape_id_col = header.index('shape_id')
	trip_id_col = header.index('trip_id')
	for row in trips_c:
		# reference count the shapes
		if row[route_id_col] not in trips_ref:
			# route is unknown, create dict
			trips_ref[row[route_id_col]] = {}
			# NOTE(review): raises KeyError when a trip listed in trips.txt
			# has no rows at all in stop_times.txt — confirm feeds guarantee
			# this cannot happen.
			route_time[row[route_id_col]] = trip_times[row[trip_id_col]]
		if row[shape_id_col] not in trips_ref[row[route_id_col]]:
			# shape is unknown, create counter
			trips_ref[row[route_id_col]][row[shape_id_col]] = 0
		# increment counter
		trips_ref[row[route_id_col]][row[shape_id_col]] += 1
	# now we're done, iterate through the reference-counters and find the best
	# shape
	for route_id, candidate_shapes in trips_ref.iteritems():
		popular_shape, popular_shape_refs = None, 0
		for shape_id, refs in candidate_shapes.iteritems():
			if refs > popular_shape_refs:
				popular_shape, popular_shape_refs = shape_id, refs
		# now we should have the route's shape
		assert popular_shape is not None, 'Couldn\'t find a shape for route %r' % route_id
		trips[route_id] = popular_shape
	# Cleanup unused variables.  (route_time still aliases the individual
	# per-trip lists, so only the dict itself is released here.)
	del trip_times
	# lets setup our output file
	output_layer = geojson.FeatureCollection([])
	# assume WGS84 CRS
	output_layer.crs = geojson.crs.Named('urn:ogc:def:crs:OGC:1.3:CRS84')
	# Pass 4: now we have all the shapes available, translate the routes
	routes_c = csv.reader(swallow_windows_unicode(gtfs.open('routes.txt', 'r')))
	header = routes_c.next()
	route_id_col = header.index('route_id')
	for row in routes_c:
		# make dict of other properties
		props = dict()
		for i, h in enumerate(header):
			if row[i] != '':
				props[h] = row[i]
		if row[route_id_col] not in trips:
			# Route has no trips!
			print "Warning: route has no trips, skipping: %r" % (row,)
			continue
		props['shape_id'] = trips[row[route_id_col]]
		props['shape_refs'] = trips_ref[row[route_id_col]][props['shape_id']]
		if shape_dist_col is not None:
			props['shape_length'] = shape_lengths[props['shape_id']]
		# duration of the route's first-seen trip: last-stop arrival minus
		# first-stop departure
		props['duration_sec'] = (route_time[row[route_id_col]][3] - route_time[row[route_id_col]][2]).total_seconds()
		output_layer.features.append(geojson.Feature(
			geometry=geojson.LineString(
				coordinates=shapes[trips[row[route_id_col]]]
			),
			properties=props,
			id=row[route_id_col]
		))
	# now flush the GeoJSON layer to a file.
	geojson.dump(output_layer, output_f)
def swallow_windows_unicode(fileobj, rewind=True):
	"""
	Skip a UTF-8 byte-order mark ('\\xef\\xbb\\xbf') at the start of a stream.

	Windows programs (specifically, Notepad) put these bytes at the start of
	a Unicode text file; this is used to handle "utf-8-sig" files.

	:param fileobj file: Stream to inspect.
	:param rewind bool: When True (the default) and no BOM is present, the
		stream is restored to its starting position.  For unseekable streams
		the consumed bytes are spliced back via a StringIO wrapper instead.
	:returns: fileobj (or an equivalent stream) fast-forwarded past the BOM,
		or positioned at the original data when no BOM is present.
	"""
	pos = None
	if rewind:
		try:
			pos = fileobj.tell()
		except (AttributeError, IOError, OSError):
			# .tell() unsupported (e.g. a pipe); fall back to re-wrapping below.
			pos = None
	# read() at end-of-file returns an empty string rather than raising, so no
	# exception handling is needed here.  (The previous code caught an
	# exception, seeked to a possibly-None position, then compared an
	# undefined `bom` variable.)
	bom = fileobj.read(3)
	if bom == '\xef\xbb\xbf':
		# BOM found: leave the stream positioned just past it.
		return fileobj
	# Bytes not present, rewind the stream
	if rewind:
		if pos is None:
			# .tell is not supported, dump the file contents into a StringIO
			fileobj = StringIO(bom + fileobj.read())
		else:
			fileobj.seek(pos)
	return fileobj
def main():
	"""
	Command-line entry point: parse options, open the GTFS ZIP, and run the
	selected conversion (routes or stops) against the output stream.
	"""
	arg_parser = argparse.ArgumentParser()
	arg_parser.add_argument('-o', '--output',
		required=True,
		type=argparse.FileType('wb'),
		help='Output GeoJSON file'
	)
	arg_parser.add_argument('input_gtfs',
		type=argparse.FileType('rb'),
		help='Path to GTFS ZIP file to extract data from.')
	mode_group = arg_parser.add_mutually_exclusive_group(required=True)
	mode_group.add_argument('-r', '--routes',
		action='store_true',
		help='Route conversion mode')
	mode_group.add_argument('-s', '--stops',
		action='store_true',
		help='Stop conversion mode')
	opts = arg_parser.parse_args()
	# The mutually-exclusive group is required, so exactly one mode is set.
	assert opts.routes or opts.stops
	# Open ZIP
	archive = zipfile.ZipFile(opts.input_gtfs, 'r')
	if opts.routes:
		gtfs_routes(archive, opts.output)
	elif opts.stops:
		gtfs_stops(archive, opts.output)
# Run the converter only when executed as a script, not on import.
if __name__ == '__main__':
	main()