Profiler; optimize generate_edge_and_wait_values (#14)
* new profiler method

* test perf script

* indexing optimization

* drop sort operations

* update comments

* rename profile script

* undo change in file name and dir

* py 3.5 compatible

* update test_generate_summary_graph_elements

* drop f string
kuanb authored Dec 25, 2017
1 parent e1ee37d commit eade24e
Showing 4 changed files with 96 additions and 24 deletions.
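The core of the change is in the peartree/summarizer.py diff below: instead of re-scanning feed.trips with a boolean mask on every route iteration, the trips table is indexed by route_id once up front, so each per-route lookup becomes a direct index probe. A minimal sketch of the two approaches, using a hypothetical trips table in place of a real GTFS feed:

import pandas as pd

# Hypothetical stand-in for feed.trips
trips = pd.DataFrame({
    'route_id': ['A', 'A', 'B', 'C'],
    'trip_id': ['t1', 't2', 't3', 't4'],
})

# Before: a full boolean scan of the table for every route
by_mask = trips[trips['route_id'] == 'A']

# After: build the index once, then look up each route directly
ftrips = trips.set_index('route_id', drop=False)
by_loc = ftrips.loc['A']

# Both approaches select the same trips
assert by_mask['trip_id'].tolist() == by_loc['trip_id'].tolist()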
3 changes: 3 additions & 0 deletions Makefile
@@ -1,2 +1,5 @@
 test:
 	PYTHONPATH=. MPLBACKEND="agg" coverage run --source peartree -m py.test --verbose
+
+performance:
+	PYTHONPATH=. MPLBACKEND="agg" pytest profiler/test_graph_assembly.py -s
39 changes: 15 additions & 24 deletions peartree/summarizer.py
@@ -238,14 +238,22 @@ def generate_summary_wait_times(df: pd.DataFrame) -> pd.DataFrame:
 def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
                                   target_time_start: int,
                                   target_time_end: int) -> Tuple[pd.DataFrame]:
+    ftrips = feed.trips.copy()
+    ftrips = ftrips[~ftrips['route_id'].isnull()]
+    ftrips = ftrips.set_index('route_id', drop=False)
+
     all_edge_costs = None
     all_wait_times = None
     for i, route in feed.routes.iterrows():
         log('Processing on route {}.'.format(route.route_id))

         # Get all the subset of trips that are related to this route
-        route_match_mask = (feed.trips.route_id == route.route_id)
-        trips = feed.trips[route_match_mask]
+        trips = ftrips.loc[route.route_id]
+
+        # Pandas will try and make returned result a Series if there
+        # is only one result - prevent this from happening
+        if isinstance(trips, pd.Series):
+            trips = trips.to_frame().T

         # Get just the stop times related to this trip
         st_trip_id_mask = feed.stop_times.trip_id.isin(trips.trip_id)
@@ -256,8 +264,7 @@ def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
         end_time_mask = (stimes_init.arrival_time <= target_time_end)
         stimes = stimes_init[start_time_mask & end_time_mask]

-        # Let user know how it is going
-        # TODO: Make these logger.info statements
+        # Report on progress if requested
         a = len(stimes_init.trip_id.unique())
         b = len(stimes.trip_id.unique())
         log('\tReduced trips in consideration from {} to {}.'.format(a, b))
@@ -272,26 +279,10 @@ def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
                                         how='inner',
                                         on='stop_id')

-        sort_values_list = ['stop_sequence',
-                            'arrival_time',
-                            'departure_time']
-        trips_and_stop_times = trips_and_stop_times.sort_values(
-            sort_values_list)
-        trips_and_stop_times = pd.merge(trips,
-                                        stimes,
-                                        how='inner',
-                                        on='trip_id')
-
-        trips_and_stop_times = pd.merge(trips_and_stop_times,
-                                        feed.stops,
-                                        how='inner',
-                                        on='stop_id')
-
-        sort_values_list = ['stop_sequence',
-                            'arrival_time',
-                            'departure_time']
-        trips_and_stop_times = trips_and_stop_times.sort_values(
-            sort_values_list)
+        sort_list = ['stop_sequence',
+                     'arrival_time',
+                     'departure_time']
+        trips_and_stop_times = trips_and_stop_times.sort_values(sort_list)

         wait_times = generate_wait_times(trips_and_stop_times)
         trips_and_stop_times['wait_dir_0'] = wait_times[0]
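One subtlety the diff guards against: when a .loc lookup on a non-unique index matches exactly one row, pandas returns that row as a Series rather than a one-row DataFrame. A small sketch of the behavior and of the to_frame().T promotion used above, with hypothetical data rather than a real feed:

import pandas as pd

trips = pd.DataFrame({
    'route_id': ['A', 'A', 'B'],
    'trip_id': ['t1', 't2', 't3'],
}).set_index('route_id', drop=False)

many = trips.loc['A']  # two matching rows -> DataFrame
one = trips.loc['B']   # single matching row -> Series

assert isinstance(many, pd.DataFrame)
assert isinstance(one, pd.Series)

# The same guard applied in the diff: promote the row back to a frame
if isinstance(one, pd.Series):
    one = one.to_frame().T
assert list(one.columns) == ['route_id', 'trip_id']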
13 changes: 13 additions & 0 deletions tests/test_graph.py
@@ -36,3 +36,16 @@ def test_generate_summary_graph_elements():
     t = summary_edge_costs.to_stop_id
     z = list(zip(f, t))
     assert len(list(set(z))) == len(z)
+
+    # Ensure that the wait times dataframe looks as it should
+    wt_cols = ['avg_cost', 'stop_id']
+    for c in wt_cols:
+        assert c in wait_times_by_stop.columns
+
+    # Sanity check edge costs
+    mask = (wait_times_by_stop.avg_cost < 0)
+    assert len(wait_times_by_stop[mask]) == 0
+
+    # Make sure that stop ids are unique
+    u = wait_times_by_stop.stop_id.unique()
+    assert len(u) == len(wait_times_by_stop)
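As an aside, the same invariants can be expressed with pandas' built-in checks; a sketch against a hypothetical frame shaped like wait_times_by_stop, not part of the commit:

import pandas as pd

# Hypothetical frame mirroring the expected shape of wait_times_by_stop
wait_times_by_stop = pd.DataFrame({
    'stop_id': ['s1', 's2', 's3'],
    'avg_cost': [12.0, 0.0, 45.5],
})

# No negative average waits
assert (wait_times_by_stop['avg_cost'] >= 0).all()

# Exactly one row per stop
assert wait_times_by_stop['stop_id'].is_unique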
65 changes: 65 additions & 0 deletions tests/test_graph_assembly.py
@@ -0,0 +1,65 @@
+import os
+from time import time
+
+from peartree.graph import generate_empty_md_graph, populate_graph
+from peartree.paths import get_representative_feed
+from peartree.summarizer import (generate_edge_and_wait_values,
+                                 generate_summary_edge_costs,
+                                 generate_summary_wait_times)
+
+
+def fixture(filename):
+    return os.path.join('tests', 'fixtures', filename)
+
+
+def test_feed_to_graph_performance():
+    # Replicate the original workflow of the graph creation path
+    # but open up to expose to benchmarking/performance profiling
+    start = 7 * 60 * 60
+    end = 10 * 60 * 60
+
+    print('Running time profiles on each major '
+          'function in graph generation workflow')
+
+    a = time()
+    path = fixture('samtrans-2017-11-28.zip')
+    feed = get_representative_feed(path)
+    elapsed = round(time() - a, 2)
+    print('Perf of get_representative_feed: {}s'.format(elapsed))
+
+    fl = len(feed.routes)
+    print('Iteration on {} routes.'.format(fl))
+
+    a = time()
+    (all_edge_costs,
+     all_wait_times) = generate_edge_and_wait_values(feed,
+                                                     start,
+                                                     end)
+    elapsed = round(time() - a, 2)
+    print('Perf of generate_edge_and_wait_values: {}s'.format(elapsed))
+
+    a = time()
+    summary_edge_costs = generate_summary_edge_costs(all_edge_costs)
+    elapsed = round(time() - a, 2)
+    print('Perf of generate_summary_edge_costs: {}s'.format(elapsed))
+
+    a = time()
+    wait_times_by_stop = generate_summary_wait_times(all_wait_times)
+    elapsed = round(time() - a, 2)
+    print('Perf of generate_summary_wait_times: {}s'.format(elapsed))
+
+    a = time()
+    G = generate_empty_md_graph('foo')
+    elapsed = round(time() - a, 2)
+    print('Perf of generate_empty_md_graph: {}s'.format(elapsed))
+
+    a = time()
+    G = populate_graph(G,
+                       'bar',
+                       feed,
+                       wait_times_by_stop,
+                       summary_edge_costs,
+                       50,
+                       4.5)
+    elapsed = round(time() - a, 2)
+    print('Perf of populate_graph: {}s'.format(elapsed))
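The test repeats the same time()/round()/print pattern for every stage. If that pattern grows, a small context manager could factor it out; this is a hypothetical helper, not part of the commit, and it sticks to str.format for the same Python 3.5 compatibility the commit message mentions:

from contextlib import contextmanager
from time import time


@contextmanager
def timed(name):
    # Time the wrapped block and print in the test's output format
    start = time()
    yield
    elapsed = round(time() - start, 2)
    print('Perf of {}: {}s'.format(name, elapsed))


# Usage, mirroring one stage of the test:
# with timed('generate_empty_md_graph'):
#     G = generate_empty_md_graph('foo')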
