diff --git a/Makefile b/Makefile index ca784ad..df09bbd 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,5 @@ test: PYTHONPATH=. MPLBACKEND="agg" coverage run --source peartree -m py.test --verbose + +performance: + PYTHONPATH=. MPLBACKEND="agg" pytest profiler/test_graph_assembly.py -s diff --git a/peartree/summarizer.py b/peartree/summarizer.py index 9170e98..de176cd 100644 --- a/peartree/summarizer.py +++ b/peartree/summarizer.py @@ -238,14 +238,22 @@ def generate_summary_wait_times(df: pd.DataFrame) -> pd.DataFrame: def generate_edge_and_wait_values(feed: ptg.gtfs.feed, target_time_start: int, target_time_end: int) -> Tuple[pd.DataFrame]: + ftrips = feed.trips.copy() + ftrips = ftrips[~ftrips['route_id'].isnull()] + ftrips = ftrips.set_index('route_id', drop=False) + all_edge_costs = None all_wait_times = None for i, route in feed.routes.iterrows(): log('Processing on route {}.'.format(route.route_id)) # Get all the subset of trips that are related to this route - route_match_mask = (feed.trips.route_id == route.route_id) - trips = feed.trips[route_match_mask] + trips = ftrips.loc[route.route_id] + + # Pandas will try and make returned result a Series if there + # is only one result - prevent this from happening + if isinstance(trips, pd.Series): + trips = trips.to_frame().T # Get just the stop times related to this trip st_trip_id_mask = feed.stop_times.trip_id.isin(trips.trip_id) @@ -256,8 +264,7 @@ def generate_edge_and_wait_values(feed: ptg.gtfs.feed, end_time_mask = (stimes_init.arrival_time <= target_time_end) stimes = stimes_init[start_time_mask & end_time_mask] - # Let user know how it is going - # TODO: Make these logger.info statements + # Report on the reduction in trips under consideration a = len(stimes_init.trip_id.unique()) b = len(stimes.trip_id.unique()) log('\tReduced trips in consideration from {} to {}.'.format(a, b)) @@ -272,26 +279,10 @@ def generate_edge_and_wait_values(feed: ptg.gtfs.feed, how='inner', on='stop_id') - sort_values_list = ['stop_sequence', - 
'arrival_time', - 'departure_time'] - trips_and_stop_times = trips_and_stop_times.sort_values( - sort_values_list) - trips_and_stop_times = pd.merge(trips, - stimes, - how='inner', - on='trip_id') - - trips_and_stop_times = pd.merge(trips_and_stop_times, - feed.stops, - how='inner', - on='stop_id') - - sort_values_list = ['stop_sequence', - 'arrival_time', - 'departure_time'] - trips_and_stop_times = trips_and_stop_times.sort_values( - sort_values_list) + sort_list = ['stop_sequence', + 'arrival_time', + 'departure_time'] + trips_and_stop_times = trips_and_stop_times.sort_values(sort_list) wait_times = generate_wait_times(trips_and_stop_times) trips_and_stop_times['wait_dir_0'] = wait_times[0] diff --git a/tests/test_graph.py b/tests/test_graph.py index 7b6115f..dfac61f 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -36,3 +36,16 @@ def test_generate_summary_graph_elements(): t = summary_edge_costs.to_stop_id z = list(zip(f, t)) assert len(list(set(z))) == len(z) + + # Ensure that the wait times dataframe looks as it should + wt_cols = ['avg_cost', 'stop_id'] + for c in wt_cols: + assert c in wait_times_by_stop.columns + + # Sanity check the wait time costs + mask = (wait_times_by_stop.avg_cost < 0) + assert len(wait_times_by_stop[mask]) == 0 + + # Make sure that the stop ids are unique + u = wait_times_by_stop.stop_id.unique() + assert len(u) == len(wait_times_by_stop) diff --git a/profiler/test_graph_assembly.py b/profiler/test_graph_assembly.py new file mode 100644 index 0000000..e83343c --- /dev/null +++ b/profiler/test_graph_assembly.py @@ -0,0 +1,65 @@ +import os +from time import time + +from peartree.graph import generate_empty_md_graph, populate_graph +from peartree.paths import get_representative_feed +from peartree.summarizer import (generate_edge_and_wait_values, + generate_summary_edge_costs, + generate_summary_wait_times) + + +def fixture(filename): + return os.path.join('tests', 'fixtures', filename) + + +def test_feed_to_graph_performance(): + # 
Replicate the original workflow of the graph creation path + # but open up to expose to benchmarking/performance profiling + start = 7 * 60 * 60 + end = 10 * 60 * 60 + + print('Running time profiles on each major ' + 'function in graph generation workflow') + + a = time() + path = fixture('samtrans-2017-11-28.zip') + feed = get_representative_feed(path) + elapsed = round(time() - a, 2) + print('Perf of get_representative_feed: {}s'.format(elapsed)) + + fl = len(feed.routes) + print('Iteration on {} routes.'.format(fl)) + + a = time() + (all_edge_costs, + all_wait_times) = generate_edge_and_wait_values(feed, + start, + end) + elapsed = round(time() - a, 2) + print('Perf of generate_edge_and_wait_values: {}s'.format(elapsed)) + + a = time() + summary_edge_costs = generate_summary_edge_costs(all_edge_costs) + elapsed = round(time() - a, 2) + print('Perf of generate_summary_edge_costs: {}s'.format(elapsed)) + + a = time() + wait_times_by_stop = generate_summary_wait_times(all_wait_times) + elapsed = round(time() - a, 2) + print('Perf of generate_summary_wait_times: {}s'.format(elapsed)) + + a = time() + G = generate_empty_md_graph('foo') + elapsed = round(time() - a, 2) + print('Perf of generate_empty_md_graph: {}s'.format(elapsed)) + + a = time() + G = populate_graph(G, + 'bar', + feed, + wait_times_by_stop, + summary_edge_costs, + 50, + 4.5) + elapsed = round(time() - a, 2) + print('Perf of populate_graph: {}s'.format(elapsed))