Profiler; optimize generate_edge_and_wait_values (#14)
* new profiler method

* test perf script

* indexing optimization

* drop sort operations

* update comments

* rename profile script

* undo change in file name and dir

* py 3.5 compatible

* update test_generate_summary_graph_elements

* drop f string
kuanb authored Dec 25, 2017
1 parent e1ee37d commit eade24e
Showing 4 changed files with 96 additions and 24 deletions.
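The core of the change is in the peartree/summarizer.py diff below: instead of re-scanning feed.trips with a boolean mask on every route iteration, the trips table is indexed by route_id once up front, so each per-route lookup becomes a direct index probe. A minimal sketch of the two approaches, using a hypothetical trips table in place of a real GTFS feed:

import pandas as pd

# Hypothetical stand-in for feed.trips
trips = pd.DataFrame({
    'route_id': ['A', 'A', 'B', 'C'],
    'trip_id': ['t1', 't2', 't3', 't4'],
})

# Before: a full boolean scan of the table for every route
by_mask = trips[trips['route_id'] == 'A']

# After: build the index once, then look up each route directly
ftrips = trips.set_index('route_id', drop=False)
by_loc = ftrips.loc['A']

# Both approaches select the same trips
assert by_mask['trip_id'].tolist() == by_loc['trip_id'].tolist()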
3 changes: 3 additions & 0 deletions Makefile
@@ -1,2 +1,5 @@
 test:
 	PYTHONPATH=. MPLBACKEND="agg" coverage run --source peartree -m py.test --verbose
+
+performance:
+	PYTHONPATH=. MPLBACKEND="agg" pytest profiler/test_graph_assembly.py -s
39 changes: 15 additions & 24 deletions peartree/summarizer.py
@@ -238,14 +238,22 @@ def generate_summary_wait_times(df: pd.DataFrame) -> pd.DataFrame:
 def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
                                   target_time_start: int,
                                   target_time_end: int) -> Tuple[pd.DataFrame]:
+    ftrips = feed.trips.copy()
+    ftrips = ftrips[~ftrips['route_id'].isnull()]
+    ftrips = ftrips.set_index('route_id', drop=False)
+
     all_edge_costs = None
     all_wait_times = None
     for i, route in feed.routes.iterrows():
         log('Processing on route {}.'.format(route.route_id))

         # Get all the subset of trips that are related to this route
-        route_match_mask = (feed.trips.route_id == route.route_id)
-        trips = feed.trips[route_match_mask]
+        trips = ftrips.loc[route.route_id]
+
+        # Pandas will try and make returned result a Series if there
+        # is only one result - prevent this from happening
+        if isinstance(trips, pd.Series):
+            trips = trips.to_frame().T

         # Get just the stop times related to this trip
         st_trip_id_mask = feed.stop_times.trip_id.isin(trips.trip_id)
@@ -256,8 +264,7 @@ def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
         end_time_mask = (stimes_init.arrival_time <= target_time_end)
         stimes = stimes_init[start_time_mask & end_time_mask]

-        # Let user know how it is going
-        # TODO: Make these logger.info statements
+        # Report on progress if requested
         a = len(stimes_init.trip_id.unique())
         b = len(stimes.trip_id.unique())
         log('\tReduced trips in consideration from {} to {}.'.format(a, b))
@@ -272,26 +279,10 @@ def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
                                         how='inner',
                                         on='stop_id')

-        sort_values_list = ['stop_sequence',
-                            'arrival_time',
-                            'departure_time']
-        trips_and_stop_times = trips_and_stop_times.sort_values(
-            sort_values_list)
-        trips_and_stop_times = pd.merge(trips,
-                                        stimes,
-                                        how='inner',
-                                        on='trip_id')
-
-        trips_and_stop_times = pd.merge(trips_and_stop_times,
-                                        feed.stops,
-                                        how='inner',
-                                        on='stop_id')
-
-        sort_values_list = ['stop_sequence',
-                            'arrival_time',
-                            'departure_time']
-        trips_and_stop_times = trips_and_stop_times.sort_values(
-            sort_values_list)
+        sort_list = ['stop_sequence',
+                     'arrival_time',
+                     'departure_time']
+        trips_and_stop_times = trips_and_stop_times.sort_values(sort_list)

         wait_times = generate_wait_times(trips_and_stop_times)
         trips_and_stop_times['wait_dir_0'] = wait_times[0]
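One subtlety the diff guards against: when a .loc lookup on a non-unique index matches exactly one row, pandas returns that row as a Series rather than a one-row DataFrame. A small sketch of the behavior and of the to_frame().T promotion used above, with hypothetical data rather than a real feed:

import pandas as pd

trips = pd.DataFrame({
    'route_id': ['A', 'A', 'B'],
    'trip_id': ['t1', 't2', 't3'],
}).set_index('route_id', drop=False)

many = trips.loc['A']  # two matching rows -> DataFrame
one = trips.loc['B']   # single matching row -> Series

assert isinstance(many, pd.DataFrame)
assert isinstance(one, pd.Series)

# The same guard applied in the diff: promote the row back to a frame
if isinstance(one, pd.Series):
    one = one.to_frame().T
assert list(one.columns) == ['route_id', 'trip_id']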
13 changes: 13 additions & 0 deletions tests/test_graph.py
@@ -36,3 +36,16 @@ def test_generate_summary_graph_elements():
     t = summary_edge_costs.to_stop_id
     z = list(zip(f, t))
     assert len(list(set(z))) == len(z)
+
+    # Ensure that the wait times dataframe looks as it should
+    wt_cols = ['avg_cost', 'stop_id']
+    for c in wt_cols:
+        assert c in wait_times_by_stop.columns
+
+    # Sanity check edge costs
+    mask = (wait_times_by_stop.avg_cost < 0)
+    assert len(wait_times_by_stop[mask]) == 0
+
+    # Make sure that stop ids are unique
+    u = wait_times_by_stop.stop_id.unique()
+    assert len(u) == len(wait_times_by_stop)
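As an aside, the same invariants can be expressed with pandas' built-in checks; a sketch against a hypothetical frame shaped like wait_times_by_stop, not part of the commit:

import pandas as pd

# Hypothetical frame mirroring the expected shape of wait_times_by_stop
wait_times_by_stop = pd.DataFrame({
    'stop_id': ['s1', 's2', 's3'],
    'avg_cost': [12.0, 0.0, 45.5],
})

# No negative average waits
assert (wait_times_by_stop['avg_cost'] >= 0).all()

# Exactly one row per stop
assert wait_times_by_stop['stop_id'].is_unique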
65 changes: 65 additions & 0 deletions tests/test_graph_assembly.py
@@ -0,0 +1,65 @@
+import os
+from time import time
+
+from peartree.graph import generate_empty_md_graph, populate_graph
+from peartree.paths import get_representative_feed
+from peartree.summarizer import (generate_edge_and_wait_values,
+                                 generate_summary_edge_costs,
+                                 generate_summary_wait_times)
+
+
+def fixture(filename):
+    return os.path.join('tests', 'fixtures', filename)
+
+
+def test_feed_to_graph_performance():
+    # Replicate the original workflow of the graph creation path
+    # but open up to expose to benchmarking/performance profiling
+    start = 7 * 60 * 60
+    end = 10 * 60 * 60
+
+    print('Running time profiles on each major '
+          'function in graph generation workflow')
+
+    a = time()
+    path = fixture('samtrans-2017-11-28.zip')
+    feed = get_representative_feed(path)
+    elapsed = round(time() - a, 2)
+    print('Perf of get_representative_feed: {}s'.format(elapsed))
+
+    fl = len(feed.routes)
+    print('Iteration on {} routes.'.format(fl))
+
+    a = time()
+    (all_edge_costs,
+     all_wait_times) = generate_edge_and_wait_values(feed,
+                                                     start,
+                                                     end)
+    elapsed = round(time() - a, 2)
+    print('Perf of generate_edge_and_wait_values: {}s'.format(elapsed))
+
+    a = time()
+    summary_edge_costs = generate_summary_edge_costs(all_edge_costs)
+    elapsed = round(time() - a, 2)
+    print('Perf of generate_summary_edge_costs: {}s'.format(elapsed))
+
+    a = time()
+    wait_times_by_stop = generate_summary_wait_times(all_wait_times)
+    elapsed = round(time() - a, 2)
+    print('Perf of generate_summary_wait_times: {}s'.format(elapsed))
+
+    a = time()
+    G = generate_empty_md_graph('foo')
+    elapsed = round(time() - a, 2)
+    print('Perf of generate_empty_md_graph: {}s'.format(elapsed))
+
+    a = time()
+    G = populate_graph(G,
+                       'bar',
+                       feed,
+                       wait_times_by_stop,
+                       summary_edge_costs,
+                       50,
+                       4.5)
+    elapsed = round(time() - a, 2)
+    print('Perf of populate_graph: {}s'.format(elapsed))
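The test repeats the same time()/round()/print pattern for every stage. If that pattern grows, a small context manager could factor it out; this is a hypothetical helper, not part of the commit, and it sticks to str.format for the same Python 3.5 compatibility the commit message mentions:

from contextlib import contextmanager
from time import time


@contextmanager
def timed(name):
    # Time the wrapped block and print in the test's output format
    start = time()
    yield
    elapsed = round(time() - start, 2)
    print('Perf of {}: {}s'.format(name, elapsed))


# Usage, mirroring one stage of the test:
# with timed('generate_empty_md_graph'):
#     G = generate_empty_md_graph('foo')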
