Skip to content

Commit

Permalink
new technical notes
Browse files Browse the repository at this point in the history
  • Loading branch information
edasmalchi committed Dec 17, 2024
1 parent d03d5a4 commit e8b3c7f
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 17 deletions.
25 changes: 23 additions & 2 deletions high_quality_transit_areas/09_enforce_collinearity.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,16 @@
"(s[s<11]).hist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6c8f6fd-fc7f-4021-9f56-3aec9ab206ba",
"metadata": {},
"outputs": [],
"source": [
"s.quantile(.96)"
]
},
{
"cell_type": "markdown",
"id": "d839566e-7cc0-4345-b835-ce5a77ab70cc",
Expand Down Expand Up @@ -963,7 +973,7 @@
"metadata": {},
"outputs": [],
"source": [
"# m2"
"m2"
]
},
{
Expand Down Expand Up @@ -993,7 +1003,18 @@
"metadata": {},
"outputs": [],
"source": [
"# one_pts.explore(column='hqta_type')"
"one_pts.explore(column='hqta_type')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0131bb42-d63b-4142-915f-e4ce29972d35",
"metadata": {},
"outputs": [],
"source": [
"areas.to_file('hqta_areas_update.geojson')\n",
"hqta_points.to_file('hqta_points_update.geojson')"
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions high_quality_transit_areas/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
hqta_data:
# python rail_ferry_brt_stops.py
# python create_hqta_segments.py
python rail_ferry_brt_stops.py
python create_hqta_segments.py
python create_aggregate_stop_frequencies.py
python sjoin_stops_to_segments.py
python prep_pairwise_intersections.py
Expand Down
7 changes: 5 additions & 2 deletions high_quality_transit_areas/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,11 @@ For more details, see [links](#High-Quality-Transit-Areas-Relevant-Statutes) to
* Corridors are cut for each operator-route-direction every 1,250 meters.
* Stops have a 50 meter buffer drawn around them and are spatially joined to corridors.
* For each 1,250 meter corridor segment, the highest frequency stop is counted (if there are multiple stops).
* We count _all_ trips at the highest frequency stop in each corridor segment, even if they're marketed as different "routes".
* For example, if Route A serves the stop twice an hour, and Route B twice an hour, we consider the stop frequency to be 4 trips/hour, qualifying it as a high quality transit corridor and potentially a major bus stop (if an intersecting corridor is present)
* We count trips at the highest frequency stop in each corridor segment. Where routes providing similar service share a stop, we combine their trips for frequency calculation. However, we don't combine trips across routes where those routes do not provide similar service. We currently do this by counting the number of stops that different routes share in a given direction.
* For example, suppose that Route A and Route B share many stops in the same direction. They may be two routes coming together to provide combined service on a corridor while branching elsewhere, or a local and rapid route on the same corridor.
* In that case if Route A serves the stop twice an hour, and Route B twice an hour, we consider the stop frequency to be 4 trips/hour, qualifying it as a high quality transit corridor and potentially a major bus stop (if an intersecting corridor is present)
* If, however, Route A and Route B only shared one or two stops, for example starting together at a transit center but going off in different directions, we would not combine them for stop frequency calculations.
* See [technical_notes.md](technical_notes.md) for additional details.

### Why Not one "Route" at a Time?

Expand Down
16 changes: 10 additions & 6 deletions high_quality_transit_areas/create_aggregate_stop_frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from update_vars import (analysis_date, AM_PEAK, PM_PEAK, EXPORT_PATH, GCS_FILE_PATH, PROJECT_CRS,
SEGMENT_BUFFER_METERS, AM_PEAK, PM_PEAK, HQ_TRANSIT_THRESHOLD, MS_TRANSIT_THRESHOLD, SHARED_STOP_THRESHOLD,
KEYS_TO_DROP)
ROUTE_COLLINEARITY_KEYS_TO_DROP)

am_peak_hrs = list(range(AM_PEAK[0].hour, AM_PEAK[1].hour))
pm_peak_hrs = list(range(PM_PEAK[0].hour, PM_PEAK[1].hour))
Expand Down Expand Up @@ -90,7 +90,7 @@ def get_explode_multiroute_only(
multi_only_explode = (multi_only[['schedule_gtfs_dataset_key', 'stop_id', 'route_dir']].explode('route_dir'))
multi_only_explode = multi_only_explode.merge(single_hourly, on = ['route_dir', 'schedule_gtfs_dataset_key', 'stop_id'])
multi_only_explode = multi_only_explode.sort_values(['schedule_gtfs_dataset_key','stop_id', 'route_dir']) # sorting crucial for next step
print(f'{multi_only_explode.stop_id.nunique()} stops may qualify with multi-route aggregation')
# print(f'{multi_only_explode.stop_id.nunique()} stops may qualify with multi-route aggregation')
return multi_only_explode

def accumulate_share_count(route_dir_exploded: pd.DataFrame) -> None:
Expand Down Expand Up @@ -138,17 +138,17 @@ def feed_level_filter(
st_prepped = st_prepped >> filter(_.schedule_gtfs_dataset_key == gtfs_dataset_key,
_.stop_id.isin(stops_to_eval.stop_id),
)
print(f'{st_prepped.shape}')
# print(f'{st_prepped.shape}')
st_to_eval = st_prepped >> filter(_.route_dir.isin(any_appearance))
print(f'{st_to_eval.shape}')
# print(f'{st_to_eval.shape}')
# cut down problem space by checking if stops still could qual after filtering for any appearance
min_rows = min(frequency_thresholds) * len(both_peaks_hrs)
st_could_qual = (st_to_eval >> group_by(_.stop_id)
>> mutate(could_qualify = _.shape[0] >= min_rows)
>> ungroup()
>> filter(_.could_qualify)
)
print(f'{st_could_qual.shape}')
# print(f'{st_could_qual.shape}')
return st_could_qual, qualify_pairs

def check_stop(this_stop_route_dirs, qualify_pairs):
Expand Down Expand Up @@ -221,6 +221,9 @@ def filter_all_prepare_export(
max_arrivals_by_stop_single: pd.DataFrame,
frequency_thresholds: tuple
):
'''
Apply collinearity
'''
# %%time 90 seconds (on default user) is not too bad!
all_collinear = pd.DataFrame()
for gtfs_dataset_key in feeds_to_filter:
Expand Down Expand Up @@ -318,7 +321,8 @@ def stop_times_aggregation_max_by_stop(
share_counts = {}
multi_only_explode.groupby(['schedule_gtfs_dataset_key', 'stop_id']).apply(accumulate_share_count)
qualify_dict = {key: share_counts[key] for key in share_counts.keys() if share_counts[key] >= SHARED_STOP_THRESHOLD}
for key in KEYS_TO_DROP: qualify_dict.pop(key) # will error if key not present, check if situation still present and update key if needed
# will error if key not present, check if situation still present and update key if needed
for key in ROUTE_COLLINEARITY_KEYS_TO_DROP: qualify_dict.pop(key)
feeds_to_filter = np.unique([key.split('__')[0] for key in qualify_dict.keys()])

combined_export = filter_all_prepare_export(feeds_to_filter, multi_only_explode, qualify_dict,
Expand Down
62 changes: 62 additions & 0 deletions high_quality_transit_areas/technical_notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# HQ Transit Corridors/Major Transit Stops Technical Notes

## Parameter Definitions

Mostly located in [update_vars.py](update_vars.py).

* `HQ_TRANSIT_THRESHOLD`, `MS_TRANSIT_THRESHOLD`: Statutory definition, see [README.md](README.md)
* `AM_PEAK`, `PM_PEAK`: Reflect most common MPO methodologies, not defined in statute
* `HQTA_SEGMENT_LENGTH`, `SEGMENT_BUFFER_METERS`: Our definitions for splitting corridors into analysis segments and matching them to stops
* `CORRIDOR_BUFFER_METERS`: Statutory definition
* `SHARED_STOP_THRESHOLD`: Our threshold for finding similar routes (routes sharing at least this many stops with another in the same direction). Reflects the 96th percentile.

## Script Sequence and Notes

Defined via [Makefile](Makefile)

## `create_aggregate_stop_frequencies.py`

This script finds similar (collinear) routes.

### Initial steps

Get frequencies at each stop for defined peak periods, with one version only looking at the single most frequent routes per stop and another version looking at all routes per stop.

Find stops in the multi-route version that qualify at a higher threshold than they do in the single-route version. These are the stops (and thus routes and feeds) that we want to check for collinearity.

### Detailed collinearity evaluation

1. Get a list of unique feeds where at least one route_directions pair qualifies to evaluate.
1. Get stop_times filtered to that feed, and filter that to stops that only qualify with multiple routes, and route directions that pair with at least one other route_direction. Do not consider pairs between the same route in one direction and the same route in the opposite direction.
1. After that filtering, check again if stop_times includes the minimum frequency to qualify at each stop. Exclude stops where it doesn't.
1. Then... evaluate which route_directions can be aggregated at each remaining stop. From the full list of route_directions (sorted by frequency) serving the stop, use `list(itertools.combinations(this_stop_route_dirs, 2))` to get each unique pair of route_directions. Check each of those unique pairs to see if it meets the `SHARED_STOP_THRESHOLD`. If they all do, keep all stop_times entries for that stop, different route_directions can be aggregated together at that stop. If any do not, remove the least frequent route_direction and try again, until a subset passes (only keep stop_times for that subset) or until all are eliminated. Currently implemented recursively as below:

```
attempting ['103_1', '101_1', '102_1', '104_1']... subsetting...
attempting ['103_1', '101_1', '102_1']... subsetting...
attempting ['103_1', '101_1']... matched!
attempting ['103_1', '101_0', '101_1', '103_0']... subsetting...
attempting ['103_1', '101_0', '101_1']... subsetting...
attempting ['103_1', '101_0']... subsetting...
exhausted!
```
1. With that filtered stop_times, recalculate stop-level frequencies as before. Only keep stops meeting the minimum frequency threshold for a major stop or HQ corridor.
1. Finally, once again apply the `SHARED_STOP_THRESHOLD` after aggregation (by ensuring at least one route_dir at each stop has >= `SHARED_STOP_THRESHOLD` frequent stops). Exclude stops that don't meet this criterion.
#### edge cases:
[AC Transit 45](https://www.actransit.org/sites/default/files/timetable_files/45-2023_12_03.pdf) _Opposite directions share a same-direction loop._ __Solved__ by preventing the same route from being compared with itself in the opposite direction.
[SDMTS 944/945](https://www.sdmts.com/sites/default/files/routes/pdf/944.pdf) _Shared frequent stops are few, and these routes are isolated._ __Solved__ by once again applying the `SHARED_STOP_THRESHOLD` after aggregation (by ensuring at least one route_dir at each stop has >= `SHARED_STOP_THRESHOLD` frequent stops). Complex topology including a loop route; each pair of [944, 945, 945A(946)] has >= threshold... but not actually in the same spots!
### Export
Export stop-level frequencies that are a composite of single-route results, and multi-route results passing the collinearity evaluation. These will be the stop-level frequencies used in subsequent steps.
## `sjoin_stops_to_segments.py`
* Formerly, this evaluated segments from all routes against all stops. Since it was a spatial join, this meant that infrequent routes running perpendicular to frequent routes often grabbed the cross street stop from the frequent route, creating an erroneous, isolated frequent segment and often an erroneous major transit stop.
* Now, we first filter segments to only segments from routes with at least one stop-level frequency meeting the standard. This screens out entirely infrequent routes and vastly reduces the risk of false positives.
* We have also reduced the segment to stop buffer (`SEGMENT_BUFFER_METERS`) to the extent possible.
8 changes: 3 additions & 5 deletions high_quality_transit_areas/update_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,17 @@
PROJECT_CRS = "EPSG:3310"
HQTA_SEGMENT_LENGTH = 1_250 # meters
SEGMENT_BUFFER_METERS = 35 # buffer around segment to sjoin to stops
INTERSECTION_BUFFER_METERS = 152 # ~ 500ft
HALF_MILE_BUFFER_METERS = 805 # half mile ~ 805 meters
INTERSECTION_BUFFER_METERS = 152.4 # ~ 500ft
HALF_MILE_BUFFER_METERS = 804.7 # half mile
CORRIDOR_BUFFER_METERS = HALF_MILE_BUFFER_METERS - SEGMENT_BUFFER_METERS # 755 meters
EXPORT_PATH = f"{GCS_FILE_PATH}export/{analysis_date}/"

AM_PEAK = (dt.time(hour=6), dt.time(hour=9)) # time range to calculate average peak frequencies
PM_PEAK = (dt.time(hour=15), dt.time(hour=19))
HQ_TRANSIT_THRESHOLD = 4 # trips per hour, from statute, see README.md
MS_TRANSIT_THRESHOLD = 3 # different statutory threshold as of October 2024
# CORR_SINGLE_ROUTE = False # only consider frequency from a single route at each stop (not recommended, see README)
# BUS_MS_SINGLE_ROUTE = False # only consider frequency from a single route at each stop (not recommended, see README)

SHARED_STOP_THRESHOLD = 8 # current rec
# Yolobus. Separate route_id, but same route in a CW and CCW loop, drop per rule to not compare same rt with itself
KEYS_TO_DROP = ['3c62ad6ee589d56eca915ce291a5df0a__42A_0__42B_0',
ROUTE_COLLINEARITY_KEYS_TO_DROP = ['3c62ad6ee589d56eca915ce291a5df0a__42A_0__42B_0',
'3c62ad6ee589d56eca915ce291a5df0a__42B_0__42A_0']

0 comments on commit e8b3c7f

Please sign in to comment.