From 1a7c8322ff09dcdf31298cf9e59095d9a9e0cd8a Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Tue, 15 Aug 2023 21:40:52 +0100
Subject: [PATCH 01/11] fix failure to merge BODS and NR gtfs files because
 data type of join columns not being explicitly set - and some fields assumed
 to be present in route table (route_desc) are not present in BODS data

---
 R/gtfs_merge.R |  7 +++++--
 R/gtfs_read.R  | 47 +++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R
index a1f1b3e..cf08bfe 100644
--- a/R/gtfs_merge.R
+++ b/R/gtfs_merge.R
@@ -131,8 +131,11 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
 
     route_id$route_id_new <- seq(1, nrow(route_id))
     routes <- dplyr::left_join(routes, route_id, by = c("file_id", "route_id"))
-    routes <- routes[, c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type")]
-    names(routes) <- c("route_id", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type")
+
+    columns_to_select <- c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type")
+    columns_to_select <- columns_to_select[columns_to_select %in% colnames(routes)]
+    routes <- routes[, columns_to_select]
+    names(routes) <- columns_to_select
   }
 
 
diff --git a/R/gtfs_read.R b/R/gtfs_read.R
index 5f58818..089f2a5 100644
--- a/R/gtfs_read.R
+++ b/R/gtfs_read.R
@@ -20,20 +20,24 @@ gtfs_read <- function(path){
 
   if(checkmate::test_file_exists(file.path(tmp_folder,"agency.txt"))){
     gtfs$agency <- readr::read_csv(file.path(tmp_folder,"agency.txt"),
-                                   col_types = readr::cols(agency_id = readr::col_character()),
+                                   col_types = readr::cols(agency_id = readr::col_character(),
+                                                           agency_noc = readr::col_character()),
                                    show_col_types = FALSE,
                                    lazy = FALSE)
   } else {
     warning("Unable to find required file: agency.txt")
   }
-
   if(checkmate::test_file_exists(file.path(tmp_folder,"stops.txt"))){
     gtfs$stops <- readr::read_csv(file.path(tmp_folder,"stops.txt"),
                                   col_types = readr::cols(stop_id = readr::col_character(),
                                                           stop_code = readr::col_character(),
                                                           stop_name = readr::col_character(),
                                                           stop_lat = readr::col_number(),
-                                                          stop_lon = readr::col_number()),
+                                                          stop_lon = readr::col_number(),
+                                                          wheelchair_boarding = readr::col_logical(),
+                                                          location_type = readr::col_integer(),
+                                                          parent_station = readr::col_character(),
+                                                          platform_code = readr::col_character()),
 
 
                                   lazy = FALSE, show_col_types = FALSE)
@@ -46,7 +50,8 @@ gtfs_read <- function(path){
                                    col_types = readr::cols(route_id = readr::col_character(),
                                                            agency_id = readr::col_character(),
                                                            route_short_name = readr::col_character(),
-                                                           route_long_name = readr::col_character()),
+                                                           route_long_name = readr::col_character(),
+                                                           route_type = readr::col_integer()),
                                    show_col_types = FALSE,
                                    lazy = FALSE)
   } else {
@@ -56,7 +61,12 @@ gtfs_read <- function(path){
   if(checkmate::test_file_exists(file.path(tmp_folder,"trips.txt"))){
     gtfs$trips <- readr::read_csv(file.path(tmp_folder,"trips.txt"),
                                   col_types = readr::cols(trip_id = readr::col_character(),
-                                                          route_id = readr::col_character()),
+                                                          route_id = readr::col_character(),
+                                                          service_id = readr::col_character(),
+                                                          block_id = readr::col_character(),
+                                                          shape_id = readr::col_character(),
+                                                          wheelchair_accessible = readr::col_logical()
+                                                          ),
                                   show_col_types = FALSE,
                                   lazy = FALSE)
   } else {
@@ -66,8 +76,14 @@ gtfs_read <- function(path){
   if(checkmate::test_file_exists(file.path(tmp_folder,"stop_times.txt"))){
     gtfs$stop_times <- readr::read_csv(file.path(tmp_folder,"stop_times.txt"),
                                        col_types = readr::cols(trip_id = readr::col_character(),
+                                                               stop_id = readr::col_character(),
+                                                               stop_sequence = readr::col_integer(),
                                                                departure_time = readr::col_character(),
-                                                               arrival_time = readr::col_character()),
+                                                               arrival_time = readr::col_character(),
+                                                               shape_dist_traveled = readr::col_number(),
+                                                               timepoint = readr::col_logical(),
+                                                               pickup_type = readr::col_integer(),
+                                                               drop_off_type = readr::col_integer()),
                                        show_col_types = FALSE,
                                        lazy = FALSE)
     gtfs$stop_times$arrival_time <- lubridate::hms(gtfs$stop_times$arrival_time)
@@ -79,7 +95,15 @@ gtfs_read <- function(path){
 
   if(checkmate::test_file_exists(file.path(tmp_folder,"calendar.txt"))){
     gtfs$calendar <- readr::read_csv(file.path(tmp_folder,"calendar.txt"),
-                                     col_types = readr::cols(start_date = readr::col_date(format = "%Y%m%d"),
+                                     col_types = readr::cols(service_id = readr::col_character(),
+                                                             monday = readr::col_logical(),
+                                                             tuesday = readr::col_logical(),
+                                                             wednesday = readr::col_logical(),
+                                                             thursday = readr::col_logical(),
+                                                             friday = readr::col_logical(),
+                                                             saturday = readr::col_logical(),
+                                                             sunday = readr::col_logical(),
+                                                             start_date = readr::col_date(format = "%Y%m%d"),
                                                              end_date = readr::col_date(format = "%Y%m%d")),
                                      show_col_types = FALSE,
                                      lazy = FALSE)
@@ -90,7 +114,9 @@ gtfs_read <- function(path){
 
   if(checkmate::test_file_exists(file.path(tmp_folder,"calendar_dates.txt"))){
     gtfs$calendar_dates <- readr::read_csv(file.path(tmp_folder,"calendar_dates.txt"),
-                                           col_types = readr::cols(date = readr::col_date(format = "%Y%m%d")),
+                                           col_types = readr::cols(service_id = readr::col_character(),
+                                                                   date = readr::col_date(format = "%Y%m%d"),
+                                                                   exception_type = readr::col_integer()),
                                            show_col_types = FALSE,
                                            lazy = FALSE)
   } else {
@@ -115,6 +141,11 @@ gtfs_read <- function(path){
 
   if(checkmate::test_file_exists(file.path(tmp_folder,"shapes.txt"))){
     gtfs$shapes <- readr::read_csv(file.path(tmp_folder,"shapes.txt"),
+                                   col_types = readr::cols(shape_id = readr::col_character(),
+                                                           shape_pt_lat = readr::col_number(),
+                                                           shape_pt_lon = readr::col_number(),
+                                                           shape_pt_sequence = readr::col_integer(),
+                                                           shape_dist_traveled = readr::col_number()),
                                    show_col_types = FALSE,
                                    lazy = FALSE)
   } else {

From b31dfb76689add03f59dc155543009a3879d5a8a Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Thu, 24 Aug 2023 22:05:43 +0100
Subject: [PATCH 02/11] trip id is a varchar in BODS data - remove assumption
 that it's an int

---
 R/gtfs_subset.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/gtfs_subset.R b/R/gtfs_subset.R
index b824899..c26ff8d 100644
--- a/R/gtfs_subset.R
+++ b/R/gtfs_subset.R
@@ -36,7 +36,7 @@ gtfs_clip <- function(gtfs, bounds) {
   gtfs$stop_times <- gtfs$stop_times[gtfs$stop_times$stop_id %in% stops_inc, ]
   # Check for single stop trips
   n_stops <- table(gtfs$stop_times$trip_id)
-  single_stops <- as.integer(names(n_stops[n_stops == 1]))
+  single_stops <- names(n_stops[n_stops == 1])
   gtfs$stop_times <- gtfs$stop_times[!gtfs$stop_times$trip_id %in% single_stops, ]
 
   # Check for any unused stops

From 074826c931844c152a848dfa1d4de37eab43759a Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Thu, 24 Aug 2023 22:07:04 +0100
Subject: [PATCH 03/11] exception_type is an int - tests failing after
 tightening up data type definitions

---
 R/transxchange_export.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/transxchange_export.R b/R/transxchange_export.R
index 8517cc5..acae130 100644
--- a/R/transxchange_export.R
+++ b/R/transxchange_export.R
@@ -499,12 +499,12 @@ transxchange_export <- function(obj,
     calendar_dates <- data.frame(
       trip_id = character(),
       date = character(),
-      exception_type = character(),
+      exception_type = integer(),
       stringsAsFactors = FALSE
     )
     calendar_summary <- dplyr::group_by(calendar, start_date, end_date, DaysOfWeek)
   } else {
-    # remove calendar_dates for trips that have been competly removed
+    # remove calendar_dates for trips that have been competely removed
     calendar_dates <- calendar_dates[calendar_dates$trip_id %in% calendar$trip_id, ]
 
     calendar_summary <- dplyr::group_by(calendar, start_date, end_date, DaysOfWeek)

From 8d4c2d84d00d8677ab14ff65b55881f976fe69f4 Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Thu, 24 Aug 2023 22:10:03 +0100
Subject: [PATCH 04/11] code was assuming a set of tables, and hence dumping
 tables like 'frequencies' and 'feed_info' when reading / writing BODS data.

Removed assumptions about table names, all additional tables encountered in zip are read and written.

also performance improvement when writing.
---
 R/gtfs_read.R  | 63 ++++++++++++++++++--------------------------------
 R/write_gtfs.R | 46 +++++++++++++++---------------------
 2 files changed, 41 insertions(+), 68 deletions(-)

diff --git a/R/gtfs_read.R b/R/gtfs_read.R
index 089f2a5..fcd585d 100644
--- a/R/gtfs_read.R
+++ b/R/gtfs_read.R
@@ -16,7 +16,6 @@ gtfs_read <- function(path){
   files <- list.files(tmp_folder, pattern = ".txt")
 
   gtfs <- list()
-  message_log <- c("Unable to find optional files: ")
 
   if(checkmate::test_file_exists(file.path(tmp_folder,"agency.txt"))){
     gtfs$agency <- readr::read_csv(file.path(tmp_folder,"agency.txt"),
@@ -27,6 +26,7 @@ gtfs_read <- function(path){
   } else {
     warning("Unable to find required file: agency.txt")
   }
+
   if(checkmate::test_file_exists(file.path(tmp_folder,"stops.txt"))){
     gtfs$stops <- readr::read_csv(file.path(tmp_folder,"stops.txt"),
                                   col_types = readr::cols(stop_id = readr::col_character(),
@@ -34,7 +34,7 @@ gtfs_read <- function(path){
                                                           stop_name = readr::col_character(),
                                                           stop_lat = readr::col_number(),
                                                           stop_lon = readr::col_number(),
-                                                          wheelchair_boarding = readr::col_logical(),
+                                                          wheelchair_boarding = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
                                                           location_type = readr::col_integer(),
                                                           parent_station = readr::col_character(),
                                                           platform_code = readr::col_character()),
@@ -65,7 +65,7 @@ gtfs_read <- function(path){
                                                           service_id = readr::col_character(),
                                                           block_id = readr::col_character(),
                                                           shape_id = readr::col_character(),
-                                                          wheelchair_accessible = readr::col_logical()
+                                                          wheelchair_accessible = readr::col_integer() #boolean but treat as integer so 0|1 written to file
                                                           ),
                                   show_col_types = FALSE,
                                   lazy = FALSE)
@@ -81,7 +81,7 @@ gtfs_read <- function(path){
                                                                departure_time = readr::col_character(),
                                                                arrival_time = readr::col_character(),
                                                                shape_dist_traveled = readr::col_number(),
-                                                               timepoint = readr::col_logical(),
+                                                               timepoint = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
                                                                pickup_type = readr::col_integer(),
                                                                drop_off_type = readr::col_integer()),
                                        show_col_types = FALSE,
@@ -96,13 +96,13 @@ gtfs_read <- function(path){
   if(checkmate::test_file_exists(file.path(tmp_folder,"calendar.txt"))){
     gtfs$calendar <- readr::read_csv(file.path(tmp_folder,"calendar.txt"),
                                      col_types = readr::cols(service_id = readr::col_character(),
-                                                             monday = readr::col_logical(),
-                                                             tuesday = readr::col_logical(),
-                                                             wednesday = readr::col_logical(),
-                                                             thursday = readr::col_logical(),
-                                                             friday = readr::col_logical(),
-                                                             saturday = readr::col_logical(),
-                                                             sunday = readr::col_logical(),
+                                                             monday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
+                                                             tuesday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
+                                                             wednesday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
+                                                             thursday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
+                                                             friday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
+                                                             saturday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
+                                                             sunday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
                                                              start_date = readr::col_date(format = "%Y%m%d"),
                                                              end_date = readr::col_date(format = "%Y%m%d")),
                                      show_col_types = FALSE,
@@ -123,22 +123,6 @@ gtfs_read <- function(path){
     message("Unable to find conditionally required file: calendar_dates.txt")
   }
 
-  if(checkmate::test_file_exists(file.path(tmp_folder,"fare_attributes.txt"))){
-    gtfs$fare_attributes <- readr::read_csv(file.path(tmp_folder,"fare_attributes.txt"),
-                                            show_col_types = FALSE,
-                                            lazy = FALSE)
-  } else {
-    message_log <- c(message_log, "fare_attributes.txt")
-  }
-
-  if(checkmate::test_file_exists(file.path(tmp_folder,"fare_rules.txt"))){
-    gtfs$fare_rules <- readr::read_csv(file.path(tmp_folder,"fare_rules.txt"),
-                                       show_col_types = FALSE,
-                                       lazy = FALSE)
-  } else {
-    message_log <- c(message_log, "fare_rules.txt")
-  }
-
   if(checkmate::test_file_exists(file.path(tmp_folder,"shapes.txt"))){
     gtfs$shapes <- readr::read_csv(file.path(tmp_folder,"shapes.txt"),
                                    col_types = readr::cols(shape_id = readr::col_character(),
@@ -148,25 +132,24 @@ gtfs_read <- function(path){
                                                            shape_dist_traveled = readr::col_number()),
                                    show_col_types = FALSE,
                                    lazy = FALSE)
-  } else {
-    message_log <- c(message_log, "shapes.txt")
-  }
-
-  if(checkmate::test_file_exists(file.path(tmp_folder,"transfers.txt"))){
-    gtfs$transfers <- readr::read_csv(file.path(tmp_folder,"transfers.txt"),
-                                      show_col_types = FALSE,
-                                      lazy = FALSE)
-  } else {
-    message_log <- c(message_log, "transfers.txt")
   }
 
-  unlink(tmp_folder, recursive = TRUE)
 
+  #load any other tables in the .zip file
+  filenamesOnly <- tools::file_path_sans_ext(basename(files))
+  notLoadedFiles = setdiff(  filenamesOnly, names(gtfs) )
 
-  if(length(message_log) > 0){
-    message(paste(message_log, collapse = " "))
+  for (fileName in notLoadedFiles)
+  {
+    table <- readr::read_csv(file.path( tmp_folder, paste0( fileName, ".txt" ) ),
+                              show_col_types = FALSE,
+                              lazy = FALSE)
+    gtfs[[fileName]] <- table
   }
 
+  #remove temp directory
+  unlink(tmp_folder, recursive = TRUE)
+
   return(gtfs)
 }
 
diff --git a/R/write_gtfs.R b/R/write_gtfs.R
index 6e2e5d9..2142f7f 100644
--- a/R/write_gtfs.R
+++ b/R/write_gtfs.R
@@ -1,6 +1,6 @@
 #' Write GTFS
 #'
-#' Takes a list of data frames represneting the GTFS fromat and saves them as GTFS
+#' Takes a list of data frames representing the GTFS format and saves them as GTFS
 #' Zip file.
 #'
 #' @param gtfs named list of data.frames
@@ -19,6 +19,7 @@ gtfs_write <- function(gtfs,
                        stripTab = TRUE,
                        stripNewline = TRUE,
                        quote = FALSE) {
+
   if (stripComma) {
     for (i in seq_len(length(gtfs))) {
       gtfs[[i]] <- stripCommas(gtfs[[i]])
@@ -33,7 +34,6 @@ gtfs_write <- function(gtfs,
 
 
   #Format Dates
-
   if(class(gtfs$calendar$start_date) == "Date"){
     gtfs$calendar$start_date <- format(gtfs$calendar$start_date, "%Y%m%d")
   }
@@ -57,24 +57,20 @@ gtfs_write <- function(gtfs,
 
 
   dir.create(paste0(tempdir(), "/gtfs_temp"))
-  data.table::fwrite(gtfs$calendar, paste0(tempdir(), "/gtfs_temp/calendar.txt"), row.names = FALSE, quote = quote)
-  if (nrow(gtfs$calendar_dates) > 0) {
-    data.table::fwrite(gtfs$calendar_dates, paste0(tempdir(), "/gtfs_temp/calendar_dates.txt"), row.names = FALSE, quote = quote)
-  }
-  data.table::fwrite(gtfs$routes, paste0(tempdir(), "/gtfs_temp/routes.txt"), row.names = FALSE, quote = quote)
-  data.table::fwrite(gtfs$stop_times, paste0(tempdir(), "/gtfs_temp/stop_times.txt"), row.names = FALSE, quote = quote)
-  data.table::fwrite(gtfs$trips, paste0(tempdir(), "/gtfs_temp/trips.txt"), row.names = FALSE, quote = quote)
-  data.table::fwrite(gtfs$stops, paste0(tempdir(), "/gtfs_temp/stops.txt"), row.names = FALSE, quote = quote)
-  data.table::fwrite(gtfs$agency, paste0(tempdir(), "/gtfs_temp/agency.txt"), row.names = FALSE, quote = quote)
-  if ("transfers" %in% names(gtfs)) {
-    data.table::fwrite(gtfs$transfers, paste0(tempdir(), "/gtfs_temp/transfers.txt"), row.names = FALSE, quote = quote)
-  }
-  if ("shapes" %in% names(gtfs)) {
-    data.table::fwrite(gtfs$shapes, paste0(tempdir(), "/gtfs_temp/shapes.txt"), row.names = FALSE, quote = quote)
+
+  for ( tableName in names(gtfs) )
+  {
+    table <- gtfs[[tableName]]
+
+    if ( !is.null(table) & nrow(table) > 0 )
+    {
+      data.table::fwrite(table, file.path(tempdir(), "gtfs_temp", paste0(tableName, ".txt")), row.names = FALSE, quote = quote)
+    }
   }
+
   zip::zipr(paste0(folder, "/", name, ".zip"), list.files(paste0(tempdir(), "/gtfs_temp"), full.names = TRUE), recurse = FALSE)
+
   unlink(paste0(tempdir(), "/gtfs_temp"), recursive = TRUE)
-  message(paste0(folder, "/", name, ".zip"))
 }
 
 
@@ -126,9 +122,11 @@ stripTabs <- function(df, stripNewline) {
 
 
 #' Convert Period to GTFS timestamps
+#' When writing a 400mb (zipped) file, we spend nearly 4 minutes in this fn(), about 10x longer than writing the files to the filesystem.
+#' profiler reports this being mostly nchar(), so we optimise down to one sprintf which reduces the time to 1 minute
+#' .format() is about 7x slower than sprintf()
 #'
-#'
-#' @param x peridos
+#' @param x periods
 #' @noRd
 #'
 period2gtfs <- function(x) {
@@ -139,14 +137,6 @@ period2gtfs <- function(x) {
     stop("Days detected in period objects, incorectly formatted period object")
   }
 
-  hrs <- as.character(lubridate::hour(x))
-  min <- as.character(lubridate::minute(x))
-  sec <- as.character(lubridate::second(x))
-
-  hrs <- ifelse(nchar(hrs) == 1,paste0("0",hrs), hrs)
-  min <- ifelse(nchar(min) == 1,paste0("0",min), min)
-  sec <- ifelse(nchar(sec) == 1,paste0("0",sec), sec)
-
-  return(paste0(hrs,":",min,":",sec))
+  return( sprintf("%02d:%02d:%02d", lubridate::hour(x), lubridate::minute(x), lubridate::second(x)) )
 }
 

From ccd451e6f0795aa16d8ef0a3d842f9465eade36e Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Thu, 24 Aug 2023 22:12:42 +0100
Subject: [PATCH 05/11] merge was expecting a fixed set of tables and columns
 and throwing away anything else that it wasn't expecting. OK for the GTFS
 tables generated by this tool from CIF files, but caused data loss when using
 BODS data.

modified to keep all encountered columns and tables - passes through if there isn't any specific code for the table in question.
---
 R/gtfs_merge.R | 225 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 147 insertions(+), 78 deletions(-)

diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R
index cf08bfe..cf3f1de 100644
--- a/R/gtfs_merge.R
+++ b/R/gtfs_merge.R
@@ -1,5 +1,11 @@
 #' merge a list of gtfs files
 #'
+#' !WARNING! only the tables:
+#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes
+#' are processed, any other tables in the input timetables are passed through
+#'
+#' if duplicate IDs are detected then completely new ID for all rows will be generated in the output.
+#'
 #' @param gtfs_list a list of gtfs objects to be merged
 #' @param force logical, if TRUE duplicated values are merged taking the fist
 #' @param quiet logical, if TRUE less messages
@@ -8,40 +14,62 @@
 #' @export
 gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
 
-  # remove any NULLS
+  # remove any empty input tables
   gtfs_list <- gtfs_list[lengths(gtfs_list) != 0]
+  flattened <- unlist(gtfs_list, recursive = FALSE)
+  rm(gtfs_list)
 
-  # Split out lists
-  agency <- sapply(gtfs_list, "[", "agency")
-  stops <- sapply(gtfs_list, "[", "stops")
-  routes <- sapply(gtfs_list, "[", "routes")
-  trips <- sapply(gtfs_list, "[", "trips")
-  stop_times <- sapply(gtfs_list, "[", "stop_times")
-  calendar <- sapply(gtfs_list, "[", "calendar")
-  calendar_dates <- sapply(gtfs_list, "[", "calendar_dates")
+  #get unique input table names
+  tableNames <- unique(names(flattened))
+
+    grouped_list <- list()
+
+  # Loop through table names names and group data frames
+  for (tableName in tableNames) {
+
+    matched <- purrr::imap( flattened, function( item, name ) {
+      if (name == tableName) {
+        return(item)
+      }
+    })
+
+    #remove empty input tables
+    matched <- matched[lengths(matched) != 0]
+
+    #assign each instance of the input table a unique number
+    names(matched) <- seq(1, length(matched))
 
-  # bind together
-  names(agency) <- seq(1, length(agency))
-  suppressWarnings(agency <- dplyr::bind_rows(agency, .id = "file_id"))
+    #add a column to the data frame containing this unique number
+    suppressWarnings(matched <- dplyr::bind_rows(matched, .id = "file_id"))
 
-  names(stops) <- seq(1, length(stops))
-  suppressWarnings(stops <- dplyr::bind_rows(stops, .id = "file_id"))
+    #if("calendar_dates"==tableName)
+    #{
+    #  #don't understand what this is doing ? comment would be nice.
+    #  calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0]
+    #  #matched <- matched[sapply(matched, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0]
+    #}
 
-  names(routes) <- seq(1, length(routes))
-  suppressWarnings(routes <- dplyr::bind_rows(routes, .id = "file_id"))
+    #add to map
+    grouped_list[[tableName]] <-  matched
+  }
 
-  names(trips) <- seq(1, length(trips))
-  suppressWarnings(trips <- dplyr::bind_rows(trips, .id = "file_id"))
+  rm(flattened)
 
-  names(stop_times) <- seq(1, length(stop_times))
-  suppressWarnings(stop_times <- dplyr::bind_rows(stop_times, .id = "file_id"))
+  # Split out lists
+  agency <- grouped_list$agency
+  stops <- grouped_list$stops
+  routes <- grouped_list$routes
+  trips <- grouped_list$trips
+  stop_times <- grouped_list$stop_times
+  calendar <- grouped_list$calendar
+  calendar_dates <- grouped_list$calendar_dates
+  shapes <- grouped_list$shapes
+  frequencies <- grouped_list$frequencies
 
-  names(calendar) <- seq(1, length(calendar))
-  suppressWarnings(calendar <- dplyr::bind_rows(calendar, .id = "file_id"))
+  #remove items from map.
+  grouped_list <- grouped_list[setdiff(names(grouped_list),
+          c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes", "frequencies" ))]
 
-  names(calendar_dates) <- seq(1, length(calendar_dates))
-  calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0]
-  suppressWarnings(calendar_dates <- dplyr::bind_rows(calendar_dates, .id = "file_id"))
 
   # fix typo
   agency$agency_name <- as.character(agency$agency_name)
@@ -112,49 +140,54 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
     } else {
       stop("Duplicated Stop IDS")
     }
-
-
   }
 
   # routes
   if (any(duplicated(routes$route_id))) {
     if(!quiet){message("De-duplicating route_id")}
-    route_id <- routes[, c("file_id", "route_id")]
-    if (any(duplicated(route_id))) {
+
+    retainedColumnNames <- colnames(routes)[!(colnames(routes) %in% c("route_id", "file_id"))]
+
+    new_route_id <- routes[, c("file_id", "route_id")]
+    if (any(duplicated(new_route_id))) {
       if(force){
-        routes <- routes[!duplicated(route_id), ]
-        route_id <- routes[, c("file_id", "route_id")]
+        routes <- routes[!duplicated(new_route_id), ]
+        new_route_id <- routes[, c("file_id", "route_id")]
       } else {
         stop("Duplicated route_id within the same GTFS file, try using force = TRUE")
       }
     }
 
-    route_id$route_id_new <- seq(1, nrow(route_id))
-    routes <- dplyr::left_join(routes, route_id, by = c("file_id", "route_id"))
+    new_route_id$route_id_new <- seq(1, nrow(new_route_id))
+    routes <- dplyr::left_join(routes, new_route_id, by = c("file_id", "route_id"))
 
-    columns_to_select <- c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type")
-    columns_to_select <- columns_to_select[columns_to_select %in% colnames(routes)]
-    routes <- routes[, columns_to_select]
-    names(routes) <- columns_to_select
+    routes <- routes[, c("route_id_new", retainedColumnNames)]
+    routes <- routes %>% dplyr::rename(route_id = route_id_new)
   }
 
 
   # calendar
   if (any(duplicated(calendar$service_id))) {
     if(!quiet){message("De-duplicating service_id")}
-    service_id <- calendar[, c("file_id", "service_id")]
-    if (any(duplicated(service_id))) {
+
+    new_service_id <- calendar[, c("file_id", "service_id")]
+    if (any(duplicated(new_service_id))) {
       stop("Duplicated service_id within the same GTFS file")
     }
-    service_id$service_id_new <- seq(1, nrow(service_id))
-    calendar <- dplyr::left_join(calendar, service_id, by = c("file_id", "service_id"))
-    calendar <- calendar[, c("service_id_new", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")]
-    names(calendar) <- c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")
+
+    new_service_id$service_id_new <- seq(1, nrow(new_service_id))
+
+    retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))]
+    calendar <- dplyr::left_join(calendar, new_service_id, by = c("file_id", "service_id"))
+    calendar <- calendar[, c("service_id_new", retainedColumnNames)]
+    names(calendar) <- c("service_id", retainedColumnNames)
 
     if (nrow(calendar_dates) > 0) {
-      calendar_dates <- dplyr::left_join(calendar_dates, service_id, by = c("file_id", "service_id"))
-      calendar_dates <- calendar_dates[, c("service_id_new", "date", "exception_type")]
-      names(calendar_dates) <- c("service_id", "date", "exception_type")
+      retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))]
+
+      calendar_dates <- dplyr::left_join(calendar_dates, new_service_id, by = c("file_id", "service_id"))
+      calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames)]
+      calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new)
     }
   }
 
@@ -162,45 +195,60 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
   # Trips
   if (any(duplicated(trips$trip_id))) {
     if(!quiet){message("De-duplicating trip_id")}
-    trip_id <- trips[, c("file_id", "trip_id")]
-    if (any(duplicated(trip_id))) {
+
+    new_trip_id <- trips[, c("file_id", "trip_id")]
+    if (any(duplicated(new_trip_id))) {
       if(force){
         trips <- unique(trips)
         stop_times <- unique(stop_times)
-        trip_id <- trips[, c("file_id", "trip_id")]
+        new_trip_id <- trips[, c("file_id", "trip_id")]
       } else{
         stop("Duplicated trip_id within the same GTFS file")
       }
 
 
     }
-    trip_id$trip_id_new <- seq(1, nrow(trip_id))
-    trips <- dplyr::left_join(trips, trip_id, by = c("file_id", "trip_id"))
-    trips <- trips[, c("route_id", "service_id", "trip_id_new", "file_id")]
-    names(trips) <- c("route_id", "service_id", "trip_id", "file_id")
-
-
-    stop_times <- dplyr::left_join(stop_times, trip_id, by = c("file_id", "trip_id"))
-    stop_times <- stop_times[, c("trip_id_new", "arrival_time", "departure_time", "stop_id", "stop_sequence", "timepoint")]
-    names(stop_times) <- c("trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "timepoint")
+    new_trip_id$trip_id_new <- seq(1, nrow(new_trip_id))
+
+    retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("trip_id"))]
+    trips <- dplyr::left_join(trips, new_trip_id, by = c("file_id", "trip_id"))
+    trips <- trips[, c("trip_id_new", retainedColumnNames)]
+    trips <- trips %>% dplyr::rename(trip_id = trip_id_new)
+
+    retainedColumnNames <- colnames(stop_times)[!(colnames(stop_times) %in% c("trip_id", "file_id"))]
+    stop_times <- dplyr::left_join(stop_times, new_trip_id, by = c("file_id", "trip_id"))
+    stop_times <- stop_times[, c("trip_id_new", retainedColumnNames)]
+    stop_times <- stop_times %>% dplyr::rename(trip_id = trip_id_new)
+
+    if ( length(frequencies) > 0 )
+    {
+      retainedColumnNames <- colnames(frequencies)[!(colnames(frequencies) %in% c("trip_id", "file_id"))]
+      frequencies <- dplyr::left_join(frequencies, new_trip_id, by = c("file_id", "trip_id"))
+      frequencies <- frequencies[, c("trip_id_new", retainedColumnNames)]
+      frequencies <- frequencies %>% dplyr::rename(trip_id = trip_id_new)
+    }
   }
-  if (exists("service_id")) {
-    trips <- dplyr::left_join(trips, service_id, by = c("file_id", "service_id"))
-    trips <- trips[, c("route_id", "service_id_new", "trip_id", "file_id")]
-    names(trips) <- c("route_id", "service_id", "trip_id", "file_id")
+
+  if (exists("new_service_id")) {
+    retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id"))]
+    trips <- dplyr::left_join(trips, new_service_id, by = c("file_id", "service_id"))
+    trips <- trips[, c(retainedColumnNames, "service_id_new")]
+    trips <- trips %>% dplyr::rename(service_id = service_id_new)
   }
-  if (exists("route_id")) {
-    trips <- dplyr::left_join(trips, route_id, by = c("file_id", "route_id"))
-    trips <- trips[, c("route_id_new", "service_id", "trip_id", "file_id")]
-    names(trips) <- c("route_id", "service_id", "trip_id", "file_id")
+
+  if (exists("new_route_id")) {
+    retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("route_id"))]
+    trips <- dplyr::left_join(trips, new_route_id, by = c("file_id", "route_id"))
+    trips <- trips[, c("route_id_new", retainedColumnNames)]
+    trips <- trips %>% dplyr::rename(route_id = route_id_new)
   }
 
-  trips <- trips[, c("route_id", "service_id", "trip_id")]
-  names(trips) <- c("route_id", "service_id", "trip_id")
+  trips$file_id <- NULL
 
   # Condense Duplicate Service patterns
   if (nrow(calendar_dates) > 0) {
     if(!quiet){message("Condensing duplicated service patterns")}
+
     calendar_dates_summary <- dplyr::group_by(calendar_dates, service_id)
     if(class(calendar_dates_summary$date) == "Date"){
       calendar_dates_summary <- dplyr::summarise(calendar_dates_summary,
@@ -221,27 +269,48 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
     calendar_summary$service_id_new <- dplyr::group_indices(calendar_summary)
     calendar_summary <- calendar_summary[, c("service_id_new", "service_id")]
 
+    retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id", "route_id"))]
     trips <- dplyr::left_join(trips, calendar_summary, by = c("service_id"))
-    trips <- trips[, c("route_id", "service_id_new", "trip_id")]
-    names(trips) <- c("route_id", "service_id", "trip_id")
+    trips <- trips[, c("route_id", "service_id_new", retainedColumnNames)]
+    trips <- trips %>% dplyr::rename(service_id = service_id_new)
 
+    retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))]
     calendar <- dplyr::left_join(calendar, calendar_summary, by = c("service_id"))
-    calendar <- calendar[, c("service_id_new", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")]
-    names(calendar) <- c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")
+    calendar <- calendar[, c("service_id_new", retainedColumnNames)]
+    calendar <- calendar %>% dplyr::rename(service_id = service_id_new)
     calendar <- calendar[!duplicated(calendar$service_id), ]
 
-
+    retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))]
     calendar_dates <- dplyr::left_join(calendar_dates, calendar_summary, by = c("service_id"))
-    calendar_dates <- calendar_dates[, c("service_id_new", "date", "exception_type")]
-    names(calendar_dates) <- c("service_id", "date", "exception_type")
+    calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames)]
+    calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new)
     calendar_dates <- calendar_dates[!duplicated(calendar_dates$service_id), ]
   }
 
+  # shapes are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object
+  composite_key <- paste0(shapes$shape_id, shapes$shape_pt_sequence, sep = "#")
+  if (any(duplicated(composite_key))) {
+    if(force){
+      shapes <- shapes[!duplicated(composite_key),]
+    } else {
+      stop("Duplicated Shapes IDS")
+    }
+  }
+
+  shapes$file_id <- NULL
   stop_times$file_id <- NULL
   routes$file_id <- NULL
   calendar$file_id <- NULL
+  res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies)
+  names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes","frequencies")
+
+  #for tables we don't explicitly process - hope items are unique
+  for (item in grouped_list) {
+    item$file_id <- NULL
+  }
+
+  #remove nulls (e.g. tables that are often empty like frequencies)
+  res_final <- Filter(Negate(is.null), res_final)
 
-  res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates)
-  names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates")
-  return(res_final)
+  return (c(res_final, grouped_list))
 }

From c9f7cce8da16710a0a7518e380675c7697a37d42 Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Thu, 24 Aug 2023 22:29:38 +0100
Subject: [PATCH 06/11] added FOC YG + MV (appearing in NR timetable extracts)
 renamed ZZ to reflect it's a generic obfuscated freight, not a specific
 operator

---
 data/atoc_agency.rda | Bin 2055 -> 2133 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/data/atoc_agency.rda b/data/atoc_agency.rda
index 93c47cad3b74f115a6b19afb7c8bdd3d7cd81550..97495dad84e4893b5dc671509aefc7d12b9e8329 100644
GIT binary patch
literal 2133
zcmV-b2&(rViwFP!000002JKqia}!4rcOay|fC-Qg5=fZj5*MjD+uGHqQ}^O1{yHSX
zdgTvzQ7x;nw6V0Sn%%YRd%FMX{=4IPc4m5K*0LO4@{kjSOw;q5{_dW}X8*V2^5XI0
z!otGMg->tXSh#r$$c0-6ovoF|0&w6jpDcX3a0mXp>m`xj_4+jQF97}+Sf9b?bNK8^
z$c>GCq>dETY$COTl&h%CHc~n7Kv7$GcRQ!{kV0+SsBQaLQ9HjPg?D#4NFi>wrKr|#
zNNpmug%n!adW{rX&_e55sH2Ti+J}nT!_&RDigGrPLYp15*+Huul;@yT&NfmQUFV-j
zy+R7TbkGk6Bk7<=&Ksm0q|i$Ty>!qI2eaj%mk#E`$-R81C>Q;3(NY&}anTkRZE?{8
z7bSPlqYmOa`6=o>L`#oQ$`Sf;jB!53)3+GYckc=net%;)47|{%WZw&hlKFUpvEa-j
z9p(jLOpd%Ejpflz-}WW>usH~Z31wk+b>dwp%u|;#KO#-I){8=rPTV1-W6I?D7n@$3
zkao};dSNe?TIAX7&6GuP;wkKt!+<5JHzb@trXcH$mkbp8%dLQcsHQg#62LU0Fr<E>
zT-#<KpEPNhFi*L*w9TlOkiCc{12A237c+~R5qwo45RwJ@h{j0{ez5H&lfXYEo9AOj
zV`a<pRuqD}LNwt5wa`e+=Q|*7z@kaP)Q3XbPC6W#;m9wT%I@yQJb0uPgjC7MeK2J_
z*?3PO$&qk|GO`i&A?Nb+k%-bk7=Z1@y(Q_8Oq5b1xv$SC>$51u1F}bxNyJVwm&wc5
z;R;BDD78I*09M97tH`z@KN>|za7GCf%30uJaXpf<55itS_+p!~m|n2FM(-VZ%o_!<
zUbpvET6N~PA|Y?Vw0ICOO4?Dx;zDA6G>nI!_lT2v45AgU9kD3vla8wz>#2ct(>SK*
z<Xwb`FwdcO3g=%J&$B|&0N)q@>=O;~+&~BixgZ_$Hv@C2IIp%?l!t!)!Z=qQxy5ME
zAMlcsu<s3+R*r3`D!o*{G$ZIoiC$pF;VbCgskh$naljIl!0$}m=RBr-BBXH=jd1cj
zGb8K$9x*QHcj!b_gD<BApDgo+O~8YY&zYAyuJ!X&#YUKUM%2hFm`C<q_t&W!-6&NV
z+JMSQU_Mfq#!193rZ(DccoMOZm+<y=FQAr2$`kvKcw&rV<<ldxLbdU~Y@8gtg6P3R
zkx*_115qsXla#4EKQ5FhbhdiG?`3{cPamvb3*YljbyJ>VXmj;Ah)Z~3$!#^sYrtgg
zD;poU6EB3duF&GQd7Wu<;i1r$wT&_N7CQrPMB`x)o~niv;~7w4{Dy`hM6WQ9p{LK&
zn7r&<z|k@Y^%!Z>gheW(ONRjj9E>tlJ={3+AQ{34<N3)s#n<)85u@i|A>r9{;cXaa
z!BTN;!t)t7VRR-~Ft<A=Eqx+B+W|iy8OvnHg9k^HojHlWSUC~hy%N1YNn;;U+*>KE
z4&M!uWc<t8+GH|W_2d@82lT2Rt){1IY7FV9Y^_ML2uyfYnb7AN6xNKDD1;nP6Z3<P
zk^2Xe0Wy(KHDoOIkJT7TK%M`no%1>z$tDoynX;BSHU=;aS0}+~Fs8l0Ta8$MjsI<}
zu(N!l&|LJveyhFfy1RSZAW{`_dA7W)+`4kB>6)v89)Lc7x_z<VT#o$r1`?kBP`-{5
zI#R%Ag-H0k?5@t@3AZj2I1z7`3DwJ@FE%AAdPUdy%j^h<Ioi$UXjkr1U9+EGXnF>6
zOY%jibVYhm4Pd{TmfmTirr4|pH0HyD`#(+Ird|LtTv0k65B*G>_Z}{<<ovbtbcN?+
zEf2mIkI(hmQw=0GxxQHByzHUAx<9h=Ncmzn&B$*x<S+}j>;T<mqJff=;c7oR<07Ks
z@^^j7uV0sNwG?*MDC)L?Q2=vC3iwKzlBDmcZi{Y#G+>0wVy0l4jb~zeym|l`&7w*9
z$80}D#o`mz&6Z77xkPUfWl1RB$jkjZN6k@l)SrR+sW{l>UMd$06Ku^M0PcXEoCI{(
z>lU}y7nii?8b(u1j)#7P`>l>zxyA~y0vDc4eMa(CX!g!9^NG`e3GT~N+|iedWSqaM
zSG+_a-=n!7{CvbWXbJlJY=`A~-8I(hwb^&X^4;aDc2{4%ugP$ZSK(FqD%V^Ti+lGc
zyVkw5xa;ev&v$rV?C{fq|AW$_J%gjDM`O+qtAw5=RZN3y?H|Cd{_%}c_^zS&Zk$}x
z=FB;B&YUyn%)dCZ@WW9W&Z#-|m!ixHzgL1^M1GLhs8mU<RI_TeBG0Nc>SfhBoL7U3
zRbthu$SjvZSPec^l$asw#Iiv|y)jeuALR*i%9Tdd&NTzCWY<WgUes)ZX84#)F498g
zQYNxd1zweuXDPogl-C=iwqCEA!Kl~P$$BYg+9lGcgG`8Ie%)Gclu5&~Np;;qJ(g`(
zNUd73ih4kI-L6z;8f4iPsg*0lE>()REmCRJ>K_^3%<#HxgNcUWRg2V1HmIpv$kc9^
zj96woEy7nRs*@^YqcLaxUol06l%Oqbdq#m|rDPEsdUze`?&Ie~J!4d%baT~$M-1jw
z!y{`ztA+)?1nmp<l=$y@WmaS5J~pnia{A>;os?moSaw;nvdkmKd6m2^%VitdA9}Nv
zUpLy){BL2mLjK(h`4W64nRmp06YB=*yU}+ggM%e&QL<8|e$e(na*|*~|CRqZUELje
zVP82ZS>pkR6%6u+UaSmK%*CFUc&jIjt2g0ySOouqmp<I*@7llN%ZCpi{sVvfmw)~T
LqfUnGBqsm>ZNWZm

literal 2055
zcmV+i2>ACxT4*^jL0KkKS)Y2IIsgkEf5`uT|NsAI|MWlq|M0*6-|#>H03ZMX;0S+3
z$$D6&ZM)q!PTdC}06GR3qzQrv>Uv4&o}kcpQKp`ugwPrdJx^28c|8px4^wIY9*{JX
zLV5ujo}uj#Jw}ZH007VdkOM#sG%$bxrkYJ8%4%qOo`?bJ2dD#020@?z000d!44MEX
zGDtN+r1dm<K=gnB27nC!001%o01X3;4H`5WGz|a%000bt000dEkN{wjAesrFhMJi*
zJk*;aJt><_DYY6j9--<Q18E+nAbLTgLqZmph5|?p4(4pE2?zFIfEkz0d~fHVgp;7?
z=H{e91Pu#sHri;ANdi<1sDKC(`lidldma@#VrQP?zUEf3;KRipNNB$5<i#9tyAGC?
zN$Jy)rd~NDlCLJETM|=_IO3s6rau#r%F6(d2A!(mYuQ$tcQd_4*v6WltS3ShkvAvR
z@xWiv>zApkFcbA%;+j&K1o6})1Nwf6m8#j-)Lmh5r?IyPohiJi^7`h5IyGB@zApJ1
z$}ibrLbP_ciMwRI1OU@V-CCF6hRGsGhjDlD-<edey^99qyP&+tLPBW(2Go;66azpY
z^E8B#7n!c>5)wY+0S2vRyw;9fkQXGk$o_EPjVy;0Fk^W5rxKdHx;6yxKn)^+qO?PS
z3NBXx7uz+*mz}OyQ4^`{AK@VrtlG(K?W!XEvn`|B4iPhiC}SJ+G1ix{UYyIBszU)0
zx4HM$(%4ARaOx6@EM?*iUQPppT#FNj$re`F;NV~evBt%SgkreQrE_aL>=at~EJfjU
zA|Pm4xE$J^-ET>mzA7wp$0is%(?|-zB#qNcCa(IJ2;Byhj1CdGz=aL5yc#sJP0l`q
zkQ;*(kPR&BZnTaYDbi>BcqB9lZ#*p>{-^{V7K9Nj-76HSmXc8g1Q8p&ZpmO7?ZiTE
z#M_*_EJH~RwIq`eew33bPlj$!^*b>NqY2^HeH)z6DqPWI;9`y&U|>XHX!6S@WQIBo
z<t9|R&J&`+X(=QGCm`}vp%X2nrib-{-pO0GWnGwIc8CgLQj?3M)C@}d?k4ZF?zQ$p
z7(>bo<X)vLtwfa);0Az8R`>ytaBD8;M5~flHe^-bb!!scK!gM`rMB3~S&h)K0u<;V
zijX*^oLdPzc7oc|K-vZYMIZwP{lPt02MWcj!9um&tGh0{UG1tyA#p~F5lAF0uGC$K
zwNFV_nkihSZ>qt=sf37t2dN5#9dW+MobW;#L5S^@gG(<@fq9=>jan(^6U1i|8h257
z3Vvt1`?jHsK^g=%00IJG>lrXDQftj9B@~7LH$iuw2n*G(Ki1OIEh}F#N(qMiD(@Yb
z&tN=4%4Z(}baA+_Ip?1o#H`Q(DOyGrM}Ag&A^%z=nt>!1#fV{py%kDq3ep1Ipve1+
zSyHUVA!DEAx^#V)L8oVvgV&i>JU!p@A!OG^>*ReSBK^93`fuNNCoSR7ZJ|a62pAuh
zrt{)h;($Z*ZPv-Vu5V?t!+|C^*PkOWJ$`UnnF<Jz(5>dl1s9T+?dTTm*MDRZQ2+!A
zD5z9KC0NMR9)^LXhUmmA;h1x65wHtZ$+Uqb7>rDdZp$}s+`7TBjjE^_fX-ZCU(=q1
zyOfUytAPSx++N|fCDncZxS29WO6;7*_kmIXf<b^LM7J<m1`9xqhJZ3;n6x!*_ejR6
z7Xcw85YpT{+h8_l9J_Z)9XNd{ASF6K9QXKi_lIDqA`AoS(RxP$t##&9@q5fj4G9`j
zw|m!_)5EmYw@2}nB?mW0aL`a}Ol|oewS|?4IPiJAWiyOiqBp5wzqJCu1qH@eYprTq
zPS1;<mGA|jhJh&WBzqH6nm4-0pbyAG!r0W6pmkU&k*45qivPRVF#CU#Oq+4BkBB{o
z5@Rp&xai(dY*FRSpGI!3hG&v0gMLy0>1!UzAzU{J7Fau@#(F%9v8Utgppn=B9#NrX
z>4s&L%0Wbzq=Ix~(vehp(TceG#*Wau^5u_VL<{bKDNEiGCIXXDNPvNiZH<6wXvv`_
zm`iR9?P%MEbK<+UP`g$}b0E|BD`H%-(p8~SM!~TUV<NB~-Er9xsb;`IXc7Ub!Y-0<
z5&oSLB@-UZJo74+h#0{6ERsP8W&=y}!I(9A1@bD_SrJ=bY#jv}u+0=yX%6x##z7?o
z$G=`Xql!ooF)`uJa@}=7QJjE~O!y$UNe1qJ`|%}(eaE3=i38~>0tvPFxP-cp3;nMD
z8Q<VR%5h*An?^#@5PCX8rM1YqXpZgzOE3^|By$}ypy|`laAjXOs(}|1zTu8gdHS>V
z(nqs>Wye?0IWoXiBa`zSMf4^@;nvf2Rb6}n{3fw2m##C=<_K81fM!za0D?RRH#E%%
znflp(UW}yKtO%1xks;_9<l~W?L@<j+;$-PQifGwS1zB}mtkB!K>F}UL=_&M+2q<zd
z6$*?l)JTlO4ouKU$W0`~4K&c$>=Bep5Q;*xL=!E-W}+c$Vkqp?GR&p~1nv!lf>8tw
zb2ZN5{(Vi&V>H;=C?T}&LoYfRC5pNSj}Vf*phYW;wcQq>iKw9?OtmC{nL|wrl~4+H
zJuVIR8KEQL$4cRAzB|qi(Fit__9}(9aKbuu5PL{NV1sRr;uQ30A`OWf5+MMrfhizM
ln^w?irp?>4Wt=ofT~s6)l$G!I&>+9!?ntK!5)<!J$3P~Nt9JkZ


From 003333849099b638f10aa8ad68a6699a34a08a3f Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Fri, 25 Aug 2023 15:41:50 +0100
Subject: [PATCH 07/11] fix null columns being inserted in calendar_dates key
 field when a calendar_date exists with no calendar (rare but valid data
 configuration)

---
 R/gtfs_merge.R | 44 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R
index cf3f1de..4926c20 100644
--- a/R/gtfs_merge.R
+++ b/R/gtfs_merge.R
@@ -1,20 +1,21 @@
 #' merge a list of gtfs files
 #'
 #' !WARNING! only the tables:
-#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes
+#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies
 #' are processed, any other tables in the input timetables are passed through
 #'
-#' if duplicate IDs are detected then completely new ID for all rows will be generated in the output.
+#' if duplicate IDs are detected then completely new IDs for all rows will be generated in the output.
 #'
 #' @param gtfs_list a list of gtfs objects to be merged
 #' @param force logical, if TRUE duplicated values are merged taking the fist
-#' @param quiet logical, if TRUE less messages
 #'   instance to be the correct instance, in most cases this is ok, but may
 #'   cause some errors
+#' @param quiet logical, if TRUE less messages
+#' @param condenseServicePatterns logical, if TRUE service patterns across all routes are condensed into a unique set of patterns
 #' @export
-gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
+gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePatterns = TRUE) {
 
-  # remove any empty input tables
+  # remove any empty input GTFS objects
   gtfs_list <- gtfs_list[lengths(gtfs_list) != 0]
   flattened <- unlist(gtfs_list, recursive = FALSE)
   rm(gtfs_list)
@@ -33,7 +34,7 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
       }
     })
 
-    #remove empty input tables
+    #remove input tables not matching tableName
     matched <- matched[lengths(matched) != 0]
 
     #assign each instance of the input table a unique number
@@ -41,10 +42,11 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
 
     #add a column to the data frame containing this unique number
     suppressWarnings(matched <- dplyr::bind_rows(matched, .id = "file_id"))
+    matched$file_id <- as.integer(matched$file_id)
 
     #if("calendar_dates"==tableName)
     #{
-    #  #don't understand what this is doing ? comment would be nice.
+    #  #don't understand what this complex line is doing ? comment would be nice.
     #  calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0]
     #  #matched <- matched[sapply(matched, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0]
     #}
@@ -167,7 +169,9 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
 
 
   # calendar
-  if (any(duplicated(calendar$service_id))) {
+  calendar_dates_key <- paste(calendar_dates$service_id, calendar_dates$date, calendar_dates$exception_type, sep="#")
+
+  if (any(duplicated(calendar$service_id)) || any(duplicated(calendar_dates_key))) {
     if(!quiet){message("De-duplicating service_id")}
 
     new_service_id <- calendar[, c("file_id", "service_id")]
@@ -175,6 +179,10 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
       stop("Duplicated service_id within the same GTFS file")
     }
 
+    # it is valid to have calendar_dates with no associated calendar (see comments further down)
+    # so create the distinct set of service_id in both calendar and calendar_dates
+    new_service_id <- dplyr::union(unique(new_service_id), unique(calendar_dates[, c("file_id", "service_id")]))
+
     new_service_id$service_id_new <- seq(1, nrow(new_service_id))
 
     retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))]
@@ -246,9 +254,18 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
   trips$file_id <- NULL
 
   # Condense Duplicate Service patterns
-  if (nrow(calendar_dates) > 0) {
+
+  # in an ideal world we should not have a trip without a service pattern, and not have calendar_dates with no associated calendar,
+  # but the real world data isn't that tidy.
+  # In a typical all GB BODS extract Around 0.2% of trips have a calendar ID but no row in calendar,
+  # 0.2% of calendar_dates have no trips, 0.1% of calendar_dates have no corresponding calendar.
+  # we need to guard against this to make sure we don't end up putting null values into any key fields
+  # This documentation https://gtfs.org/schedule/reference/#calendar_datestxt specifically mentions calendar dates without calendars
+  # as being a legitimate way to construct the data.
+  if (condenseServicePatterns && nrow(calendar_dates) > 0) {
     if(!quiet){message("Condensing duplicated service patterns")}
 
+    #find every unique combination of calendar_dates and calender values
     calendar_dates_summary <- dplyr::group_by(calendar_dates, service_id)
     if(class(calendar_dates_summary$date) == "Date"){
       calendar_dates_summary <- dplyr::summarise(calendar_dates_summary,
@@ -260,12 +277,15 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
       )
     }
 
-    calendar_summary <- dplyr::left_join(calendar, calendar_dates_summary, by = "service_id")
+    #we want to keep all rows in calendar_dates even if they don't have a row in calendar
+    calendar_summary <- dplyr::full_join(calendar, calendar_dates_summary, by = "service_id")
     calendar_summary <- dplyr::group_by(
       calendar_summary,
       start_date, end_date, monday, tuesday, wednesday,
       thursday, friday, saturday, sunday, pattern
     )
+
+    #give every unique combination of dates / days / exceptions a new distinct service ID
     calendar_summary$service_id_new <- dplyr::group_indices(calendar_summary)
     calendar_summary <- calendar_summary[, c("service_id_new", "service_id")]
 
@@ -287,7 +307,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
     calendar_dates <- calendar_dates[!duplicated(calendar_dates$service_id), ]
   }
 
-  # shapes are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object
+
+  # shapes in a BODS extract are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object
   composite_key <- paste0(shapes$shape_id, shapes$shape_pt_sequence, sep = "#")
   if (any(duplicated(composite_key))) {
     if(force){
@@ -301,6 +322,7 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) {
   stop_times$file_id <- NULL
   routes$file_id <- NULL
   calendar$file_id <- NULL
+  frequencies$file_id <- NULL
   res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies)
   names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes","frequencies")
 

From bf8f460acbf624f594215cbea94132c602a924de Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Fri, 25 Aug 2023 15:42:38 +0100
Subject: [PATCH 08/11] correct comment - column is a 3 valued enum, not bool

---
 R/gtfs_read.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/gtfs_read.R b/R/gtfs_read.R
index fcd585d..ead852c 100644
--- a/R/gtfs_read.R
+++ b/R/gtfs_read.R
@@ -34,7 +34,7 @@ gtfs_read <- function(path){
                                                           stop_name = readr::col_character(),
                                                           stop_lat = readr::col_number(),
                                                           stop_lon = readr::col_number(),
-                                                          wheelchair_boarding = readr::col_integer(), #boolean but treat as integer so 0|1 written to file
+                                                          wheelchair_boarding = readr::col_integer(), #enum value 2 is valid but rarely seen outside the spec document
                                                           location_type = readr::col_integer(),
                                                           parent_station = readr::col_character(),
                                                           platform_code = readr::col_character()),
@@ -65,7 +65,7 @@ gtfs_read <- function(path){
                                                           service_id = readr::col_character(),
                                                           block_id = readr::col_character(),
                                                           shape_id = readr::col_character(),
-                                                          wheelchair_accessible = readr::col_integer() #boolean but treat as integer so 0|1 written to file
+                                                          wheelchair_accessible = readr::col_integer() #enum value 2 is valid but rarely seen outside the spec document
                                                           ),
                                   show_col_types = FALSE,
                                   lazy = FALSE)

From f04ae8b4231bf5e3a41bc84a20708dc0ea43b7d6 Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Fri, 25 Aug 2023 15:43:35 +0100
Subject: [PATCH 09/11] pass 'silent' value down the call stack and give some
 level of warning about merge() failing

---
 R/transxchange.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/transxchange.R b/R/transxchange.R
index 5cf8172..b6c6c7a 100644
--- a/R/transxchange.R
+++ b/R/transxchange.R
@@ -208,11 +208,11 @@ transxchange2gtfs <- function(path_in,
 
   if(!silent){ message(paste0(Sys.time(), " Merging GTFS objects"))}
 
-  gtfs_merged <- try(gtfs_merge(gtfs_all, force = force_merge))
+  gtfs_merged <- try(gtfs_merge(gtfs_all, force=force_merge, quiet=silent))
 
   if (class(gtfs_merged) == "try-error") {
-    message("Merging failed, returing unmerged GFTS object for analysis")
-    return(gtfs_all)
+    warning("Merging failed, returing unmerged GFTS object for analysis")
+    return(gtfs_all) #this is not helpful - caller has no idea there was an error and ploughs on, causing strange errors much later on
   }
   return(gtfs_merged)
 }

From b4b5651ebf7205883e4b516a23cc05fd678710f6 Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Fri, 25 Aug 2023 15:44:12 +0100
Subject: [PATCH 10/11] format frequencies table columns correctly

---
 R/write_gtfs.R | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/R/write_gtfs.R b/R/write_gtfs.R
index 2142f7f..a56faa8 100644
--- a/R/write_gtfs.R
+++ b/R/write_gtfs.R
@@ -55,6 +55,16 @@ gtfs_write <- function(gtfs,
     gtfs$stop_times$departure_time <- period2gtfs(gtfs$stop_times$departure_time)
   }
 
+  if("frequencies" %in% names(gtfs))
+  {
+    if("difftime" %in% class(gtfs$frequencies$start_time)){
+      gtfs$frequencies$start_time <- format(gtfs$frequencies$start_time, format = "%H:%M:%S")
+    }
+
+    if("difftime" %in% class(gtfs$frequencies$end_time)){
+      gtfs$frequencies$end_time <- format(gtfs$frequencies$end_time, format = "%H:%M:%S")
+    }
+  }
 
   dir.create(paste0(tempdir(), "/gtfs_temp"))
 

From 4048f2774cb082d1f0964331faf85c81eb84e983 Mon Sep 17 00:00:00 2001
From: oweno-tfwm <owen.oneill@tfwm.org.uk>
Date: Fri, 25 Aug 2023 15:44:56 +0100
Subject: [PATCH 11/11] exercise more code paths with test by producing more
 verbose output

---
 tests/testthat/test_transxchange.R | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/testthat/test_transxchange.R b/tests/testthat/test_transxchange.R
index 112cc7b..92f3578 100644
--- a/tests/testthat/test_transxchange.R
+++ b/tests/testthat/test_transxchange.R
@@ -26,7 +26,8 @@ test_that("test transxchange2gtfs singlecore", {
                             naptan = naptan,
                             ncores = 1,
                             try_mode = FALSE,
-                            force_merge = TRUE)
+                            force_merge = TRUE,
+                            silent = FALSE)
   gtfs_write(gtfs,folder = file_path, name = "txc_gtfs2")
   expect_true(file.exists(file.path(file_path,"txc_gtfs2.zip")))
 
@@ -40,7 +41,8 @@ if(.Platform$OS.type == "unix") {
                               naptan = naptan,
                               ncores = 2,
                               try_mode = FALSE,
-                              force_merge = TRUE)
+                              force_merge = TRUE,
+                              silent = FALSE)
     gtfs_write(gtfs,folder = file_path, name = "txc_gtfs")
     expect_true(file.exists(file.path(file_path,"txc_gtfs.zip")))