From 1a7c8322ff09dcdf31298cf9e59095d9a9e0cd8a Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Tue, 15 Aug 2023 21:40:52 +0100 Subject: [PATCH 01/11] fix failure to merge BODS and NR gtfs files because data type of join columns not being explicitly set - and some fields assumed to be present in route table (route_desc) are not present in BODS data --- R/gtfs_merge.R | 7 +++++-- R/gtfs_read.R | 47 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R index a1f1b3e..cf08bfe 100644 --- a/R/gtfs_merge.R +++ b/R/gtfs_merge.R @@ -131,8 +131,11 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { route_id$route_id_new <- seq(1, nrow(route_id)) routes <- dplyr::left_join(routes, route_id, by = c("file_id", "route_id")) - routes <- routes[, c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type")] - names(routes) <- c("route_id", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type") + + columns_to_select <- c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type") + columns_to_select <- columns_to_select[columns_to_select %in% colnames(routes)] + routes <- routes[, columns_to_select] + names(routes) <- columns_to_select } diff --git a/R/gtfs_read.R b/R/gtfs_read.R index 5f58818..089f2a5 100644 --- a/R/gtfs_read.R +++ b/R/gtfs_read.R @@ -20,20 +20,24 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"agency.txt"))){ gtfs$agency <- readr::read_csv(file.path(tmp_folder,"agency.txt"), - col_types = readr::cols(agency_id = readr::col_character()), + col_types = readr::cols(agency_id = readr::col_character(), + agency_noc = readr::col_character()), show_col_types = FALSE, lazy = FALSE) } else { warning("Unable to find required file: agency.txt") } - if(checkmate::test_file_exists(file.path(tmp_folder,"stops.txt"))){ gtfs$stops <- readr::read_csv(file.path(tmp_folder,"stops.txt"), col_types = readr::cols(stop_id = readr::col_character(), stop_code = readr::col_character(), stop_name = readr::col_character(), stop_lat = readr::col_number(), - stop_lon = readr::col_number()), + stop_lon = readr::col_number(), + wheelchair_boarding = readr::col_logical(), + location_type = readr::col_integer(), + parent_station = readr::col_character(), + platform_code = readr::col_character()), lazy = FALSE, show_col_types = FALSE) @@ -46,7 +50,8 @@ gtfs_read <- function(path){ col_types = readr::cols(route_id = readr::col_character(), agency_id = readr::col_character(), route_short_name = readr::col_character(), - route_long_name = readr::col_character()), + route_long_name = readr::col_character(), + route_type = readr::col_integer()), show_col_types = FALSE, lazy = FALSE) } else { @@ -56,7 +61,12 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"trips.txt"))){ gtfs$trips <- readr::read_csv(file.path(tmp_folder,"trips.txt"), col_types = readr::cols(trip_id = readr::col_character(), - route_id = readr::col_character()), + route_id = readr::col_character(), + service_id = readr::col_character(), + block_id = readr::col_character(), + shape_id = readr::col_character(), + wheelchair_accessible = readr::col_logical() + ), show_col_types = FALSE, lazy = FALSE) } else { @@ -66,8 +76,14 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"stop_times.txt"))){ gtfs$stop_times <- readr::read_csv(file.path(tmp_folder,"stop_times.txt"), col_types = readr::cols(trip_id = readr::col_character(), + stop_id = readr::col_character(), + stop_sequence = readr::col_integer(), departure_time = readr::col_character(), - arrival_time = readr::col_character()), + arrival_time = readr::col_character(), + shape_dist_traveled = readr::col_number(), + timepoint = readr::col_logical(), + pickup_type = readr::col_integer(), + drop_off_type = readr::col_integer()), show_col_types = FALSE, lazy = FALSE) gtfs$stop_times$arrival_time <- lubridate::hms(gtfs$stop_times$arrival_time) @@ -79,7 +95,15 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"calendar.txt"))){ gtfs$calendar <- readr::read_csv(file.path(tmp_folder,"calendar.txt"), - col_types = readr::cols(start_date = readr::col_date(format = "%Y%m%d"), + col_types = readr::cols(service_id = readr::col_character(), + monday = readr::col_logical(), + tuesday = readr::col_logical(), + wednesday = readr::col_logical(), + thursday = readr::col_logical(), + friday = readr::col_logical(), + saturday = readr::col_logical(), + sunday = readr::col_logical(), + start_date = readr::col_date(format = "%Y%m%d"), end_date = readr::col_date(format = "%Y%m%d")), show_col_types = FALSE, lazy = FALSE) @@ -90,7 +114,9 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"calendar_dates.txt"))){ gtfs$calendar_dates <- readr::read_csv(file.path(tmp_folder,"calendar_dates.txt"), - col_types = readr::cols(date = readr::col_date(format = "%Y%m%d")), + col_types = readr::cols(service_id = readr::col_character(), + date = readr::col_date(format = "%Y%m%d"), + exception_type = readr::col_integer()), show_col_types = FALSE, lazy = FALSE) } else { @@ -115,6 +141,11 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"shapes.txt"))){ gtfs$shapes <- readr::read_csv(file.path(tmp_folder,"shapes.txt"), + col_types = readr::cols(shape_id = readr::col_character(), + shape_pt_lat = readr::col_number(), + shape_pt_lon = readr::col_number(), + shape_pt_sequence = readr::col_integer(), + shape_dist_traveled = readr::col_number()), show_col_types = FALSE, lazy = FALSE) } else { From b31dfb76689add03f59dc155543009a3879d5a8a Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 24 Aug 2023 22:05:43 +0100 Subject: [PATCH 02/11] trip id is a varchar in BODS data - remove assumption that it's an int --- R/gtfs_subset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/gtfs_subset.R b/R/gtfs_subset.R index b824899..c26ff8d 100644 --- a/R/gtfs_subset.R +++ b/R/gtfs_subset.R @@ -36,7 +36,7 @@ gtfs_clip <- function(gtfs, bounds) { gtfs$stop_times <- gtfs$stop_times[gtfs$stop_times$stop_id %in% stops_inc, ] # Check for single stop trips n_stops <- table(gtfs$stop_times$trip_id) - single_stops <- as.integer(names(n_stops[n_stops == 1])) + single_stops <- names(n_stops[n_stops == 1]) gtfs$stop_times <- gtfs$stop_times[!gtfs$stop_times$trip_id %in% single_stops, ] # Check for any unused stops From 074826c931844c152a848dfa1d4de37eab43759a Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 24 Aug 2023 22:07:04 +0100 Subject: [PATCH 03/11] exception_type is an int - tests failing after tightening up data type definitions --- R/transxchange_export.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/transxchange_export.R b/R/transxchange_export.R index 8517cc5..acae130 100644 --- a/R/transxchange_export.R +++ b/R/transxchange_export.R @@ -499,12 +499,12 @@ transxchange_export <- function(obj, calendar_dates <- data.frame( trip_id = character(), date = character(), - exception_type = character(), + exception_type = integer(), stringsAsFactors = FALSE ) calendar_summary <- dplyr::group_by(calendar, start_date, end_date, DaysOfWeek) } else { - # remove calendar_dates for trips that have been competly removed + # remove calendar_dates for trips that have been competely removed calendar_dates <- calendar_dates[calendar_dates$trip_id %in% calendar$trip_id, ] calendar_summary <- dplyr::group_by(calendar, start_date, end_date, DaysOfWeek) From 8d4c2d84d00d8677ab14ff65b55881f976fe69f4 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 24 Aug 2023 22:10:03 +0100 Subject: [PATCH 04/11] code was assuming a set of tables, and hence dumping tables like 'frequencies' and 'feed_info' when reading / writing BODS data. Removed assumptions about table names, all additional tables encountered in zip are read and written. also performance improvement when writing. --- R/gtfs_read.R | 63 ++++++++++++++++++-------------------------------- R/write_gtfs.R | 46 +++++++++++++++--------------------- 2 files changed, 41 insertions(+), 68 deletions(-) diff --git a/R/gtfs_read.R b/R/gtfs_read.R index 089f2a5..fcd585d 100644 --- a/R/gtfs_read.R +++ b/R/gtfs_read.R @@ -16,7 +16,6 @@ gtfs_read <- function(path){ files <- list.files(tmp_folder, pattern = ".txt") gtfs <- list() - message_log <- c("Unable to find optional files: ") if(checkmate::test_file_exists(file.path(tmp_folder,"agency.txt"))){ gtfs$agency <- readr::read_csv(file.path(tmp_folder,"agency.txt"), @@ -27,6 +26,7 @@ gtfs_read <- function(path){ } else { warning("Unable to find required file: agency.txt") } + if(checkmate::test_file_exists(file.path(tmp_folder,"stops.txt"))){ gtfs$stops <- readr::read_csv(file.path(tmp_folder,"stops.txt"), col_types = readr::cols(stop_id = readr::col_character(), @@ -34,7 +34,7 @@ gtfs_read <- function(path){ stop_name = readr::col_character(), stop_lat = readr::col_number(), stop_lon = readr::col_number(), - wheelchair_boarding = readr::col_logical(), + wheelchair_boarding = readr::col_integer(), #boolean but treat as integer so 0|1 written to file location_type = readr::col_integer(), parent_station = readr::col_character(), platform_code = readr::col_character()), @@ -65,7 +65,7 @@ gtfs_read <- function(path){ service_id = readr::col_character(), block_id = readr::col_character(), shape_id = readr::col_character(), - wheelchair_accessible = readr::col_logical() + wheelchair_accessible = readr::col_integer() #boolean but treat as integer so 0|1 written to file ), show_col_types = FALSE, lazy = FALSE) @@ -81,7 +81,7 @@ gtfs_read <- function(path){ departure_time = readr::col_character(), arrival_time = readr::col_character(), shape_dist_traveled = readr::col_number(), - timepoint = readr::col_logical(), + timepoint = readr::col_integer(), #boolean but treat as integer so 0|1 written to file pickup_type = readr::col_integer(), drop_off_type = readr::col_integer()), show_col_types = FALSE, @@ -96,13 +96,13 @@ gtfs_read <- function(path){ if(checkmate::test_file_exists(file.path(tmp_folder,"calendar.txt"))){ gtfs$calendar <- readr::read_csv(file.path(tmp_folder,"calendar.txt"), col_types = readr::cols(service_id = readr::col_character(), - monday = readr::col_logical(), - tuesday = readr::col_logical(), - wednesday = readr::col_logical(), - thursday = readr::col_logical(), - friday = readr::col_logical(), - saturday = readr::col_logical(), - sunday = readr::col_logical(), + monday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + tuesday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + wednesday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + thursday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + friday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + saturday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + sunday = readr::col_integer(), #boolean but treat as integer so 0|1 written to file start_date = readr::col_date(format = "%Y%m%d"), end_date = readr::col_date(format = "%Y%m%d")), show_col_types = FALSE, @@ -123,22 +123,6 @@ gtfs_read <- function(path){ message("Unable to find conditionally required file: calendar_dates.txt") } - if(checkmate::test_file_exists(file.path(tmp_folder,"fare_attributes.txt"))){ - gtfs$fare_attributes <- readr::read_csv(file.path(tmp_folder,"fare_attributes.txt"), - show_col_types = FALSE, - lazy = FALSE) - } else { - message_log <- c(message_log, "fare_attributes.txt") - } - - if(checkmate::test_file_exists(file.path(tmp_folder,"fare_rules.txt"))){ - gtfs$fare_rules <- readr::read_csv(file.path(tmp_folder,"fare_rules.txt"), - show_col_types = FALSE, - lazy = FALSE) - } else { - message_log <- c(message_log, "fare_rules.txt") - } - if(checkmate::test_file_exists(file.path(tmp_folder,"shapes.txt"))){ gtfs$shapes <- readr::read_csv(file.path(tmp_folder,"shapes.txt"), col_types = readr::cols(shape_id = readr::col_character(), @@ -148,25 +132,24 @@ gtfs_read <- function(path){ shape_dist_traveled = readr::col_number()), show_col_types = FALSE, lazy = FALSE) - } else { - message_log <- c(message_log, "shapes.txt") - } - - if(checkmate::test_file_exists(file.path(tmp_folder,"transfers.txt"))){ - gtfs$transfers <- readr::read_csv(file.path(tmp_folder,"transfers.txt"), - show_col_types = FALSE, - lazy = FALSE) - } else { - message_log <- c(message_log, "transfers.txt") } - unlink(tmp_folder, recursive = TRUE) + #load any other tables in the .zip file + filenamesOnly <- tools::file_path_sans_ext(basename(files)) + notLoadedFiles = setdiff( filenamesOnly, names(gtfs) ) - if(length(message_log) > 0){ - message(paste(message_log, collapse = " ")) + for (fileName in notLoadedFiles) + { + table <- readr::read_csv(file.path( tmp_folder, paste0( fileName, ".txt" ) ), + show_col_types = FALSE, + lazy = FALSE) + gtfs[[fileName]] <- table } + #remove temp directory + unlink(tmp_folder, recursive = TRUE) + return(gtfs) } diff --git a/R/write_gtfs.R b/R/write_gtfs.R index 6e2e5d9..2142f7f 100644 --- a/R/write_gtfs.R +++ b/R/write_gtfs.R @@ -1,6 +1,6 @@ #' Write GTFS #' -#' Takes a list of data frames represneting the GTFS fromat and saves them as GTFS +#' Takes a list of data frames representing the GTFS format and saves them as GTFS #' Zip file. #' #' @param gtfs named list of data.frames @@ -19,6 +19,7 @@ gtfs_write <- function(gtfs, stripTab = TRUE, stripNewline = TRUE, quote = FALSE) { + if (stripComma) { for (i in seq_len(length(gtfs))) { gtfs[[i]] <- stripCommas(gtfs[[i]]) @@ -33,7 +34,6 @@ gtfs_write <- function(gtfs, #Format Dates - if(class(gtfs$calendar$start_date) == "Date"){ gtfs$calendar$start_date <- format(gtfs$calendar$start_date, "%Y%m%d") } @@ -57,24 +57,20 @@ gtfs_write <- function(gtfs, dir.create(paste0(tempdir(), "/gtfs_temp")) - data.table::fwrite(gtfs$calendar, paste0(tempdir(), "/gtfs_temp/calendar.txt"), row.names = FALSE, quote = quote) - if (nrow(gtfs$calendar_dates) > 0) { - data.table::fwrite(gtfs$calendar_dates, paste0(tempdir(), "/gtfs_temp/calendar_dates.txt"), row.names = FALSE, quote = quote) - } - data.table::fwrite(gtfs$routes, paste0(tempdir(), "/gtfs_temp/routes.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$stop_times, paste0(tempdir(), "/gtfs_temp/stop_times.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$trips, paste0(tempdir(), "/gtfs_temp/trips.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$stops, paste0(tempdir(), "/gtfs_temp/stops.txt"), row.names = FALSE, quote = quote) - data.table::fwrite(gtfs$agency, paste0(tempdir(), "/gtfs_temp/agency.txt"), row.names = FALSE, quote = quote) - if ("transfers" %in% names(gtfs)) { - data.table::fwrite(gtfs$transfers, paste0(tempdir(), "/gtfs_temp/transfers.txt"), row.names = FALSE, quote = quote) - } - if ("shapes" %in% names(gtfs)) { - data.table::fwrite(gtfs$shapes, paste0(tempdir(), "/gtfs_temp/shapes.txt"), row.names = FALSE, quote = quote) + + for ( tableName in names(gtfs) ) + { + table <- gtfs[[tableName]] + + if ( !is.null(table) & nrow(table) > 0 ) + { + data.table::fwrite(table, file.path(tempdir(), "gtfs_temp", paste0(tableName, ".txt")), row.names = FALSE, quote = quote) + } } + zip::zipr(paste0(folder, "/", name, ".zip"), list.files(paste0(tempdir(), "/gtfs_temp"), full.names = TRUE), recurse = FALSE) + unlink(paste0(tempdir(), "/gtfs_temp"), recursive = TRUE) - message(paste0(folder, "/", name, ".zip")) } @@ -126,9 +122,11 @@ stripTabs <- function(df, stripNewline) { #' Convert Period to GTFS timestamps +#' When writing a 400mb (zipped) file, we spend nearly 4 minutes in this fn(), about 10x longer than writing the files to the filesystem. +#' profiler reports this being mostly nchar(), so we optimise down to one sprintf which reduces the time to 1 minute +#' .format() is about 7x slower than sprintf() #' -#' -#' @param x peridos +#' @param x periods #' @noRd #' period2gtfs <- function(x) { @@ -139,14 +137,6 @@ period2gtfs <- function(x) { stop("Days detected in period objects, incorectly formatted period object") } - hrs <- as.character(lubridate::hour(x)) - min <- as.character(lubridate::minute(x)) - sec <- as.character(lubridate::second(x)) - - hrs <- ifelse(nchar(hrs) == 1,paste0("0",hrs), hrs) - min <- ifelse(nchar(min) == 1,paste0("0",min), min) - sec <- ifelse(nchar(sec) == 1,paste0("0",sec), sec) - - return(paste0(hrs,":",min,":",sec)) + return( sprintf("%02d:%02d:%02d", lubridate::hour(x), lubridate::minute(x), lubridate::second(x)) ) } From ccd451e6f0795aa16d8ef0a3d842f9465eade36e Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 24 Aug 2023 22:12:42 +0100 Subject: [PATCH 05/11] merge was expecting a fixed set of tables and columns and throwing away anything else that it wasn't expecting. OK for the GTFS tables generated by this tool from CIF files, but caused data loss when using BODS data. modified to keep all encountered columns and tables - passes through if there isn't any specific code for the table in question. --- R/gtfs_merge.R | 225 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 147 insertions(+), 78 deletions(-) diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R index cf08bfe..cf3f1de 100644 --- a/R/gtfs_merge.R +++ b/R/gtfs_merge.R @@ -1,5 +1,11 @@ #' merge a list of gtfs files #' +#' !WARNING! only the tables: +#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes +#' are processed, any other tables in the input timetables are passed through +#' +#' if duplicate IDs are detected then completely new ID for all rows will be generated in the output. +#' #' @param gtfs_list a list of gtfs objects to be merged #' @param force logical, if TRUE duplicated values are merged taking the fist #' @param quiet logical, if TRUE less messages @@ -8,40 +14,62 @@ #' @export gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { - # remove any NULLS + # remove any empty input tables gtfs_list <- gtfs_list[lengths(gtfs_list) != 0] + flattened <- unlist(gtfs_list, recursive = FALSE) + rm(gtfs_list) - # Split out lists - agency <- sapply(gtfs_list, "[", "agency") - stops <- sapply(gtfs_list, "[", "stops") - routes <- sapply(gtfs_list, "[", "routes") - trips <- sapply(gtfs_list, "[", "trips") - stop_times <- sapply(gtfs_list, "[", "stop_times") - calendar <- sapply(gtfs_list, "[", "calendar") - calendar_dates <- sapply(gtfs_list, "[", "calendar_dates") + #get unique input table names + tableNames <- unique(names(flattened)) + + grouped_list <- list() + + # Loop through table names names and group data frames + for (tableName in tableNames) { + + matched <- purrr::imap( flattened, function( item, name ) { + if (name == tableName) { + return(item) + } + }) + + #remove empty input tables + matched <- matched[lengths(matched) != 0] + + #assign each instance of the input table a unique number + names(matched) <- seq(1, length(matched)) - # bind together - names(agency) <- seq(1, length(agency)) - suppressWarnings(agency <- dplyr::bind_rows(agency, .id = "file_id")) + #add a column to the data frame containing this unique number + suppressWarnings(matched <- dplyr::bind_rows(matched, .id = "file_id")) - names(stops) <- seq(1, length(stops)) - suppressWarnings(stops <- dplyr::bind_rows(stops, .id = "file_id")) + #if("calendar_dates"==tableName) + #{ + # #don't understand what this is doing ? comment would be nice. + # calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] + # #matched <- matched[sapply(matched, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] + #} - names(routes) <- seq(1, length(routes)) - suppressWarnings(routes <- dplyr::bind_rows(routes, .id = "file_id")) + #add to map + grouped_list[[tableName]] <- matched + } - names(trips) <- seq(1, length(trips)) - suppressWarnings(trips <- dplyr::bind_rows(trips, .id = "file_id")) + rm(flattened) - names(stop_times) <- seq(1, length(stop_times)) - suppressWarnings(stop_times <- dplyr::bind_rows(stop_times, .id = "file_id")) + # Split out lists + agency <- grouped_list$agency + stops <- grouped_list$stops + routes <- grouped_list$routes + trips <- grouped_list$trips + stop_times <- grouped_list$stop_times + calendar <- grouped_list$calendar + calendar_dates <- grouped_list$calendar_dates + shapes <- grouped_list$shapes + frequencies <- grouped_list$frequencies - names(calendar) <- seq(1, length(calendar)) - suppressWarnings(calendar <- dplyr::bind_rows(calendar, .id = "file_id")) + #remove items from map. + grouped_list <- grouped_list[setdiff(names(grouped_list), + c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes", "frequencies" ))] - names(calendar_dates) <- seq(1, length(calendar_dates)) - calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] - suppressWarnings(calendar_dates <- dplyr::bind_rows(calendar_dates, .id = "file_id")) # fix typo agency$agency_name <- as.character(agency$agency_name) @@ -112,49 +140,54 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { } else { stop("Duplicated Stop IDS") } - - } # routes if (any(duplicated(routes$route_id))) { if(!quiet){message("De-duplicating route_id")} - route_id <- routes[, c("file_id", "route_id")] - if (any(duplicated(route_id))) { + + retainedColumnNames <- colnames(routes)[!(colnames(routes) %in% c("route_id", "file_id"))] + + new_route_id <- routes[, c("file_id", "route_id")] + if (any(duplicated(new_route_id))) { if(force){ - routes <- routes[!duplicated(route_id), ] - route_id <- routes[, c("file_id", "route_id")] + routes <- routes[!duplicated(new_route_id), ] + new_route_id <- routes[, c("file_id", "route_id")] } else { stop("Duplicated route_id within the same GTFS file, try using force = TRUE") } } - route_id$route_id_new <- seq(1, nrow(route_id)) - routes <- dplyr::left_join(routes, route_id, by = c("file_id", "route_id")) + new_route_id$route_id_new <- seq(1, nrow(new_route_id)) + routes <- dplyr::left_join(routes, new_route_id, by = c("file_id", "route_id")) - columns_to_select <- c("route_id_new", "agency_id", "route_short_name", "route_long_name", "route_desc", "route_type") - columns_to_select <- columns_to_select[columns_to_select %in% colnames(routes)] - routes <- routes[, columns_to_select] - names(routes) <- columns_to_select + routes <- routes[, c("route_id_new", retainedColumnNames)] + routes <- routes %>% dplyr::rename(route_id = route_id_new) } # calendar if (any(duplicated(calendar$service_id))) { if(!quiet){message("De-duplicating service_id")} - service_id <- calendar[, c("file_id", "service_id")] - if (any(duplicated(service_id))) { + + new_service_id <- calendar[, c("file_id", "service_id")] + if (any(duplicated(new_service_id))) { stop("Duplicated service_id within the same GTFS file") } - service_id$service_id_new <- seq(1, nrow(service_id)) - calendar <- dplyr::left_join(calendar, service_id, by = c("file_id", "service_id")) - calendar <- calendar[, c("service_id_new", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")] - names(calendar) <- c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date") + + new_service_id$service_id_new <- seq(1, nrow(new_service_id)) + + retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] + calendar <- dplyr::left_join(calendar, new_service_id, by = c("file_id", "service_id")) + calendar <- calendar[, c("service_id_new", retainedColumnNames)] + names(calendar) <- c("service_id", retainedColumnNames) if (nrow(calendar_dates) > 0) { - calendar_dates <- dplyr::left_join(calendar_dates, service_id, by = c("file_id", "service_id")) - calendar_dates <- calendar_dates[, c("service_id_new", "date", "exception_type")] - names(calendar_dates) <- c("service_id", "date", "exception_type") + retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))] + + calendar_dates <- dplyr::left_join(calendar_dates, new_service_id, by = c("file_id", "service_id")) + calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames)] + calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new) } } @@ -162,45 +195,60 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { # Trips if (any(duplicated(trips$trip_id))) { if(!quiet){message("De-duplicating trip_id")} - trip_id <- trips[, c("file_id", "trip_id")] - if (any(duplicated(trip_id))) { + + new_trip_id <- trips[, c("file_id", "trip_id")] + if (any(duplicated(new_trip_id))) { if(force){ trips <- unique(trips) stop_times <- unique(stop_times) - trip_id <- trips[, c("file_id", "trip_id")] + new_trip_id <- trips[, c("file_id", "trip_id")] } else{ stop("Duplicated trip_id within the same GTFS file") } } - trip_id$trip_id_new <- seq(1, nrow(trip_id)) - trips <- dplyr::left_join(trips, trip_id, by = c("file_id", "trip_id")) - trips <- trips[, c("route_id", "service_id", "trip_id_new", "file_id")] - names(trips) <- c("route_id", "service_id", "trip_id", "file_id") - - - stop_times <- dplyr::left_join(stop_times, trip_id, by = c("file_id", "trip_id")) - stop_times <- stop_times[, c("trip_id_new", "arrival_time", "departure_time", "stop_id", "stop_sequence", "timepoint")] - names(stop_times) <- c("trip_id", "arrival_time", "departure_time", "stop_id", "stop_sequence", "timepoint") + new_trip_id$trip_id_new <- seq(1, nrow(new_trip_id)) + + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("trip_id"))] + trips <- dplyr::left_join(trips, new_trip_id, by = c("file_id", "trip_id")) + trips <- trips[, c("trip_id_new", retainedColumnNames)] + trips <- trips %>% dplyr::rename(trip_id = trip_id_new) + + retainedColumnNames <- colnames(stop_times)[!(colnames(stop_times) %in% c("trip_id", "file_id"))] + stop_times <- dplyr::left_join(stop_times, new_trip_id, by = c("file_id", "trip_id")) + stop_times <- stop_times[, c("trip_id_new", retainedColumnNames)] + stop_times <- stop_times %>% dplyr::rename(trip_id = trip_id_new) + + if ( length(frequencies) > 0 ) + { + retainedColumnNames <- colnames(frequencies)[!(colnames(frequencies) %in% c("trip_id", "file_id"))] + frequencies <- dplyr::left_join(frequencies, new_trip_id, by = c("file_id", "trip_id")) + frequencies <- frequencies[, c("trip_id_new", retainedColumnNames)] + frequencies <- frequencies %>% dplyr::rename(trip_id = trip_id_new) + } } - if (exists("service_id")) { - trips <- dplyr::left_join(trips, service_id, by = c("file_id", "service_id")) - trips <- trips[, c("route_id", "service_id_new", "trip_id", "file_id")] - names(trips) <- c("route_id", "service_id", "trip_id", "file_id") + + if (exists("new_service_id")) { + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id"))] + trips <- dplyr::left_join(trips, new_service_id, by = c("file_id", "service_id")) + trips <- trips[, c(retainedColumnNames, "service_id_new")] + trips <- trips %>% dplyr::rename(service_id = service_id_new) } - if (exists("route_id")) { - trips <- dplyr::left_join(trips, route_id, by = c("file_id", "route_id")) - trips <- trips[, c("route_id_new", "service_id", "trip_id", "file_id")] - names(trips) <- c("route_id", "service_id", "trip_id", "file_id") + + if (exists("new_route_id")) { + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("route_id"))] + trips <- dplyr::left_join(trips, new_route_id, by = c("file_id", "route_id")) + trips <- trips[, c("route_id_new", retainedColumnNames)] + trips <- trips %>% dplyr::rename(route_id = route_id_new) } - trips <- trips[, c("route_id", "service_id", "trip_id")] - names(trips) <- c("route_id", "service_id", "trip_id") + trips$file_id <- NULL # Condense Duplicate Service patterns if (nrow(calendar_dates) > 0) { if(!quiet){message("Condensing duplicated service patterns")} + calendar_dates_summary <- dplyr::group_by(calendar_dates, service_id) if(class(calendar_dates_summary$date) == "Date"){ calendar_dates_summary <- dplyr::summarise(calendar_dates_summary, @@ -221,27 +269,48 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { calendar_summary$service_id_new <- dplyr::group_indices(calendar_summary) calendar_summary <- calendar_summary[, c("service_id_new", "service_id")] + retainedColumnNames <- colnames(trips)[!(colnames(trips) %in% c("service_id", "route_id"))] trips <- dplyr::left_join(trips, calendar_summary, by = c("service_id")) - trips <- trips[, c("route_id", "service_id_new", "trip_id")] - names(trips) <- c("route_id", "service_id", "trip_id") + trips <- trips[, c("route_id", "service_id_new", retainedColumnNames)] + trips <- trips %>% dplyr::rename(service_id = service_id_new) + retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] calendar <- dplyr::left_join(calendar, calendar_summary, by = c("service_id")) - calendar <- calendar[, c("service_id_new", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date")] - names(calendar) <- c("service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date") + calendar <- calendar[, c("service_id_new", retainedColumnNames)] + calendar <- calendar %>% dplyr::rename(service_id = service_id_new) calendar <- calendar[!duplicated(calendar$service_id), ] - + retainedColumnNames <- colnames(calendar_dates)[!(colnames(calendar_dates) %in% c("service_id", "file_id"))] calendar_dates <- dplyr::left_join(calendar_dates, calendar_summary, by = c("service_id")) - calendar_dates <- calendar_dates[, c("service_id_new", "date", "exception_type")] - names(calendar_dates) <- c("service_id", "date", "exception_type") + calendar_dates <- calendar_dates[, c("service_id_new", retainedColumnNames)] + calendar_dates <- calendar_dates %>% dplyr::rename(service_id = service_id_new) calendar_dates <- calendar_dates[!duplicated(calendar_dates$service_id), ] } + # shapes are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object + composite_key <- paste0(shapes$shape_id, shapes$shape_pt_sequence, sep = "#") + if (any(duplicated(composite_key))) { + if(force){ + shapes <- shapes[!duplicated(composite_key),] + } else { + stop("Duplicated Shapes IDS") + } + } + + shapes$file_id <- NULL stop_times$file_id <- NULL routes$file_id <- NULL calendar$file_id <- NULL + res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies) + names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes","frequencies") + + #for tables we don't explicitly process - hope items are unique + for (item in grouped_list) { + item$file_id <- NULL + } + + #remove nulls (e.g. tables that are often empty like frequencies) + res_final <- Filter(Negate(is.null), res_final) - res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates) - names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates") - return(res_final) + return (c(res_final, grouped_list)) } From c9f7cce8da16710a0a7518e380675c7697a37d42 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Thu, 24 Aug 2023 22:29:38 +0100 Subject: [PATCH 06/11] added FOC YG + MV (appearing in NR timetable extracts) renamed ZZ to reflect it's a generic obfuscated freight, not a specific operator --- data/atoc_agency.rda | Bin 2055 -> 2133 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data/atoc_agency.rda b/data/atoc_agency.rda index 93c47cad3b74f115a6b19afb7c8bdd3d7cd81550..97495dad84e4893b5dc671509aefc7d12b9e8329 100644 GIT binary patch literal 2133 zcmV-b2&(rViwFP!000002JKqia}!4rcOay|fC-Qg5=fZj5*MjD+uGHqQ}^O1{yHSX zdgTvzQ7x;nw6V0Sn%%YRd%FMX{=4IPc4m5K*0LO4@{kjSOw;q5{_dW}X8*V2^5XI0 z!otGMg->tXSh#r$$c0-6ovoF|0&w6jpDcX3a0mXp>m`xj_4+jQF97}+Sf9b?bNK8^ z$c>GCq>dETY$COTl&h%CHc~n7Kv7$GcRQ!{kV0+SsBQaLQ9HjPg?D#4NFi>wrKr|# zNNpmug%n!adW{rX&_e55sH2Ti+J}nT!_&RDigGrPLYp15*+Huul;@yT&NfmQUFV-j zy+R7TbkGk6Bk7<=&Ksm0q|i$Ty>!qI2eaj%mk#E`$-R81C>Q;3(NY&}anTkRZE?{8 z7bSPlqYmOa`6=o>L`#oQ$`Sf;jB!53)3+GYckc=net%;)47|{%WZw&hlKFUpvEa-j z9p(jLOpd%Ejpflz-}WW>usH~Z31wk+b>dwp%u|;#KO#-I){8=rPTV1-W6I?D7n@$3 zkao};dSNe?TIAX7&6GuP;wkKt!+<5JHzb@trXcH$mkbp8%dLQcsHQg#62LU0Fr zT-#$51u1F}bxNyJVwm&wc5 z;R;BDD78I*09M97tH`z@KN>|za7GCf%30uJaXpf<55itS_+p!~m|n2FM(-VZ%o_!< zUbpvET6N~PA|Y?Vw0ICOO4?Dx;zDA6G>nI!_lT2v45AgU9kD3vla8wz>#2ct(>SK* z=O;~+&~BixgZ_$Hv@C2IIp%?l!t!)!Z=qQxy5ME zAMlcsuVXmj;Ah)Z~3$!#^sYrtgg zD;poU6EB3duF&GQd7Wu<;i1r$wT&_N7CQrPMB`x)o~niv;~7w4{Dy`hM6WQ9p{LK& zn7r&gheW(ONRjj9E>tlJ={3+AQ{34r9{;cXaa z!BTN;!t)t7VRR-~FtKE z4&M!uWcz$tDoynX;BSHU=;aS0}+~Fs8l0Ta8$MjsI<} zu(N!l&|LJveyhFfy1RSZAW{`_dA7W)+`4kB>6)v89)Lc7x_z6 zOY%jibVYhm4Pd{TmfmTirr4|pH0HyD`#(+Ird|LtTv0k65B*G>_Z}{<;T0wVy0l4jb~zeym|l`&7w*9 z$80}D#o`mz&6Z77xkPUfWl1RB$jkjZN6k@l)SrR+sW{l>UMd$06Ku^M0PcXEoCI{( z>lU}y7nii?8b(u1j)#7P`>l>zxyA~y0vDc4eMa(CX!g!9^NG`e3GT~N+|iedWSqaM zSG+_a-=n!7{CvbWXbJlJY=`A~-8I(hwb^&X^4;aDc2{4%ugP$ZSK(FqD%V^Ti+lGc zyVkw5xa;ev&v$rV?C{fq|AW$_J%gjDM`O+qtAw5=RZN3y?H|Cd{_%}c_^zS&Zk$}x z=FB;B&YUyn%)dCZ@WW9W&Z#-|m!ixHzgL1^M1GLhs8mUSfhBoL7U3 zRbthu$SjvZSPec^l$asw#Iiv|y)jeuALR*i%9Tdd&NTzCWY()REmCRJ>K_^3%<#HxgNcUWRg2V1HmIpv$kc9^ zj96woEy7nRs*@^YqcLaxUol06l%Oqbdq#m|rDPEsdUze`?&Ie~J!4d%baT~$M-1jw z!y{`ztA+)?1nmp;os?moSaw;nvdkmKd6m2^%VitdA9}Nv zUpLy){BL2mLjK(h`4W64nRmp06YB=*yU}+ggM%e&QL<8|e$e(na*|*~|CRqZUELje zVP82ZS>pkR6%6u+UaSmK%*CFUc&jIjt2g0ySOouqmpZNWZm literal 2055 zcmV+i2>ACxT4*^jL0KkKS)Y2IIsgkEf5`uT|NsAI|MWlq|M0*6-|#>H03ZMX;0S+3 z$$D6&ZM)q!PTdC}06GR3qzQrv>Uv4&o}kcpQKp`ugwPrdJx^28c|8px4^wIY9*{JX zLV5ujo}uj#Jw}ZH007VdkOM#sG%$bxrkYJ8%4%qOo`?bJ2dD#020@?z000d!44MEX zGDtN+r1dm<_DYY6j9--zApkFcbA%;+j&K1o6})1Nwf6m8#j-)Lmh5r?IyPohiJi^7`h5IyGB@zApJ1 z$}ibrLbP_ciMwRI1OU@V-CCF6hRGsGhjDlD-5)wY+0S2vRyw;9fkQXGk$o_EPjVy;0Fk^W5rxKdHx;6yxKn)^+qO?PS z3NBXx7uz+*mz}OyQ4^`{AK@VrtlG(K?W!XEvn`|B4iPhiC}SJ+G1ix{UYyIBszU)0 zx4HM$(%4ARaOx6@EM?*iUQPppT#FNj$re`F;NV~evBt%SgkreQrE_aL>=at~EJfjU zA|Pm4xE$J^-ET>mzA7wp$0is%(?|-zB#qNcCa(IJ2;Byhj1CdGz=aL5yc#sJP0l`q zkQ;*(kPR&BZnTaYDbi>BcqB9lZ#*p>{-^{V7K9Nj-76HSmXc8g1Q8p&ZpmO7?ZiTE z#M_*_EJH~RwIq`eew33bPlj$!^*b>NqY2^HeH)z6DqPWI;9`y&U|>XHX!6S@WQIBo zboytaBD8;M5~flHe^-bb!!scK!gM`rMB3~S&h)K0u<;V zijX*^oLdPzc7oc|K-vZYMIZwP{lPt02MWcj!9um&tGh0{UG1tyA#p~F5lAF0uGC$K zwNFV_nkihSZ>qt=sf37t2dN5#9dW+MobW;#L5S^@gG(<@fq9=>jan(^6U1i|8h257 z3Vvt1`?jHsK^g=%00IJG>lrXDQftj9B@~7LH$iuw2n*G(Ki1OIEh}F#N(qMiD(@Yb z&tN=4%4Z(}baA+_Ip?1o#H`Q(DOyGrM}Ag&A^%z=nt>!1#fV{py%kDq3ep1Ipve1+ zSyHUVA!DEAx^#V)L8oVvgV&i>JU!p@A!OG^>*ReSBK^93`fuNNCoSR7ZJ|a62pAuh zrt{)h;($Z*ZPv-Vu5V?t!+|C^*PkOWJ$`UnnFdl1s9T+?dTTm*MDRZQ2+!A zD5z9KC0NMR9)^LXhUmmA;h1x65wHtZ$+Uqb7>rDdZp$}s+`7TBjjE^_fX-ZCU(=q1 zyOfUytAPSx++N|fCDncZxS29WO6;7*_kmIXf1!UzAzU{J7Fau@#(F%9v8Utgppn=B9#NrX z>4s&L%0Wbzq=Ix~(vehp(TceG#*Wau^5u_VL<{bKDNEiGCIXXDNPvNiZH<6wXvv`_ zm`iR9?P%MEbK<+UP`g$}b0E|BD`H%-(p8~SM!~TUV0tvPFxP-cp3;nMD z8QV z(nqs>Wye?0IWoXiBa`zSMf4^@;nvf2Rb6}n{3fw2m##C=<_K81fM!za0D?RRH#E%% znflp(UW}yKtO%1xks;_9F}UL=_&M+2q=Bep5Q;*xL=!E-W}+c$Vkqp?GR&p~1nv!lf>8tw zb2ZN5{(Vi&V>H;=C?T}&LoYfRC5pNSj}Vf*phYW;wcQq>iKw9?OtmC{nL|wrl~4+H zJuVIR8KEQL$4cRAzB|qi(Fit__9}(9aKbuu5PL{NV1sRr;uQ30A`OWf5+MMrfhizM ln^w?irp?>4Wt=ofT~s6)l$G!I&>+9!?ntK!5) Date: Fri, 25 Aug 2023 15:41:50 +0100 Subject: [PATCH 07/11] fix null columns being inserted in calendar_dates key field when a calendar_date exists with no calendar (rare but valid data configuration) --- R/gtfs_merge.R | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/R/gtfs_merge.R b/R/gtfs_merge.R index cf3f1de..4926c20 100644 --- a/R/gtfs_merge.R +++ b/R/gtfs_merge.R @@ -1,20 +1,21 @@ #' merge a list of gtfs files #' #' !WARNING! only the tables: -#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes +#' agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies #' are processed, any other tables in the input timetables are passed through #' -#' if duplicate IDs are detected then completely new ID for all rows will be generated in the output. +#' if duplicate IDs are detected then completely new IDs for all rows will be generated in the output. #' #' @param gtfs_list a list of gtfs objects to be merged #' @param force logical, if TRUE duplicated values are merged taking the fist -#' @param quiet logical, if TRUE less messages #' instance to be the correct instance, in most cases this is ok, but may #' cause some errors +#' @param quiet logical, if TRUE less messages +#' @param condenseServicePatterns logical, if TRUE service patterns across all routes are condensed into a unique set of patterns #' @export -gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { +gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE, condenseServicePatterns = TRUE) { - # remove any empty input tables + # remove any empty input GTFS objects gtfs_list <- gtfs_list[lengths(gtfs_list) != 0] flattened <- unlist(gtfs_list, recursive = FALSE) rm(gtfs_list) @@ -33,7 +34,7 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { } }) - #remove empty input tables + #remove input tables not matching tableName matched <- matched[lengths(matched) != 0] #assign each instance of the input table a unique number @@ -41,10 +42,11 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { #add a column to the data frame containing this unique number suppressWarnings(matched <- dplyr::bind_rows(matched, .id = "file_id")) + matched$file_id <- as.integer(matched$file_id) #if("calendar_dates"==tableName) #{ - # #don't understand what this is doing ? comment would be nice. + # #don't understand what this complex line is doing ? comment would be nice. # calendar_dates <- calendar_dates[sapply(calendar_dates, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] # #matched <- matched[sapply(matched, function(x){ifelse(is.null(nrow(x)),0,nrow(x))}) > 0] #} @@ -167,7 +169,9 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { # calendar - if (any(duplicated(calendar$service_id))) { + calendar_dates_key <- paste(calendar_dates$service_id, calendar_dates$date, calendar_dates$exception_type, sep="#") + + if (any(duplicated(calendar$service_id)) || any(duplicated(calendar_dates_key))) { if(!quiet){message("De-duplicating service_id")} new_service_id <- calendar[, c("file_id", "service_id")] @@ -175,6 +179,10 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { stop("Duplicated service_id within the same GTFS file") } + # it is valid to have calendar_dates with no associated calendar (see comments further down) + # so create the distinct set of service_id in both calendar and calendar_dates + new_service_id <- dplyr::union(unique(new_service_id), unique(calendar_dates[, c("file_id", "service_id")])) + new_service_id$service_id_new <- seq(1, nrow(new_service_id)) retainedColumnNames <- colnames(calendar)[!(colnames(calendar) %in% c("service_id", "file_id"))] @@ -246,9 +254,18 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { trips$file_id <- NULL # Condense Duplicate Service patterns - if (nrow(calendar_dates) > 0) { + + # in an ideal world we should not have a trip without a service pattern, and not have calendar_dates with no associated calendar, + # but the real world data isn't that tidy. + # In a typical all GB BODS extract Around 0.2% of trips have a calendar ID but no row in calendar, + # 0.2% of calendar_dates have no trips, 0.1% of calendar_dates have no corresponding calendar. + # we need to guard against this to make sure we don't end up putting null values into any key fields + # This documentation https://gtfs.org/schedule/reference/#calendar_datestxt specifically mentions calendar dates without calendars + # as being a legitimate way to construct the data. + if (condenseServicePatterns && nrow(calendar_dates) > 0) { if(!quiet){message("Condensing duplicated service patterns")} + #find every unique combination of calendar_dates and calender values calendar_dates_summary <- dplyr::group_by(calendar_dates, service_id) if(class(calendar_dates_summary$date) == "Date"){ calendar_dates_summary <- dplyr::summarise(calendar_dates_summary, @@ -260,12 +277,15 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { ) } - calendar_summary <- dplyr::left_join(calendar, calendar_dates_summary, by = "service_id") + #we want to keep all rows in calendar_dates even if they don't have a row in calendar + calendar_summary <- dplyr::full_join(calendar, calendar_dates_summary, by = "service_id") calendar_summary <- dplyr::group_by( calendar_summary, start_date, end_date, monday, tuesday, wednesday, thursday, friday, saturday, sunday, pattern ) + + #give every unique combination of dates / days / exceptions a new distinct service ID calendar_summary$service_id_new <- dplyr::group_indices(calendar_summary) calendar_summary <- calendar_summary[, c("service_id_new", "service_id")] @@ -287,7 +307,8 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { calendar_dates <- calendar_dates[!duplicated(calendar_dates$service_id), ] } - # shapes are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object + + # shapes in a BODS extract are keyed on a UUID type string, so fairly improbable that the keys collide unless it's actually the same object composite_key <- paste0(shapes$shape_id, shapes$shape_pt_sequence, sep = "#") if (any(duplicated(composite_key))) { if(force){ @@ -301,6 +322,7 @@ gtfs_merge <- function(gtfs_list, force = FALSE, quiet = TRUE) { stop_times$file_id <- NULL routes$file_id <- NULL calendar$file_id <- NULL + frequencies$file_id <- NULL res_final <- list(agency, stops, routes, trips, stop_times, calendar, calendar_dates, shapes, frequencies) names(res_final) <- c("agency", "stops", "routes", "trips", "stop_times", "calendar", "calendar_dates", "shapes","frequencies") From bf8f460acbf624f594215cbea94132c602a924de Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 25 Aug 2023 15:42:38 +0100 Subject: [PATCH 08/11] correct comment - column is a 3 valued enum, not bool --- R/gtfs_read.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/gtfs_read.R b/R/gtfs_read.R index fcd585d..ead852c 100644 --- a/R/gtfs_read.R +++ b/R/gtfs_read.R @@ -34,7 +34,7 @@ gtfs_read <- function(path){ stop_name = readr::col_character(), stop_lat = readr::col_number(), stop_lon = readr::col_number(), - wheelchair_boarding = readr::col_integer(), #boolean but treat as integer so 0|1 written to file + wheelchair_boarding = readr::col_integer(), #enum value 2 is valid but rarely seen outside the spec document location_type = readr::col_integer(), parent_station = readr::col_character(), platform_code = readr::col_character()), @@ -65,7 +65,7 @@ gtfs_read <- function(path){ service_id = readr::col_character(), block_id = readr::col_character(), shape_id = readr::col_character(), - wheelchair_accessible = readr::col_integer() #boolean but treat as integer so 0|1 written to file + wheelchair_accessible = readr::col_integer() #enum value 2 is valid but rarely seen outside the spec document ), show_col_types = FALSE, lazy = FALSE) From f04ae8b4231bf5e3a41bc84a20708dc0ea43b7d6 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 25 Aug 2023 15:43:35 +0100 Subject: [PATCH 09/11] pass 'silent' value down the call stack and give some level of warning about merge() failing --- R/transxchange.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/transxchange.R b/R/transxchange.R index 5cf8172..b6c6c7a 100644 --- a/R/transxchange.R +++ b/R/transxchange.R @@ -208,11 +208,11 @@ transxchange2gtfs <- function(path_in, if(!silent){ message(paste0(Sys.time(), " Merging GTFS objects"))} - gtfs_merged <- try(gtfs_merge(gtfs_all, force = force_merge)) + gtfs_merged <- try(gtfs_merge(gtfs_all, force=force_merge, quiet=silent)) if (class(gtfs_merged) == "try-error") { - message("Merging failed, returing unmerged GFTS object for analysis") - return(gtfs_all) + warning("Merging failed, returing unmerged GFTS object for analysis") + return(gtfs_all) #this is not helpful - caller has no idea there was an error and ploughs on, causing strange errors much later on } return(gtfs_merged) } From b4b5651ebf7205883e4b516a23cc05fd678710f6 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 25 Aug 2023 15:44:12 +0100 Subject: [PATCH 10/11] format frequencies table columns correctly --- R/write_gtfs.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/R/write_gtfs.R b/R/write_gtfs.R index 2142f7f..a56faa8 100644 --- a/R/write_gtfs.R +++ b/R/write_gtfs.R @@ -55,6 +55,16 @@ gtfs_write <- function(gtfs, gtfs$stop_times$departure_time <- period2gtfs(gtfs$stop_times$departure_time) } + if("frequencies" %in% names(gtfs)) + { + if("difftime" %in% class(gtfs$frequencies$start_time)){ + gtfs$frequencies$start_time <- format(gtfs$frequencies$start_time, format = "%H:%M:%S") + } + + if("difftime" %in% class(gtfs$frequencies$end_time)){ + gtfs$frequencies$end_time <- format(gtfs$frequencies$end_time, format = "%H:%M:%S") + } + } dir.create(paste0(tempdir(), "/gtfs_temp")) From 4048f2774cb082d1f0964331faf85c81eb84e983 Mon Sep 17 00:00:00 2001 From: oweno-tfwm Date: Fri, 25 Aug 2023 15:44:56 +0100 Subject: [PATCH 11/11] exercise more code paths with test by producing more verbose output --- tests/testthat/test_transxchange.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_transxchange.R b/tests/testthat/test_transxchange.R index 112cc7b..92f3578 100644 --- a/tests/testthat/test_transxchange.R +++ b/tests/testthat/test_transxchange.R @@ -26,7 +26,8 @@ test_that("test transxchange2gtfs singlecore", { naptan = naptan, ncores = 1, try_mode = FALSE, - force_merge = TRUE) + force_merge = TRUE, + silent = FALSE) gtfs_write(gtfs,folder = file_path, name = "txc_gtfs2") expect_true(file.exists(file.path(file_path,"txc_gtfs2.zip"))) @@ -40,7 +41,8 @@ if(.Platform$OS.type == "unix") { naptan = naptan, ncores = 2, try_mode = FALSE, - force_merge = TRUE) + force_merge = TRUE, + silent = FALSE) gtfs_write(gtfs,folder = file_path, name = "txc_gtfs") expect_true(file.exists(file.path(file_path,"txc_gtfs.zip")))