Skip to content

Commit

Permalink
Revamp logging
Browse files Browse the repository at this point in the history
Use optparse for script arguments
Simplify renv approach
Change base image to r-ver from r-base
  • Loading branch information
jangevaare committed Apr 19, 2024
1 parent eab5042 commit 4753e0a
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 618 deletions.
26 changes: 22 additions & 4 deletions processing/dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,30 @@
FROM r-base:4.2.2
FROM rocker/r-ver:4.3.3

WORKDIR /home/docker/

# Library initialization using renv
RUN Rscript --vanilla -e "install.packages('renv', repos='https://mirror.csclub.uwaterloo.ca/CRAN/')"
COPY renv.lock .
RUN Rscript --vanilla -e "renv::restore(prompt = F)"
# Install renv itself, using https://cloud.r-project.org as the CRAN repository
RUN Rscript --vanilla -e " \
options(repos = c(CRAN = 'https://cloud.r-project.org')); \
install.packages('renv') \
"

# Direct dependencies of process.R, each pinned to an exact version
# (package@version) and installed through renv.
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
RUN Rscript --vanilla -e " \
  renv::install( \
    packages = c( \
      'arrow@15.0.1', \
      'dplyr@1.1.4', \
      'lubridate@1.9.3', \
      'optparse@1.7.5', \
      'readr@2.1.5', \
      'renv@1.0.7', \
      'stringr@1.5.1', \
      'tidyr@1.3.1' \
    ), \
    prompt = FALSE, \
    lock = TRUE \
  ) \
"
# Data processing code
COPY process.R /home/docker/

Expand Down
177 changes: 96 additions & 81 deletions processing/process.R
Original file line number Diff line number Diff line change
@@ -1,52 +1,57 @@
cat(
"WTISEN Result Pre-processing",
"############################",
sep = "\n")
##############
# Parameters #
##############

cat("\nWorking directory:", getwd(),"\n")
library(optparse)

# Write a timestamped log entry to standard output.
#
# Each entry starts on a new line and is prefixed with the current local
# time formatted as "YYYY-MM-DD HH:MM:SS".
#
# ...  Values to print after the timestamp; forwarded to cat().
# sep  Separator handed to cat(). It is applied between every printed piece,
#      including the leading newline, the timestamp and the space after it.
logger <- function(..., sep = "") {
  timestamp <- format(Sys.time(), format = "%Y-%m-%d %H:%M:%S")
  cat("\n", timestamp, " ", ..., sep = sep)
}

# Command-line interface of the script:
#   -i / --input    path of the CSV file to read
#   -o / --output   path of the parquet file to write
#   -v / --verbose  flag that enables additional diagnostic logging
# The empty-string defaults mean "not supplied"; the script compares
# args$input and args$output against "" before reading or writing.
parser <- OptionParser(
  option_list = list(
    make_option(
      opt_str = c("-i", "--input"),
      help = "Input file, in CSV format.",
      type = "character",
      default = ""
    ),
    make_option(
      opt_str = c("-o", "--output"),
      # The result is written with arrow::write_parquet(), so the help text
      # must advertise parquet rather than CSV.
      help = "Output file, in parquet format.",
      type = "character",
      default = ""
    ),
    make_option(
      opt_str = c("-v", "--verbose"),
      help = "Print additional diagnostic information.",
      action = "store_true",
      default = FALSE
    )
  )
)

# Parse arguments
args <- parse_args(parser)

# Verbose argument: echo the raw command-line arguments.
# logger() joins its pieces with an empty separator, so the arguments are
# collapsed into one space-separated string first; otherwise they would be
# printed run together and be indistinguishable in the log.
if (args$verbose) {
  logger(
    "The following arguments have been passed to R: ",
    paste(commandArgs(trailingOnly = TRUE), collapse = " ")
  )
}

###################
# Data processing #
###################

# Disable package masking warnings for production.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
options(conflicts.policy = list(warn = FALSE))

# Load libraries
library(readr)
library(tidyr)
library(dplyr)
library(stringr)
library(lubridate)

# Script arguments
cat(
"\n\nThis script requires 2 arguments:",
"The path of the CSV export from PHO WTISEN",
"The path of the processed parquet output", sep = "\n-"
)

args = commandArgs(trailingOnly = T)

cat("\nArguments detected:", args, sep = "\n-")

wtisen_input = args[1]
wtisen_output = args[2]

# Extract top content of CSV for logging
cat("\nFile info from PHO WTISEN:\n")
read_csv(
file = wtisen_input,
skip = 1,
n_max = 1,
col_names = F,
col_select = 2,
show_col_types = F) |>
pull(1) |>
str_replace_all(c(
"--" = "\n",
" {2,}" = " ",
"\r" = "",
"\n " = "\n")) |>
cat()


# Utility function
# Postal code cleaner
postalcode_check = function(x){
Expand Down Expand Up @@ -108,37 +113,45 @@ postalcode_cleaner = function(x){
return(x)
}

date_bounds = interval(as.POSIXct("2008-01-01"), Sys.Date())
# Stop early when no input path was supplied on the command line;
# otherwise log which file is about to be read.
if (args$input == "") {
  logger("No input file specified, ending script")
  stop("No input file specified")
}
logger("Reading unprocessed CSV file input from: ", args$input)

# Extract CSV content
wtisen_data = read_csv(
file = wtisen_input,
skip = 3,
col_types = cols_only(
DATE_Collected = col_datetime(format = "%m/%d/%Y %I:%M:%S %p"),
DATE_RECEIVED = col_datetime(format = "%m/%d/%Y %I:%M:%S %p"),
Barcode = col_character(),
Laboratory = col_character(),
Sub_Phone = col_character(),
Sub_Alt_Phone = col_character(),
Sub_First_Name2 = col_character(),
Sub_Last_Name2 = col_character(),
SRC_ADDRESS = col_character(),
SRC_LOT_NUM = col_character(),
SRC_CONCESSION = col_character(),
SRC_CITY = col_character(),
SRC_MUNICIPALITY = col_character(),
SRC_COUNTY = col_character(),
SRC_EMERGENCY_LOC_NO = col_character(),
SRC_POSTAL = col_character(),
ENTRY = col_integer(),
FORMATTED_ENTRY = col_character(),
TOTAL_COLIFORM = col_character(),
E_COLI = col_character(),
DATE_RELEASED = col_datetime(format = "%Y-%m-%d %H:%M:%S"),
DATE_REPORTED = col_datetime(format = "%Y-%m-%d %H:%M:%S"),
REQ_LEGIBLE = col_character())) |>
rename_with(.fn = \(x) str_remove_all(str_to_upper(x), "^SRC_|^SUB_|2$")) |>
file = args$input,
guess_max = 0,
show_col_types = FALSE) |>
rename_with(.fn = \(x) {x |>
str_to_upper() |>
str_remove_all("^SRC_|^SUB_|2$")}) |>
select(
BARCODE,
DATE_COLLECTED,
DATE_RECEIVED,
DATE_RELEASED,
DATE_REPORTED,
LABORATORY,
PHONE,
ALT_PHONE,
FIRST_NAME,
LAST_NAME,
ADDRESS,
LOT_NUM,
CONCESSION,
CITY,
MUNICIPALITY,
COUNTY,
EMERGENCY_LOC_NO,
POSTAL,
ENTRY,
FORMATTED_ENTRY,
TOTAL_COLIFORM,
E_COLI,
REQ_LEGIBLE) |>
mutate(
across(
.cols = c(
Expand All @@ -148,23 +161,25 @@ wtisen_data = read_csv(
"COUNTY"),
.fns = \(x) str_replace(x, "_", " ")),
across(
.cols = where(is.character),
.fns = str_trim),
.cols = starts_with("DATE_"),
.fns = \(x) {x |>
as_datetime(format = c("%m/%d/%Y %I:%M:%S %p", "%Y-%m-%d %H:%M:%S")) |>
force_tz(tz = "America/Toronto")}),
across(
.cols = starts_with("DATE"),
.fns = \(x) force_tz(x, tz = "America/Toronto")),
across(
.cols = starts_with("DATE"),
.fns = \(x) if_else(x %within% date_bounds, x, NA_POSIXct_)),
.cols = where(is.character),
.fns = \(x) str_trim(x)),
POSTAL = postalcode_cleaner(POSTAL),
REQ_LEGIBLE = str_detect(REQ_LEGIBLE, "^y|Y$")) |>
relocate("BARCODE", "REQ_LEGIBLE", starts_with("DATE"))
ENTRY = as.integer(ENTRY),
REQ_LEGIBLE = str_detect(REQ_LEGIBLE, "^y|Y$"))

cat("\nData loaded and processed")
cat("\nDimensions: ", dim(wtisen_data)[1], " x ", dim(wtisen_data)[2], "\n", sep = "")
cat("\nFields:", names(wtisen_data), sep = "\n-")
# Log completion together with the row and column counts of the result.
logger("Data loaded and processed")
logger("Dimensions: ", nrow(wtisen_data), " x ", ncol(wtisen_data))

arrow::write_parquet(wtisen_data, wtisen_output)
cat("\nPre-processed data output to: ", wtisen_output, sep = "")
# Write the processed data as parquet only when an output path was supplied;
# with no path the write is skipped and that fact is logged instead.
if (args$output == "") {
  logger("No output location specified, skipping data output")
} else {
  arrow::write_parquet(wtisen_data, args$output)
  logger("Processed data output in parquet format to: ", args$output)
}

cat("\n\nDone!")
logger("Done!")
Loading

0 comments on commit 4753e0a

Please sign in to comment.