Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Version 2 #3

Merged
merged 3 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,20 @@ URL to access WTISEN
Four-digit Public Health Unit ID
**Example**: `1234`

**3. `start`**
**3. `report`**
Report name: the part of the report URL that follows `/RSReports/` and ends with `.rdl`
**Example**: `Water+Testing+PHU+Report.rdl`

**4. `start`**
The start date for the records you want to retrieve in the format of `YYYY-MM-DD`
**Example**: `2022-01-01`

**4. `end`**
The end date for the records you want to retrieve in the format of `YYYY-MM-DD`. Must be less than 3 years after `start`.
**5. `end`**
The end date for the records you want to retrieve in the format of `YYYY-MM-DD`
**Example**: `2022-12-31`

**5. `output`**
The filename where the output will be written
**6. `output`**
The filename where the output will be written in CSV format
**Example**: `wtisen.csv`

## Processing Container
Expand Down
26 changes: 22 additions & 4 deletions processing/dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,30 @@
FROM r-base:4.2.2
FROM rocker/r-ver:4.3.3

WORKDIR /home/docker/

# Library initialization using renv
RUN Rscript --vanilla -e "install.packages('renv', repos='https://mirror.csclub.uwaterloo.ca/CRAN/')"
COPY renv.lock .
RUN Rscript --vanilla -e "renv::restore(prompt = F)"
RUN Rscript --vanilla -e " \
options(repos = c(CRAN = 'https://cloud.r-project.org')); \
install.packages('renv') \
"

# Direct dependencies
RUN Rscript --vanilla -e " \
renv::install( \
packages = c( \
'arrow@15.0.1', \
'dplyr@1.1.4', \
'lubridate@1.9.3', \
'optparse@1.7.5', \
'readr@2.1.5', \
'renv@1.0.7', \
'stringr@1.5.1', \
'tidyr@1.3.1' \
), \
prompt = F, \
lock = T \
) \
"
# Data processing code
COPY process.R /home/docker/

Expand Down
177 changes: 96 additions & 81 deletions processing/process.R
Original file line number Diff line number Diff line change
@@ -1,52 +1,57 @@
cat(
"WTISEN Result Pre-processing",
"############################",
sep = "\n")
##############
# Parameters #
##############

cat("\nWorking directory:", getwd(),"\n")
library(optparse)

# Timestamped console logger.
# Writes a newline, the current local time (YYYY-MM-DD HH:MM:SS), a space and
# then every value passed through `...` to standard output via cat().
#   ...  values to print after the timestamp
#   sep  separator handed to cat(); applied between ALL pieces, including the
#        leading newline and the timestamp (default: no separator)
# Returns NULL invisibly (the return value of cat()).
logger = function(..., sep = "") {
  timestamp = format(Sys.time(), format = "%Y-%m-%d %H:%M:%S")
  cat("\n", timestamp, " ", ..., sep = sep)
}

# Command-line interface of the processing script (optparse).
# Both paths default to the empty string; later steps of the script compare
# against "" to detect a missing input or output argument.
parser = OptionParser(
  option_list = list(

    # Path of the raw export to read
    make_option(
      opt_str = c("-i", "--input"),
      help = "Input file, in CSV format.",
      type = "character",
      default = ""),

    # Path of the processed result; the script writes it with
    # arrow::write_parquet(), so the help text advertises parquet, not CSV
    make_option(
      opt_str = c("-o", "--output"),
      help = "Output file, in parquet format.",
      type = "character",
      default = ""),

    # Flag (no value): presence switches diagnostic logging on
    make_option(
      opt_str = c("-v", "--verbose"),
      help = "Print additional diagnostic information.",
      action = "store_true",
      default = FALSE)
  )
)

# Parse arguments
args = parse_args(parser)

# Verbose argument
# When --verbose is set, echo the raw command-line arguments. logger() joins
# its pieces with sep = "" by default, so the arguments are first collapsed
# into one space-separated string; otherwise the message and every argument
# would be printed run together.
if (args$verbose) {
  logger(
    "The following arguments have been passed to R: ",
    paste(commandArgs(trailingOnly = TRUE), collapse = " "))
}

###################
# Data processing #
###################

# Disable package masking warnings for production
options(conflicts.policy = list("warn" = F))

# Load libraries
library(readr)
library(tidyr)
library(dplyr)
library(stringr)
library(lubridate)

# Script arguments
cat(
"\n\nThis script requires 2 arguments:",
"The path of the CSV export from PHO WTISEN",
"The path of the processed parquet output", sep = "\n-"
)

args = commandArgs(trailingOnly = T)

cat("\nArguments detected:", args, sep = "\n-")

wtisen_input = args[1]
wtisen_output = args[2]

# Extract top content of CSV for logging
cat("\nFile info from PHO WTISEN:\n")
read_csv(
file = wtisen_input,
skip = 1,
n_max = 1,
col_names = F,
col_select = 2,
show_col_types = F) |>
pull(1) |>
str_replace_all(c(
"--" = "\n",
" {2,}" = " ",
"\r" = "",
"\n " = "\n")) |>
cat()


# Utility function
# Postal code cleaner
postalcode_check = function(x){
Expand Down Expand Up @@ -108,37 +113,45 @@ postalcode_cleaner = function(x){
return(x)
}

date_bounds = interval(as.POSIXct("2008-01-01"), Sys.Date())
# Guard: without an input path there is nothing to process, so log the reason
# and abort the script. Otherwise announce which file is about to be read.
if (args$input == "") {
  logger("No input file specified, ending script")
  stop("No input file specified")
}
logger("Reading unprocessed CSV file input from: ", args$input)

# Extract CSV content
wtisen_data = read_csv(
file = wtisen_input,
skip = 3,
col_types = cols_only(
DATE_Collected = col_datetime(format = "%m/%d/%Y %I:%M:%S %p"),
DATE_RECEIVED = col_datetime(format = "%m/%d/%Y %I:%M:%S %p"),
Barcode = col_character(),
Laboratory = col_character(),
Sub_Phone = col_character(),
Sub_Alt_Phone = col_character(),
Sub_First_Name2 = col_character(),
Sub_Last_Name2 = col_character(),
SRC_ADDRESS = col_character(),
SRC_LOT_NUM = col_character(),
SRC_CONCESSION = col_character(),
SRC_CITY = col_character(),
SRC_MUNICIPALITY = col_character(),
SRC_COUNTY = col_character(),
SRC_EMERGENCY_LOC_NO = col_character(),
SRC_POSTAL = col_character(),
ENTRY = col_integer(),
FORMATTED_ENTRY = col_character(),
TOTAL_COLIFORM = col_character(),
E_COLI = col_character(),
DATE_RELEASED = col_datetime(format = "%Y-%m-%d %H:%M:%S"),
DATE_REPORTED = col_datetime(format = "%Y-%m-%d %H:%M:%S"),
REQ_LEGIBLE = col_character())) |>
rename_with(.fn = \(x) str_remove_all(str_to_upper(x), "^SRC_|^SUB_|2$")) |>
file = args$input,
guess_max = 0,
show_col_types = FALSE) |>
rename_with(.fn = \(x) {x |>
str_to_upper() |>
str_remove_all("^SRC_|^SUB_|2$")}) |>
select(
BARCODE,
DATE_COLLECTED,
DATE_RECEIVED,
DATE_RELEASED,
DATE_REPORTED,
LABORATORY,
PHONE,
ALT_PHONE,
FIRST_NAME,
LAST_NAME,
ADDRESS,
LOT_NUM,
CONCESSION,
CITY,
MUNICIPALITY,
COUNTY,
EMERGENCY_LOC_NO,
POSTAL,
ENTRY,
FORMATTED_ENTRY,
TOTAL_COLIFORM,
E_COLI,
REQ_LEGIBLE) |>
mutate(
across(
.cols = c(
Expand All @@ -148,23 +161,25 @@ wtisen_data = read_csv(
"COUNTY"),
.fns = \(x) str_replace(x, "_", " ")),
across(
.cols = where(is.character),
.fns = str_trim),
.cols = starts_with("DATE_"),
.fns = \(x) {x |>
as_datetime(format = c("%m/%d/%Y %I:%M:%S %p", "%Y-%m-%d %H:%M:%S")) |>
force_tz(tz = "America/Toronto")}),
across(
.cols = starts_with("DATE"),
.fns = \(x) force_tz(x, tz = "America/Toronto")),
across(
.cols = starts_with("DATE"),
.fns = \(x) if_else(x %within% date_bounds, x, NA_POSIXct_)),
.cols = where(is.character),
.fns = \(x) str_trim(x)),
POSTAL = postalcode_cleaner(POSTAL),
REQ_LEGIBLE = str_detect(REQ_LEGIBLE, "^y|Y$")) |>
relocate("BARCODE", "REQ_LEGIBLE", starts_with("DATE"))
ENTRY = as.integer(ENTRY),
REQ_LEGIBLE = str_detect(REQ_LEGIBLE, "^y|Y$"))

cat("\nData loaded and processed")
cat("\nDimensions: ", dim(wtisen_data)[1], " x ", dim(wtisen_data)[2], "\n", sep = "")
cat("\nFields:", names(wtisen_data), sep = "\n-")
# Report completion and the shape (rows x columns) of the processed table
logger("Data loaded and processed")
logger("Dimensions: ", nrow(wtisen_data), " x ", ncol(wtisen_data))

arrow::write_parquet(wtisen_data, wtisen_output)
cat("\nPre-processed data output to: ", wtisen_output, sep = "")
# Write the processed table as parquet only when an output path was given;
# an empty path means the run is log-only and nothing is written.
if (args$output == "") {
  logger("No output location specified, skipping data output")
} else {
  arrow::write_parquet(wtisen_data, args$output)
  logger("Processed data output in parquet format to: ", args$output)
}

cat("\n\nDone!")
logger("Done!")
Loading