Commit bdd6065

log_with_indent; thin_portal_data; bugfixes
1 parent 4081baf commit bdd6065

3 files changed: +210 -35 lines
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+102695305

src/acquisition_master.R

Lines changed: 2 additions & 1 deletion
@@ -269,7 +269,8 @@ for(dmnrow in 1:nrow(network_domain)){
     logger_module <- 'ms.module'
 
     generate_portal_extras(site_data = site_data,
-                           network_domain = network_domain)
+                           network_domain = network_domain,
+                           thin_portal_data_to_interval = '1 day')
 
     if(length(email_err_msgs)){
         email_err(msgs = email_err_msgs,

src/global/global_helpers.R

Lines changed: 207 additions & 34 deletions
@@ -628,7 +628,7 @@ ms_read_raw_csv <- function(filepath,
                             preprocessed_tibble,
                             datetime_cols,
                             datetime_tz,
-                            datetime_optional_chars = ':',
+                            optionalize_nontoken_characters = ':',
                             site_name_col,
                             alt_site_name,
                             data_cols,
@@ -670,7 +670,17 @@ ms_read_raw_csv <- function(filepath,
    #   columns.
    #datetime_tz: string specifying time zone. this specification must be
    #   among those provided by OlsonNames()
-   #datetime_optional_chars: see "optional" argument to dt_format_to_regex
+   #optionalize_nontoken_characters: character vector; used when there might be
+   #   variation in date/time formatting within a column. in regex speak,
+   #   optionalizing a token string means "match this string if it exists,
+   #   but move on to the next token if it doesn't." All datetime parsing tokens
+   #   (like "%H") are optionalized automatically when this function converts
+   #   them to regex, but other characters that might appear in datetime
+   #   strings, like ":" and "-", are not. Concretely, to read either "%H:%M:%S"
+   #   or "%H:%M" in the same column, set optionalize_nontoken_characters = ':',
+   #   and the parser won't require two colons in order to match a string.
+   #   Don't use this if you don't have to, because it reduces specificity.
+   #   See the "optional" argument to dt_format_to_regex for more details.
    #site_name_col: name of column containing site name information
    #alt_site_name: optional list. Names of list elements are desired site_names
    #   within MacroSheds. List elements are character vectors of alternative
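
A minimal base-R sketch of the optionalization idea (the regexes are hand-written stand-ins for what dt_format_to_regex would emit from '%H:%M:%S'):

    rgx_strict   <- '([0-9]{1,2}):([0-9]{1,2}):([0-9]{1,2})'
    rgx_optional <- '([0-9]{1,2})?:?([0-9]{1,2})?:?([0-9]{1,2})?'
    regmatches('13:45', regexpr(rgx_strict, '13:45'))    #character(0): both colons required
    regmatches('13:45', regexpr(rgx_optional, '13:45'))  #"13:45": the seconds group matches empty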
@@ -942,10 +952,10 @@ ms_read_raw_csv <- function(filepath,
 
     #resolve datetime structure into POSIXct
     d <- resolve_datetime(d = d,
-                         datetime_colnames = datetime_colnames,
-                         datetime_formats = datetime_formats,
-                         datetime_tz = datetime_tz,
-                         optional = datetime_optional_chars)
+                          datetime_colnames = datetime_colnames,
+                          datetime_formats = datetime_formats,
+                          datetime_tz = datetime_tz,
+                          optional = optionalize_nontoken_characters)
 
     #remove rows with NA in datetime or site_name
     d <- filter(d,
@@ -1035,18 +1045,18 @@ resolve_datetime <- function(d,
                              datetime_colnames,
                              datetime_formats,
                              datetime_tz,
-                             optional) {
+                             optional){
 
-    #d is a data.frame or tibble with at least one date or time column
+    #d: a data.frame or tibble with at least one date or time column
     #   (all date and/or time columns must contain character strings,
-    #   not parsed date/time/datetime objects)
-    #datetime_colnames is a character vector of column names that contain
-    #   relevantdatetime information
-    #datetime_formats is a character vector of datetime parsing tokens
+    #   not parsed date/time/datetime objects).
+    #datetime_colnames: character vector; column names that contain
+    #   relevant datetime information.
+    #datetime_formats: character vector; datetime parsing tokens
     #   (like '%A, %Y-%m-%d %I:%M:%S %p' or '%j') corresponding to the
-    #   elements of datetime_colnames
-    # datetime_tz is the time zone of the returned datetime column
-    #optional: see dt_format_to_regex
+    #   elements of datetime_colnames.
+    #datetime_tz: character; time zone of the returned datetime column.
+    #optional: character vector; see dt_format_to_regex.
 
     #return value: d, but with a "datetime" column containing POSIXct datetimes
     #   and without the input datetime columns
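
A hypothetical call (helper assumed to be sourced; values invented) tying the arguments together:

    d <- tibble::tibble(date = c('2020-01-02', '2020-01-03'),
                        time = c('13:45:10', '13:45'),
                        site_name = 'w1')
    d <- resolve_datetime(d = d,
                          datetime_colnames = c('date', 'time'),
                          datetime_formats = c('%Y-%m-%d', '%H:%M:%S'),
                          datetime_tz = 'UTC',
                          optional = ':')
    #d now has a POSIXct "datetime" column; "date" and "time" are dropped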
@@ -1088,6 +1098,22 @@ resolve_datetime <- function(d,
                            colnames(dt_tb)),
                   datetime_formats_split)
 
+    if('H' %in% colnames(dt_tb)){
+        dt_tb$H[dt_tb$H == ''] <- '00'
+    }
+    if('M' %in% colnames(dt_tb)){
+        dt_tb$M[dt_tb$M == ''] <- '00'
+    }
+    if('S' %in% colnames(dt_tb)){
+        dt_tb$S[dt_tb$S == ''] <- '00'
+    }
+    if('I' %in% colnames(dt_tb)){
+        dt_tb$I[dt_tb$I == ''] <- '00'
+    }
+    if('P' %in% colnames(dt_tb)){
+        dt_tb$P[dt_tb$P == ''] <- 'AM'
+    }
+
     dt_tb <- dt_tb %>%
         tidyr::unite(col = 'datetime',
                      everything(),
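
These defaults guard the unite-then-parse step that follows: a time like '13:45' matched against an optionalized '%H:%M:%S' regex leaves an empty seconds capture, and strptime returns NA when a format token has nothing to consume. A base-R sketch (invented values):

    strptime('2020-01-02 13 45 ', format = '%Y-%m-%d %H %M %S')    #NA
    strptime('2020-01-02 13 45 00', format = '%Y-%m-%d %H %M %S')  #parses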
@@ -1112,15 +1138,15 @@ dt_format_to_regex <- function(fmt, optional){
     #fmt is a character vector of datetime formatting strings, such as
     #   '%A, %Y-%m-%d %I:%M:%S %p' or '%j'. each element of fmt that is a
     #   datetime token is replaced with a regex string that matches
-    #   the what the token represents. For example, '%Y' matches a 4-digit
+    #   what the token represents. For example, '%Y' matches a 4-digit
     #   year and '[0-9]{4}' matches a 4-digit numeric sequence. non-token
     #   characters (anything not following a %) are not modified. Note that
     #   tokens B, b, h, A, and a are replaced by '[a-zA-Z]+', which matches
     #   any sequence of one or more alphabetic characters of either case,
     #   not just meaningful month/day names like 'Weds' or 'january'. Also note
     #   that these tokens are not currently accepted: g, G, n, t, c, r, R, T.
-    #optional is a character vector of characters that should be made
-    #   optional in the exported regex (succeeded by a ?). This is useful if
+    #optional is a vector of characters that should be made
+    #   optional in the exported regex (followed by a '?'). This is useful if
     #   e.g. fmt is '%H:%M:%S' and elements to be matched may either appear in
     #   HH:MM:SS or HH:MM format. making the ":" character optional here
     #   (via optional = ':') allows the hour and minute data to be retained,
@@ -1144,10 +1170,10 @@ dt_format_to_regex <- function(fmt, optional){
                   W = '([0-9]{2})?',
                   V = '([0-9]{2})?',
                   C = '([0-9]{2})?',
-                  H = '([0-9]{2})?',
-                  I = '([0-9]{2})?',
-                  M = '([0-9]{2})?',
-                  S = '([0-9]{2})?',
+                  H = '([0-9]{1,2})?',
+                  I = '([0-9]{1,2})?',
+                  M = '([0-9]{1,2})?',
+                  S = '([0-9]{1,2})?',
                   p = '([AP]M)?',
                   z = '([+\\-][0-9]{4})?',
                   `F` = '([0-9]{4}-[0-9]{2}-[0-9]{2})')
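
What the {2} -> {1,2} change buys, shown with hand-written regexes approximating the '%H:%M' conversion (optional = ':'):

    old <- '([0-9]{2})?:?([0-9]{2})?'
    new <- '([0-9]{1,2})?:?([0-9]{1,2})?'
    regmatches('9:05', regexpr(old, '9:05'))  #"": two-digit-only groups miss the single-digit hour
    regmatches('9:05', regexpr(new, '9:05'))  #"9:05"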
@@ -2861,6 +2887,7 @@ delineate_watershed_apriori <- function(lat, long, crs,
                     filename = dem_f,
                     overwrite = TRUE)
 
+    #loses projection
     sf::st_write(obj = site,
                  dsn = point_f,
                  delete_layer = TRUE,
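
A possible workaround sketch for the projection loss noted in the comment (hypothetical; point_f and site as in the surrounding code):

    site2 <- sf::st_read(dsn = point_f, quiet = TRUE)
    site2 <- sf::st_set_crs(site2, sf::st_crs(site))  #re-assign the CRS dropped on write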
@@ -3061,6 +3088,7 @@ delineate_watershed_by_specification <- function(lat,
                     filename = dem_f,
                     overwrite = TRUE)
 
+    #loses projection
     sf::st_write(obj = site,
                  dsn = point_f,
                  delete_layer = TRUE,
@@ -4748,14 +4776,24 @@ choose_projection <- function(lat = NULL,
 
     abslat <- abs(lat)
 
+    # THIS WORKS (PROJECTS STUFF), BUT CAN'T BE READ AUTOMATICALLY BY st_read
     if(abslat < 23){ #tropical
         PROJ4 = glue('+proj=laea +lon_0=', long)
                      # ' +datum=WGS84 +units=m +no_defs')
     } else { #temperate or polar
         PROJ4 = glue('+proj=laea +lat_0=', lat, ' +lon_0=', long)
     }
-                     # ' +datum=WGS84 +units=m +no_defs')
 
+    ## UTM/UPS would be nice for watersheds that don't span more than two zones
+    ## (incomplete)
+    # if(lat > 84 || lat < -80){ #polar; use Universal Polar Stereographic (UPS)
+    #     PROJ4 <- glue('+proj=ups +lon_0=', long)
+    #                   # ' +datum=WGS84 +units=m +no_defs')
+    # } else { #not polar; use UTM
+    #     PROJ4 <- glue('+proj=utm +lat_0=', lat, ' +lon_0=', long)
+    # }
+
+    ## EXTRA CODE FOR CHOOSING PROJECTION BY LATITUDE ONLY
     # if(abslat < 23){ #tropical
     #     PROJ4 <- 9835 #Lambert cylindrical equal area (ellipsoidal; should spherical 9834 be used instead?)
     # } else if(abslat > 23 && abslat < 66){ # middle latitudes
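
For concreteness, the temperate/polar branch with hypothetical coordinates:

    glue('+proj=laea +lat_0=', 44.2, ' +lon_0=', -122.3)
    #"+proj=laea +lat_0=44.2 +lon_0=-122.3"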
@@ -8454,24 +8492,155 @@ pull_usgs_discharge <- function(network, domain, prodname_ms, sites, time_step)
     return()
 }
 
+log_with_indent <- function(msg, logger, level = 'info', indent = 1){
+
+    #level: one of "info", "warn", "error".
+    #indent: the number of indent units to insert after the log prefix colon
+    #   (each unit is two U+2800 blank characters).
+
+    indent_str <- strrep('\U2800\U2800', indent) #strrep, not rep(): rep() would
+        #return a vector for indent > 1 and recycle msg inside paste0()
+
+    if(level == 'info'){
+        loginfo(msg = paste0(enc2native(indent_str),
+                             msg),
+                logger = logger)
+    } else if(level == 'warn'){
+        logwarn(msg = paste0(enc2native(indent_str),
+                             msg),
+                logger = logger)
+    } else if(level == 'error'){
+        logerror(msg = paste0(enc2native(indent_str),
+                              msg),
+                 logger = logger)
+    }
+}
+
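
A quick usage sketch, with the logger name set in acquisition_master.R:

    log_with_indent('cataloging held data', logger = 'ms.module')
    #logs the message prefixed by one indent unit (two U+2800 blanks)
    log_with_indent('subtask', logger = 'ms.module', level = 'warn', indent = 2)
    #same, at WARN level with two indent units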
 generate_portal_extras <- function(site_data,
-                                   network_domain){
+                                   network_domain,
+                                   thin_portal_data_to_interval = NA){
+
+    #thin_portal_data_to_interval: passed to the "unit" parameter of
+    #   lubridate::round_date. set to NA (the default) to prevent thinning.
 
     #for post-derive steps that save the portal some processing.
 
-    loginfo(msg = 'Generating portal extras',
+    loginfo(msg = 'Generating portal extras:',
             logger = logger_module)
 
-    loginfo('scaling flux by area', logger = logger_module)
+    log_with_indent('scaling flux by area', logger = logger_module)
     calculate_flux_by_area(site_data = site_data)
-    loginfo('writing config datasets to local dir', logger = logger_module)
+
+    log_with_indent('writing config datasets to local dir', logger = logger_module)
     write_portal_config_datasets()
-    loginfo('cataloguing held data', logger = logger_module)
-    catalogue_held_data(site_data = site_data,
-                        network_domain = network_domain)
-    loginfo('combining watershed boundaries', logger = logger_module)
+
+    log_with_indent('cataloging held data', logger = logger_module)
+    catalog_held_data(site_data = site_data,
+                      network_domain = network_domain)
+
+    log_with_indent('combining watershed boundaries', logger = logger_module)
     combine_ws_boundaries()
+
+    log_with_indent('determining which domains have Q', logger = logger_module)
     list_domains_with_discharge(site_data = site_data)
+
+    if(! is.na(thin_portal_data_to_interval)){
+        log_with_indent(glue('thinning portal datasets to {i}',
+                             i = thin_portal_data_to_interval),
+                        logger = logger_module)
+        thin_portal_data(network_domain = network_domain,
+                         thin_interval = thin_portal_data_to_interval)
+    }
+}
+
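
Note that lubridate::round_date rounds to the nearest unit boundary rather than flooring, so late-day timestamps move to the next day (invented timestamp):

    x <- as.POSIXct('2020-06-15 18:40:00', tz = 'UTC')
    lubridate::round_date(x, unit = '1 day')  #"2020-06-16 UTC"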
+thin_portal_data <- function(network_domain, thin_interval){
+
+    #thin_interval: passed to the "unit" parameter of lubridate::round_date
+
+    domains <- network_domain$domain
+
+    n_domains <- length(domains)
+    for(i in 1:n_domains){
+
+        dmn <- domains[i]
+
+        log_with_indent(msg = glue('{d}: ({ii}/{n})',
+                                   d = dmn,
+                                   ii = i,
+                                   n = n_domains),
+                        logger = logger_module,
+                        indent = 2)
+
+        prod_dirs <- try(
+            {
+                list.files(path = glue('../portal/data/{d}/',
+                                       d = dmn),
+                           full.names = FALSE,
+                           recursive = FALSE)
+            },
+            silent = TRUE
+        )
+
+        if(length(prod_dirs)){
+
+            #filter products that never need to be thinned. keep the ones that might
+            rgx <- paste0('(^precipitation|^precip_chemistry|^discharge',
+                          '|^precip_flux|^stream_chemistry|^stream_flux)')
+            prod_dirs <- grep(pattern = rgx,
+                              x = prod_dirs,
+                              value = TRUE)
+        }
+
+        for(prd in prod_dirs){
+
+            site_files <- list.files(path = glue('../portal/data/{d}/{p}',
+                                                 d = dmn,
+                                                 p = prd),
+                                     full.names = TRUE,
+                                     recursive = FALSE)
+
+            #sum precip depth within each interval; average everything else
+            if(prd == 'precipitation'){
+                agg_call <- quote(sum(val, na.rm = TRUE))
+            } else {
+                agg_call <- quote(mean(val, na.rm = TRUE))
+            }
+
+            for(stf in site_files){
+
+                #check whether this file needs to be thinned
+                dtcol <- read_feather(stf, columns = 'datetime')
+                interval_min <- Mode(diff(as.numeric(dtcol$datetime)) / 60)
+                needs_thin <- ! is.na(interval_min) && interval_min <= 24 * 60
+
+                if(needs_thin){
+
+                    d <- read_feather(stf) %>%
+                        mutate(
+                            datetime = lubridate::round_date(
+                                x = datetime,
+                                unit = thin_interval),
+                            val = errors::set_errors(val, val_err)) %>%
+                        select(-val_err)
+
+                    if(length(unique(d$site_name)) > 1){
+                        stop(paste('Multiple site_names in', stf))
+                    }
+
+                    d %>%
+                        group_by(datetime, var) %>%
+                        summarize(
+                            site_name = first(site_name),
+                            val = eval(agg_call),
+                            ms_status = numeric_any(ms_status),
+                            ms_interp = numeric_any(ms_interp)) %>%
+                        ungroup() %>%
+                        mutate(val_err = errors(val),
+                               val_err = ifelse(is.na(val_err), 0, val_err),
+                               val = errors::drop_errors(val)) %>%
+                        select(datetime, site_name, var, val, ms_status, ms_interp,
+                               val_err) %>%
+                        write_feather(stf)
+                }
+            }
+        }
+    }
 }
 
 list_domains_with_discharge <- function(site_data){
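
A minimal sketch of the thinning round trip above, on invented data: uncertainties are attached with the errors package so they propagate through the daily aggregation, then split back into plain val/val_err columns for storage:

    library(dplyr)
    library(errors)

    d <- tibble::tibble(
        datetime = as.POSIXct(c('2020-01-01 05:30:00', '2020-01-01 09:00:00'), tz = 'UTC'),
        val = c(1, 3),
        val_err = c(0.1, 0.2))

    d %>%
        mutate(datetime = lubridate::round_date(datetime, unit = '1 day'),
               val = set_errors(val, val_err)) %>%
        select(-val_err) %>%
        group_by(datetime) %>%
        summarize(val = mean(val)) %>%
        mutate(val_err = errors(val),
               val = drop_errors(val))
    #one row: val 2, val_err ~0.11 (propagated through the mean)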
@@ -8868,7 +9037,7 @@ munge_versionless_product <- function(network,
     }
 }
 
-catalogue_held_data <- function(network_domain, site_data){
+catalog_held_data <- function(network_domain, site_data){
 
     #tabulates:
     #   + total nonspatial observations for the portal landing page
@@ -9036,11 +9205,14 @@ catalogue_held_data <- function(network_domain, site_data){
         }
     }
 
-    # setwd('../portal/data/')
+    readr::write_file(x = as.character(nobs_nonspatial),
+                      file = '../portal/data/general/total_nonspatial_observations.txt')
+
     dir.create('../portal/data/general/catalog_files',
                showWarnings = FALSE)
 
     #generate and write file describing all variables
+
     all_variable_display <- all_variable_breakdown %>%
         group_by(VariableCode) %>%
         summarize(
@@ -9115,6 +9287,7 @@ catalogue_held_data <- function(network_domain, site_data){
     #generate and write file describing all sites
     #TODO: make sure to include a note about datum on display page
     #   also, include url column somehow
+
     all_site_display <- all_variable_breakdown %>%
         group_by(network, domain, site_name) %>%
         summarize(
@@ -9150,6 +9323,7 @@ catalogue_held_data <- function(network_domain, site_data){
              file = '../portal/data/general/catalog_files/all_sites.csv')
 
     #generate and write individual file for each site, describing it by variable
+
     dir.create('../portal/data/general/catalog_files/indiv_sites',
                showWarnings = FALSE)
 
@@ -9194,7 +9368,6 @@ catalogue_held_data <- function(network_domain, site_data){
                             s = sit))
     }
 
-
     #in case somebody asks for this stuff again:
 
     # #domains per network
