@@ -628,7 +628,7 @@ ms_read_raw_csv <- function(filepath,
                            preprocessed_tibble,
                            datetime_cols,
                            datetime_tz,
-                           datetime_optional_chars = ':',
+                           optionalize_nontoken_characters = ':',
                            site_name_col,
                            alt_site_name,
                            data_cols,
@@ -670,7 +670,17 @@ ms_read_raw_csv <- function(filepath,
    #   columns.
    # datetime_tz: string specifying time zone. this specification must be
    #   among those provided by OlsonNames()
-   # datetime_optional_chars: see "optional" argument to dt_format_to_regex
+   # optionalize_nontoken_characters: character vector; used when there might be
+   #   variation in date/time formatting within a column. in regex speak,
+   #   optionalizing a token string means, "match this string if it exists,
+   #   but move on to the next token if it doesn't." All datetime parsing tokens
+   #   (like "%H") are optionalized automatically when this function converts
+   #   them to regex. But other tokens like ":" and "-" that might be used in
+   #   datetime strings are not. Concretely, if you wanted to read either "%H:%M:%S"
+   #   or "%H:%M" in the same column, you'd set optionalize_nontoken_characters = ':',
+   #   and then the parser wouldn't require there to be two colons in order to
+   #   match the string. Don't use this if you don't have to, because it reduces
+   #   specificity. See "optional" argument to dt_format_to_regex for more details.
    # site_name_col: name of column containing site name information
    # alt_site_name: optional list. Names of list elements are desired site_names
    #   within MacroSheds. List elements are character vectors of alternative
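Editor's note (not part of the commit): a minimal base-R sketch of why optionalizing ':' matters. The real patterns come from dt_format_to_regex and will differ in detail; this only shows that a required colon rejects "%H:%M"-style values while an optional one accepts both forms, at some cost in specificity.

times <- c('07:30:15', '07:30')

rgx_required <- '^([0-9]{1,2}):([0-9]{1,2}):([0-9]{1,2})$'     # both colons required
rgx_optional <- '^([0-9]{1,2}):?([0-9]{1,2})?:?([0-9]{1,2})?$' # each ':' followed by '?'

grepl(rgx_required, times) # TRUE FALSE -- "07:30" is rejected
grepl(rgx_optional, times) # TRUE TRUE  -- both forms match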
@@ -942,10 +952,10 @@ ms_read_raw_csv <- function(filepath,

    # resolve datetime structure into POSIXct
    d <- resolve_datetime(d = d,
-                          datetime_colnames = datetime_colnames,
-                          datetime_formats = datetime_formats,
-                          datetime_tz = datetime_tz,
-                          optional = datetime_optional_chars)
+                          datetime_colnames = datetime_colnames,
+                          datetime_formats = datetime_formats,
+                          datetime_tz = datetime_tz,
+                          optional = optionalize_nontoken_characters)

    # remove rows with NA in datetime or site_name
    d <- filter(d,
@@ -1035,18 +1045,18 @@ resolve_datetime <- function(d,
                             datetime_colnames,
                             datetime_formats,
                             datetime_tz,
-                            optional) {
+                            optional){

-   # d is a data.frame or tibble with at least one date or time column
+   # d: a data.frame or tibble with at least one date or time column
    #   (all date and/or time columns must contain character strings,
-   #   not parsed date/time/datetime objects)
-   # datetime_colnames is a character vector of column names that contain
-   #   relevantdatetime information
-   # datetime_formats is a character vector of datetime parsing tokens
+   #   not parsed date/time/datetime objects).
+   # datetime_colnames: character vector; column names that contain
+   #   relevant datetime information.
+   # datetime_formats: character vector; datetime parsing tokens
    #   (like '%A, %Y-%m-%d %I:%M:%S %p' or '%j') corresponding to the
-   #   elements of datetime_colnames
-   # datetime_tz is the time zone of the returned datetime column
-   # optional: see dt_format_to_regex
+   #   elements of datetime_colnames.
+   # datetime_tz: character; time zone of the returned datetime column.
+   # optional: character vector; see dt_format_to_regex.

    # return value: d, but with a "datetime" column containing POSIXct datetimes
    #   and without the input datetime columns
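Editor's note (not part of the commit): a hypothetical call to resolve_datetime, assuming it behaves as documented above. The tibble, column names, and values are invented for illustration only.

d <- tibble::tibble(site = 'w1',
                    date = c('2020-06-01', '2020-06-02'),
                    time = c('12:30:00', '13:45'),
                    val = c(1.2, 3.4))

d <- resolve_datetime(d = d,
                      datetime_colnames = c('date', 'time'),
                      datetime_formats = c('%Y-%m-%d', '%H:%M:%S'),
                      datetime_tz = 'UTC',
                      optional = ':')
# d should now carry a POSIXct "datetime" column, with the original
# date and time columns dropped.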
@@ -1088,6 +1098,22 @@ resolve_datetime <- function(d,
                                 colnames(dt_tb)),
                    datetime_formats_split)

+   if('H' %in% colnames(dt_tb)){
+       dt_tb$H[dt_tb$H == ''] <- '00'
+   }
+   if('M' %in% colnames(dt_tb)){
+       dt_tb$M[dt_tb$M == ''] <- '00'
+   }
+   if('S' %in% colnames(dt_tb)){
+       dt_tb$S[dt_tb$S == ''] <- '00'
+   }
+   if('I' %in% colnames(dt_tb)){
+       dt_tb$I[dt_tb$I == ''] <- '00'
+   }
+   if('P' %in% colnames(dt_tb)){
+       dt_tb$P[dt_tb$P == ''] <- 'AM'
+   }
+
    dt_tb <- dt_tb %>%
        tidyr::unite(col = 'datetime',
                     everything(),
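Editor's note (not part of the commit): the five if-blocks above all apply the same pattern, defaulting an empty time component so the united datetime string still parses. A compact equivalent, shown only to clarify the intent (the commit's explicit blocks behave the same way):

time_defaults <- c(H = '00', M = '00', S = '00', I = '00', P = 'AM')
for(component in intersect(names(time_defaults), colnames(dt_tb))){
    blank <- dt_tb[[component]] == ''
    dt_tb[[component]][blank] <- time_defaults[[component]]
}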
@@ -1112,15 +1138,15 @@ dt_format_to_regex <- function(fmt, optional){
    # fmt is a character vector of datetime formatting strings, such as
    #   '%A, %Y-%m-%d %I:%M:%S %p' or '%j'. each element of fmt that is a
    #   datetime token is replaced with a regex string that matches
-   #   the what the token represents. For example, '%Y' matches a 4-digit
+   #   what the token represents. For example, '%Y' matches a 4-digit
    #   year and '[0-9]{4}' matches a 4-digit numeric sequence. non-token
    #   characters (anything not following a %) are not modified. Note that
    #   tokens B, b, h, A, and a are replaced by '[a-zA-Z]+', which matches
    #   any sequence of one or more alphabetic characters of either case,
    #   not just meaningful month/day names 'Weds' or 'january'. Also note
    #   that these tokens are not currently accepted: g, G, n, t, c, r, R, T.
-   # optional is a character vector of characters that should be made
-   #   optional in the exported regex (succeeded by a ? ). This is useful if
+   # optional is a vector of characters that should be made
+   #   optional in the exported regex (followed by a '?'). This is useful if
    #   e.g. fmt is '%H:%M:%S' and elements to be matched may either appear in
    #   HH:MM:SS or HH:MM format. making the ":" character optional here
    #   (via optional = ':') allows the hour and minute data to be retained,
@@ -1144,10 +1170,10 @@ dt_format_to_regex <- function(fmt, optional){
        W = '([0-9]{2})?',
        V = '([0-9]{2})?',
        C = '([0-9]{2})?',
-       H = '([0-9]{2})?',
-       I = '([0-9]{2})?',
-       M = '([0-9]{2})?',
-       S = '([0-9]{2})?',
+       H = '([0-9]{1,2})?',
+       I = '([0-9]{1,2})?',
+       M = '([0-9]{1,2})?',
+       S = '([0-9]{1,2})?',
        p = '([AP]M)?',
        z = '([+\\-][0-9]{4})?',
        `F` = '([0-9]{4}-[0-9]{2}-[0-9]{2})')
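Editor's note (not part of the commit): the switch from {2} to {1,2} lets single-digit hour, minute, and second fields through. A quick check in base R:

grepl('^([0-9]{2}):([0-9]{2})$', '7:05')     # FALSE -- exactly two digits required
grepl('^([0-9]{1,2}):([0-9]{1,2})$', '7:05') # TRUE  -- one or two digits accepted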
@@ -2861,6 +2887,7 @@ delineate_watershed_apriori <- function(lat, long, crs,
                        filename = dem_f,
                        overwrite = TRUE)

+   # loses projection
    sf::st_write(obj = site,
                 dsn = point_f,
                 delete_layer = TRUE,
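Editor's note (not part of the commit): the "loses projection" comment refers to the point's CRS not surviving the write/read round trip. One way to guard against that (an assumption about the workflow, not something this commit does) is to re-assert the CRS after reading the file back, using objects that already exist in the surrounding code:

site_reread <- sf::st_read(point_f)                           # CRS may come back empty
site_reread <- sf::st_set_crs(site_reread, sf::st_crs(site))  # re-assign the original CRS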
@@ -3061,6 +3088,7 @@ delineate_watershed_by_specification <- function(lat,
                        filename = dem_f,
                        overwrite = TRUE)

+   # loses projection
    sf::st_write(obj = site,
                 dsn = point_f,
                 delete_layer = TRUE,
@@ -4748,14 +4776,24 @@ choose_projection <- function(lat = NULL,

    abslat <- abs(lat)

+   # THIS WORKS (PROJECTS STUFF), BUT CAN'T BE READ AUTOMATICALLY BY st_read
    if(abslat < 23){ # tropical
        PROJ4 = glue('+proj=laea +lon_0=', long)
        # ' +datum=WGS84 +units=m +no_defs')
    } else { # temperate or polar
        PROJ4 = glue('+proj=laea +lat_0=', lat, ' +lon_0=', long)
    }
-   # ' +datum=WGS84 +units=m +no_defs')

+   # # UTM/UPS would be nice for watersheds that don't fall on more than two zones
+   # # (incomplete)
+   # if(lat > 84 || lat < -80){ #polar; use Universal Polar Stereographic (UPS)
+   #     PROJ4 <- glue('+proj=ups +lon_0=', long)
+   #     # ' +datum=WGS84 +units=m +no_defs')
+   # } else { #not polar; use UTM
+   #     PROJ4 <- glue('+proj=utm +lat_0=', lat, ' +lon_0=', long)
+   # }
+
+   # # EXTRA CODE FOR CHOOSING PROJECTION BY LATITUDE ONLY
    # if(abslat < 23){ #tropical
    #     PROJ4 <- 9835 #Lambert cylindrical equal area (ellipsoidal; should spherical 9834 be used instead?)
    # } else if(abslat > 23 && abslat < 66){ # middle latitudes
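Editor's note (not part of the commit): what the uncommented branch above produces, assuming the glue package is attached as elsewhere in this file. The coordinates are made up for illustration.

lat <- 44.2
long <- -122.2
abslat <- abs(lat)

if(abslat < 23){ # tropical
    PROJ4 <- glue('+proj=laea +lon_0=', long)
} else { # temperate or polar
    PROJ4 <- glue('+proj=laea +lat_0=', lat, ' +lon_0=', long)
}
PROJ4
# "+proj=laea +lat_0=44.2 +lon_0=-122.2"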
@@ -8454,24 +8492,155 @@ pull_usgs_discharge <- function(network, domain, prodname_ms, sites, time_step)
    return()
}

+ log_with_indent <- function(msg, logger, level = 'info', indent = 1){
+
+     # level is one of "info", "warn", "error".
+     # indent: the number of spaces to indent after the colon.
+
+     indent_str <- rep('\U2800\U2800', indent)
+
+     if(level == 'info'){
+         loginfo(msg = paste0(enc2native(indent_str),
+                              msg),
+                 logger = logger)
+     } else if(level == 'warn'){
+         logwarn(msg = paste0(enc2native(indent_str),
+                              msg),
+                 logger = logger)
+     } else if(level == 'error'){
+         logerror(msg = paste0(enc2native(indent_str),
+                               msg),
+                  logger = logger)
+     }
+ }
+
generate_portal_extras <- function(site_data,
-                                   network_domain){
+                                   network_domain,
+                                   thin_portal_data_to_interval = NA){
+
+     # thin_portal_data_to_interval: passed to the "unit" parameter of lubridate::round_date.
+     #   set to NA (the default) to prevent thinning.

    # for post-derive steps that save the portal some processing.

-     loginfo(msg = 'Generating portal extras',
+     loginfo(msg = 'Generating portal extras:',
            logger = logger_module)

-     loginfo('scaling flux by area', logger = logger_module)
+     log_with_indent('scaling flux by area', logger = logger_module)
    calculate_flux_by_area(site_data = site_data)
-     loginfo('writing config datasets to local dir', logger = logger_module)
+
+     log_with_indent('writing config datasets to local dir', logger = logger_module)
    write_portal_config_datasets()
-     loginfo('cataloguing held data', logger = logger_module)
-     catalogue_held_data(site_data = site_data,
-                         network_domain = network_domain)
-     loginfo('combining watershed boundaries', logger = logger_module)
+
+     log_with_indent('cataloging held data', logger = logger_module)
+     catalog_held_data(site_data = site_data,
+                       network_domain = network_domain)
+
+     log_with_indent('combining watershed boundaries', logger = logger_module)
    combine_ws_boundaries()
+
+     log_with_indent('determining which domains have Q', logger = logger_module)
    list_domains_with_discharge(site_data = site_data)
+
+     if(!is.na(thin_portal_data_to_interval)){
+         log_with_indent('thinning portal datasets to 1 day',
+                         logger = logger_module)
+         thin_portal_data(network_domain = network_domain,
+                          thin_interval = thin_portal_data_to_interval)
+     }
+ }
+
+ thin_portal_data <- function(network_domain, thin_interval){
+
+     # thin_interval: passed to the "unit" parameter of lubridate::round_date
+
+     domains <- network_domain$domain
+
+     n_domains <- length(domains)
+     for(i in 1:n_domains){
+
+         dmn <- domains[i]
+
+         log_with_indent(msg = glue('{d}: ({ii}/{n})',
+                                    d = dmn,
+                                    ii = i,
+                                    n = n_domains),
+                         logger = logger_module,
+                         indent = 2)
+
+         prod_dirs <- try(
+             {
+                 list.files(path = glue('../portal/data/{d}/',
+                                        d = dmn),
+                            full.names = FALSE,
+                            recursive = FALSE)
+             },
+             silent = TRUE
+         )
+
+         if(length(prod_dirs)){
+
+             # filter products that never need to be thinned. keep the ones that might
+             rgx <- paste0('(^precipitation|^precip_chemistry|^discharge',
+                           '|^precip_flux|^stream_chemistry|^stream_flux)')
+             prod_dirs <- grep(pattern = rgx,
+                               x = prod_dirs,
+                               value = TRUE)
+         }
+
+         for(prd in prod_dirs){
+
+             site_files <- list.files(path = glue('../portal/data/{d}/{p}',
+                                                  d = dmn,
+                                                  p = prd),
+                                      full.names = TRUE,
+                                      recursive = FALSE)
+
+             if(prd == 'precipitation'){
+                 agg_call <- quote(sum(val, na.rm = TRUE))
+             } else {
+                 agg_call <- quote(mean(val, na.rm = TRUE))
+             }
+
+             for(stf in site_files){
+
+                 # check whether this file needs to be thinned
+                 dtcol <- read_feather(stf, columns = 'datetime')
+                 interval_min <- Mode(diff(as.numeric(dtcol$datetime)) / 60)
+                 needs_thin <- !is.na(interval_min) && interval_min <= 24 * 60
+
+                 if(needs_thin){
+
+                     d <- read_feather(stf) %>%
+                         mutate(
+                             datetime = lubridate::round_date(
+                                 x = datetime,
+                                 unit = thin_interval),
+                             val = errors::set_errors(val, val_err)) %>%
+                         select(-val_err)
+
+                     if(length(unique(d$site_name)) > 1){
+                         stop(paste('Multiple site_names in', stf))
+                     }
+
+                     d %>%
+                         group_by(datetime, var) %>%
+                         summarize(
+                             site_name = first(site_name),
+                             val = eval(agg_call),
+                             ms_status = numeric_any(ms_status),
+                             ms_interp = numeric_any(ms_interp)) %>%
+                         ungroup() %>%
+                         mutate(val_err = errors(val),
+                                val_err = ifelse(is.na(val_err), 0, val_err),
+                                val = errors::drop_errors(val)) %>%
+                         select(datetime, site_name, var, val, ms_status, ms_interp,
+                                val_err) %>%
+                         write_feather(stf)
+                 }
+             }
+         }
+     }
}

list_domains_with_discharge <- function(site_data){
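Editor's note (not part of the commit): a minimal sketch of the thinning step on a toy tibble, assuming dplyr and lubridate are attached as in the rest of the file. Error propagation via the errors package and the feather I/O are omitted, and all values and column contents are invented.

toy <- dplyr::tibble(datetime = as.POSIXct(c('2020-06-01 00:15:00',
                                             '2020-06-01 09:45:00',
                                             '2020-06-02 03:00:00'),
                                           tz = 'UTC'),
                     site_name = 'w1',
                     var = 'discharge',
                     val = c(10, 14, 9))

toy %>%
    mutate(datetime = lubridate::round_date(datetime, unit = '1 day')) %>%
    group_by(datetime, var) %>%
    summarize(site_name = dplyr::first(site_name),
              val = mean(val, na.rm = TRUE),  # sum() would be used for precipitation
              .groups = 'drop')
# one aggregated value per day per variable, which is what thin_portal_data writes back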
@@ -8868,7 +9037,7 @@ munge_versionless_product <- function(network,
    }
}

- catalogue_held_data <- function(network_domain, site_data){
+ catalog_held_data <- function(network_domain, site_data){

    # tabulates:
    # + total nonspatial observations for the portal landing page
@@ -9036,11 +9205,14 @@ catalogue_held_data <- function(network_domain, site_data){
        }
    }

-   # setwd('../portal/data/')
+   readr::write_file(x = as.character(nobs_nonspatial),
+                     file = '../portal/data/general/total_nonspatial_observations.txt')
+
    dir.create('../portal/data/general/catalog_files',
               showWarnings = FALSE)

    # generate and write file describing all variables
+
    all_variable_display <- all_variable_breakdown %>%
        group_by(VariableCode) %>%
        summarize(
@@ -9115,6 +9287,7 @@ catalogue_held_data <- function(network_domain, site_data){
    # generate and write file describing all sites
    # TODO: make sure to include a note about datum on display page
    #   also, include url column somehow
+
    all_site_display <- all_variable_breakdown %>%
        group_by(network, domain, site_name) %>%
        summarize(
@@ -9150,6 +9323,7 @@ catalogue_held_data <- function(network_domain, site_data){
              file = '../portal/data/general/catalog_files/all_sites.csv')

    # generate and write individual file for each site, describing it by variable
+
    dir.create('../portal/data/general/catalog_files/indiv_sites',
               showWarnings = FALSE)

@@ -9194,7 +9368,6 @@ catalogue_held_data <- function(network_domain, site_data){
                                     s = sit))
    }

-
    # in case somebody asks for this stuff again:

    # #domains per network