Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix date issue by switching to converting UV data using location-based timezones #73

Merged
merged 9 commits into from
Sep 26, 2022
6 changes: 5 additions & 1 deletion 0_historic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ targets:
'0_historic/src/do_historic_gw_fetch.R',
'0_historic/src/fetch_nwis_historic.R')
# Task table for `uv` includes a step to average the data to daily values.
# This step needs the timezones for all the sites.
historic_gw_sites_uv_tz:
command: fetch_gw_site_tz(historic_gw_sites_uv)
0_historic/out/historic_gw_data_uv.csv:
command: do_historic_gw_fetch(
final_target = target_name,
Expand All @@ -55,7 +58,8 @@ targets:
service_cd = I('uv'),
request_limit = historic_uv_fetch_size_limit,
'0_historic/src/do_historic_gw_fetch.R',
'0_historic/src/fetch_nwis_historic.R')
'0_historic/src/fetch_nwis_historic.R',
gw_site_tz_xwalk_nm = I('historic_gw_sites_uv_tz'))

# Special pull for just KS & just FL using `62610`. Plus, HI using `72150`.
# Read all about why on GitHub: https://github.com/USGS-VIZLAB/gw-conditions/issues/9
Expand Down
16 changes: 14 additions & 2 deletions 0_historic/src/do_historic_addl_param_fetch.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,17 @@ do_historic_addl_param_fetch <- function(final_target, addl_states, addl_gw_para
steps[["identify_states"]]$target_name, task_name)
}
)

# Need local timezones for averaging to daily values
create_addl_sites_tz_xwalk <- create_task_step(
step_name = 'sites_tz_xwalk',
target_name = function(task_name, ...) {
sprintf('addl_sites_tz_xwalk_%s', task_name)
},
command = function(..., task_name, steps) {
sprintf("fetch_gw_site_tz(%s)", steps[['inventory_sites']]$target_name)
}
)

download_addl_data <- create_task_step(
step_name = 'download_addl_data',
Expand All @@ -51,14 +62,15 @@ do_historic_addl_param_fetch <- function(final_target, addl_states, addl_gw_para
"request_limit = historic_uv_fetch_size_limit,",
"'0_historic/src/do_historic_gw_fetch.R',",
"'0_historic/src/fetch_nwis_historic.R',",
"include_ymls = I('%s'))" = task_makefile)
"include_ymls = I('%s')," = task_makefile,
"gw_site_tz_xwalk_nm = I('%s'))" = steps[['sites_tz_xwalk']]$target_name)
}
)

# Create the task plan
task_plan <- create_task_plan(
task_names = tasks,
task_steps = list(identify_states, inventory_sites, download_addl_data),
task_steps = list(identify_states, inventory_sites, create_addl_sites_tz_xwalk, download_addl_data),
final_steps = "download_addl_data",
add_complete = FALSE)

Expand Down
7 changes: 5 additions & 2 deletions 0_historic/src/do_historic_gw_fetch.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
do_historic_gw_fetch <- function(final_target, task_makefile, gw_site_nums, gw_site_nums_obj_nm, param_cd, service_cd, request_limit, ..., include_ymls = NULL) {
do_historic_gw_fetch <- function(final_target, task_makefile, gw_site_nums,
gw_site_nums_obj_nm, param_cd, service_cd, request_limit, ...,
include_ymls = NULL, gw_site_tz_xwalk_nm = NULL) {

# Number indicating how many sites to include per dataRetrieval request to prevent
# errors from requesting too much at once. More relevant for surface water requests.
Expand Down Expand Up @@ -63,7 +65,8 @@ do_historic_gw_fetch <- function(final_target, task_makefile, gw_site_nums, gw_s
command = function(..., task_name, steps) {
psprintf("convert_uv_to_dv(",
"target_name = target_name,",
"gw_uv_data_fn = '%s')" = steps[["download_data"]]$target_name)
"gw_uv_data_fn = '%s'," = steps[["download_data"]]$target_name,
"site_tz_xwalk = %s)" = gw_site_tz_xwalk_nm)
}
)
task_steps <- c(task_steps, list(average_data))
Expand Down
108 changes: 105 additions & 3 deletions 0_historic/src/fetch_nwis_historic.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,17 @@ pull_sites_by_service <- function(site_df, service) {
site_df %>% filter(data_type_cd == service) %>% pull(site_no)
}

# Look up the local timezone code for each NWIS site.
#
# @param sites Character vector of NWIS site numbers.
# @return Data frame with one row per site and the columns `site_no`
#   and `tz_cd` (the site's timezone code as reported by NWIS).
fetch_gw_site_tz <- function(sites) {
  site_info <- readNWISsite(sites)
  select(site_info, site_no, tz_cd)
}

# Fetch NWIS site metadata for every site present in a previously
# downloaded groundwater data file.
#
# @param data_fn Path to a CSV containing at least a `site_no` column
#   (read with col_types = 'cDn': character, Date, numeric).
# @return Data frame of site info with columns `site_no`, `station_nm`,
#   `state_cd`, `dec_lat_va`, `dec_long_va`, and `tz_cd`.
fetch_gw_site_info <- function(data_fn) {
  gw_data <- read_csv(data_fn, col_types = 'cDn')
  unique_sites <- unique(gw_data$site_no)
  site_info <- readNWISsite(unique_sites)
  select(site_info, site_no, station_nm, state_cd, dec_lat_va, dec_long_va, tz_cd)
}

fetch_addl_uv_sites <- function(addl_states, param_cd, start_date, end_date) {
Expand Down Expand Up @@ -93,14 +98,111 @@ fetch_gw_historic_uv <- function(target_name, gw_sites, start_date, end_date, pa
write_feather(target_name)
}

convert_uv_to_dv <- function(target_name, gw_uv_data_fn) {
# Average instantaneous (uv) groundwater levels to daily values, assigning
# each timestamp to a calendar date in the site's *local* timezone rather
# than UTC so that readings near midnight land on the correct day.
#
# @param target_name Output feather filename.
# @param gw_uv_data_fn Feather file of instantaneous data with columns
#   `site_no`, `dateTime` (POSIXct), `GWL_inst`, and a `tz_cd` column as
#   downloaded by dataRetrieval (replaced here by the xwalk's value).
# @param site_tz_xwalk Data frame with columns `site_no` and `tz_cd`
#   giving each site's local timezone code.
# @return Result of write_feather(); called for its side effect of
#   writing the daily-value feather file.
convert_uv_to_dv <- function(target_name, gw_uv_data_fn, site_tz_xwalk) {
  read_feather(gw_uv_data_fn) %>%

    ### Convert to date using local timezone

    # Remove the tz_cd column that is downloaded with the data so that the
    # join below is the single source of timezone truth
    select(-tz_cd) %>%
    # Join in the local timezone information for each site
    left_join(site_tz_xwalk, by = "site_no") %>%
    # Change the dateTime column to just the date. Need to use the timezone
    # rather than UTC so that days are treated appropriately. Seems overly
    # complicated but you can't use `format()` with more than one tz, so
    # this vectorizes that call per timezone. Not using rowwise() because
    # that would be slow. Doing a single call per timezone speeds this up.
    group_by(tz_cd) %>%
    nest() %>%
    pmap(function(tz_cd, data) {
      data %>% mutate(Date = POSIXct_to_Date_tz(dateTime, tz_cd))
    }) %>%
    bind_rows() %>%

    ### Reduce each instantaneous value to a single average for each date
    group_by(site_no, Date) %>%
    summarize(GWL = mean(GWL_inst, na.rm = TRUE), .groups = "keep") %>%
    write_feather(target_name)
}

# Convert POSIXct to Dates for a given timezone
# Only works with one tz_abbr at a time
# Convert POSIXct timestamps to Dates for a given timezone.
# Only works with one tz_abbr at a time (callers group data by timezone
# and invoke this once per group).
#
# @param posix_dates POSIXct vector of timestamps.
# @param tz_abbr NWIS timezone abbreviation for the site, e.g. "EST" or
#   "PDT". Unrecognized codes are passed through to `format()` unchanged.
# @return Date vector: the calendar date of each timestamp expressed in
#   the site's timezone, adjusted back to the standard/daylight variant
#   named by `tz_abbr` (see adjust_for_daylight_savings()).
POSIXct_to_Date_tz <- function(posix_dates, tz_abbr) {
  # The "AST" for "Atlantic Standard Time" is not recognized by `format()`
  # According to https://www.r-bloggers.com/2018/07/a-tour-of-timezones-troubles-in-r/
  # we should be using location-based timezones to properly handle daylight savings time
  # Not going to worry about the Indiana and Phoenix nuances for now.
  tz_abbr_adj <- switch(
    tz_abbr,
    "AST" = "America/Virgin",
    "EST" = "America/New_York",
    "EDT" = "America/New_York",
    "CST" = "America/Chicago",
    "CDT" = "America/Chicago",
    "MST" = "America/Denver",
    "MDT" = "America/Denver",
    "PST" = "America/Los_Angeles",
    "PDT" = "America/Los_Angeles",
    "AKST" = "America/Juneau",
    "AKDT" = "America/Juneau",
    "HST" = "US/Hawaii",
    "HDT" = "US/Hawaii",
    tz_abbr) # default: pass unrecognized codes straight through

  # Needs to retain POSIXct class and timestamp for extracting tz with '%Z' next
  format(posix_dates, "%Y-%m-%d %H:%M:%S", tz=tz_abbr_adj, usetz=TRUE) %>%
    as.POSIXct(tz=tz_abbr_adj) %>%
    # For some reason, timezones above will only return daylight time,
    # though it might have to do with whether your computer is in
    # daylight or standard time at the moment you run the conversion
    # code. I believe that the function below will appropriately account
    # for that because it will test ST vs DT and do the appropriate switch.
    adjust_for_daylight_savings(tz_desired = tz_abbr) %>%
    # Drop times and timezone before converting to a plain date or it will
    # adjust using your local timezone.
    format('%Y-%m-%d') %>% as.Date()

}

# Note that the output from this fxn will say 'PDT' but mean 'PST'
# because you can't have a timezone of 'PST' (it will convert to GMT,
# even when using `lubridate::force_tz(., 'PST')`). This is used
# internally before dropping time and going to a day, so I am
# accepting the risk.
# Shift POSIXct values between standard time (ST) and daylight time (DT)
# so that their clock time matches the variant named by `tz_desired`.
#
# Note that the output from this fxn will say 'PDT' but mean 'PST'
# because you can't have a timezone of 'PST' (it will convert to GMT,
# even when using `lubridate::force_tz(., 'PST')`). This is used
# internally before dropping time and going to a day, so I am
# accepting the risk.
#
# @param posix_dates POSIXct vector whose current tz label (via '%Z')
#   identifies whether each value is in ST or DT.
# @param tz_desired Timezone abbreviation whose last two characters are
#   "ST" or "DT", e.g. "EST" or "PDT".
# @return POSIXct vector shifted by +/- 3600 seconds (or 0) per element.
adjust_for_daylight_savings <- function(posix_dates, tz_desired) {

  # To go from daylight time (DT) to standard time
  # (ST), subtract an hour and vice versa. If the
  # `from` and `to` values are the same, don't
  # change anything about the dates.
  tz_conversion_xwalk <- tibble(
    from = c('DT', 'ST', 'ST', 'DT'),
    to = c('ST', 'DT', 'ST', 'DT'),
    conversion_sec = c(-3600, 3600, 0, 0)
  )

  # There could be more than one timezone if the date range spans across
  # the standard to daylight savings switch. Thus, we should be able to
  # convert each date independently (which happens in this piped sequence)
  tibble(
    in_dates = posix_dates,
    in_tz = format(posix_dates, "%Z")
  ) %>%
    mutate(
      # Use the last two characters in both the current and desired
      # timezones for matching with the conversion xwalk
      from = stringr::str_sub(in_tz, -2, -1),
      to = stringr::str_sub(tz_desired, -2, -1)
    ) %>%
    # Join in conversion xwalk; `by` is explicit so the join cannot
    # silently pick up extra columns and doesn't emit a "Joining, by ="
    # message on every call
    left_join(tz_conversion_xwalk, by = c("from", "to")) %>%
    # Alter the date values to match the desired timezone.
    mutate(out_dates = in_dates + conversion_sec) %>%
    # Pull out just the dates to return
    pull(out_dates)

}

combine_gw_fetches <- function(target_name, dv_fn, uv_fn, uv_addl_fn) {
read_csv(dv_fn, col_types = 'cDn') %>%
bind_rows(read_csv(uv_fn, col_types = 'cDn')) %>%
Expand Down
6 changes: 5 additions & 1 deletion 1_fetch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ targets:
- viz_end_date

# Task table for `uv` includes a step to average the data to daily values.
# This step needs the timezones for all the sites.
gw_quantile_site_tz_xwalk:
command: select(gw_quantile_site_info, I('site_no'), I('tz_cd'))
1_fetch/out/gw_data_uv.csv:
command: do_gw_fetch(
final_target = target_name,
Expand All @@ -81,7 +84,8 @@ targets:
service_cd = I('uv'),
request_limit = uv_fetch_size_limit,
'1_fetch/src/do_gw_fetch.R',
'1_fetch/src/fetch_nwis.R')
'1_fetch/src/fetch_nwis.R',
gw_site_tz_xwalk_nm = I('gw_quantile_site_tz_xwalk'))
depends:
- viz_start_date
- viz_end_date
Expand Down
3 changes: 2 additions & 1 deletion 1_fetch/src/do_addl_gw_fetch.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ do_addl_gw_fetch <- function(final_target, addl_param_cds) {
"request_limit = uv_fetch_size_limit,",
"'1_fetch/src/do_gw_fetch.R',",
"'1_fetch/src/fetch_nwis.R',",
"include_ymls = I('%s'))" = task_makefile)
"include_ymls = I('%s')," = task_makefile,
"gw_site_tz_xwalk_nm = I('gw_quantile_site_tz_xwalk'))")
}
)

Expand Down
6 changes: 4 additions & 2 deletions 1_fetch/src/do_gw_fetch.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
do_gw_fetch <- function(final_target, task_makefile, gw_site_nums, gw_site_nums_obj_nm,
param_cd, service_cd, request_limit, ..., include_ymls = NULL) {
param_cd, service_cd, request_limit, ..., include_ymls = NULL,
gw_site_tz_xwalk_nm = NULL) {

# Number indicating how many sites to include per dataRetrieval request to prevent
# errors from requesting too much at once. More relevant for surface water requests.
Expand Down Expand Up @@ -57,7 +58,8 @@ do_gw_fetch <- function(final_target, task_makefile, gw_site_nums, gw_site_nums_
command = function(..., task_name, steps) {
psprintf("convert_uv_to_dv(",
"target_name = target_name,",
"gw_uv_data_fn = '%s')" = steps[["download_data"]]$target_name)
"gw_uv_data_fn = '%s'," = steps[["download_data"]]$target_name,
"site_tz_xwalk = %s)" = gw_site_tz_xwalk_nm)
}
)
task_steps <- c(task_steps, list(average_data))
Expand Down
100 changes: 98 additions & 2 deletions 1_fetch/src/fetch_nwis.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,110 @@ fetch_gw_uv <- function(target_name, gw_sites, start_date, end_date, param_cd) {
write_feather(target_name)
}

# Average instantaneous (uv) groundwater levels to daily values, assigning
# each timestamp to a calendar date in the site's *local* timezone rather
# than UTC so that readings near midnight land on the correct day.
#
# NOTE(review): this function is duplicated in
# 0_historic/src/fetch_nwis_historic.R; see issue #74 about consolidating.
#
# @param target_name Output feather filename.
# @param gw_uv_data_fn Feather file of instantaneous data with columns
#   `site_no`, `dateTime` (POSIXct), `GWL_inst`, and a `tz_cd` column as
#   downloaded by dataRetrieval (replaced here by the xwalk's value).
# @param site_tz_xwalk Data frame with columns `site_no` and `tz_cd`
#   giving each site's local timezone code.
# @return Result of write_feather(); called for its side effect of
#   writing the daily-value feather file.
convert_uv_to_dv <- function(target_name, gw_uv_data_fn, site_tz_xwalk) {
  read_feather(gw_uv_data_fn) %>%

    ### Convert to date using local timezone

    # Remove the tz_cd column that is downloaded with the data so that the
    # join below is the single source of timezone truth
    select(-tz_cd) %>%
    # Join in the local timezone information for each site
    left_join(site_tz_xwalk, by = "site_no") %>%
    # Change the dateTime column to just the date. Need to use the timezone
    # rather than UTC so that days are treated appropriately. Seems overly
    # complicated but you can't use `format()` with more than one tz, so
    # this vectorizes that call per timezone. Not using rowwise() because
    # that would be slow. Doing a single call per timezone speeds this up.
    group_by(tz_cd) %>%
    nest() %>%
    pmap(function(tz_cd, data) {
      data %>% mutate(Date = POSIXct_to_Date_tz(dateTime, tz_cd))
    }) %>%
    bind_rows() %>%

    ### Reduce each instantaneous value to a single average for each date
    group_by(site_no, Date) %>%
    # `.groups = "keep"` matches the historic version of this function and
    # silences dplyr's regrouping message
    summarize(GWL = mean(GWL_inst, na.rm = TRUE), .groups = "keep") %>%
    write_feather(target_name)
}

# Convert POSIXct to Dates for a given timezone
# Only works with one tz_abbr at a time
# Convert POSIXct timestamps to Dates for a given timezone.
# Only works with one tz_abbr at a time (callers group data by timezone
# and invoke this once per group).
#
# NOTE(review): duplicated in 0_historic/src/fetch_nwis_historic.R; see
# issue #74 about consolidating.
#
# @param posix_dates POSIXct vector of timestamps.
# @param tz_abbr NWIS timezone abbreviation for the site, e.g. "EST" or
#   "PDT". Unrecognized codes are passed through to `format()` unchanged.
# @return Date vector: the calendar date of each timestamp expressed in
#   the site's timezone, adjusted back to the standard/daylight variant
#   named by `tz_abbr` (see adjust_for_daylight_savings()).
POSIXct_to_Date_tz <- function(posix_dates, tz_abbr) {
  # The "AST" for "Atlantic Standard Time" is not recognized by `format()`
  # According to https://www.r-bloggers.com/2018/07/a-tour-of-timezones-troubles-in-r/
  # we should be using location-based timezones to properly handle daylight savings time
  # Not going to worry about the Indiana and Phoenix nuances for now.
  tz_abbr_adj <- switch(
    tz_abbr,
    "AST" = "America/Virgin",
    "EST" = "America/New_York",
    "EDT" = "America/New_York",
    "CST" = "America/Chicago",
    "CDT" = "America/Chicago",
    "MST" = "America/Denver",
    "MDT" = "America/Denver",
    "PST" = "America/Los_Angeles",
    "PDT" = "America/Los_Angeles",
    "AKST" = "America/Juneau",
    "AKDT" = "America/Juneau",
    "HST" = "US/Hawaii",
    "HDT" = "US/Hawaii",
    tz_abbr) # default: pass unrecognized codes straight through

  # Needs to retain POSIXct class and timestamp for extracting tz with '%Z' next
  format(posix_dates, "%Y-%m-%d %H:%M:%S", tz=tz_abbr_adj, usetz=TRUE) %>%
    as.POSIXct(tz=tz_abbr_adj) %>%
    # For some reason, timezones above will only return daylight time,
    # though it might have to do with whether your computer is in
    # daylight or standard time at the moment you run the conversion
    # code. I believe that the function below will appropriately account
    # for that because it will test ST vs DT and do the appropriate switch.
    adjust_for_daylight_savings(tz_desired = tz_abbr) %>%
    # Drop times and timezone before converting to a plain date or it will
    # adjust using your local timezone.
    format('%Y-%m-%d') %>% as.Date()
}

# Note that the output from this fxn will say 'PDT' but mean 'PST'
# because you can't have a timezone of 'PST' (it will convert to GMT,
# even when using `lubridate::force_tz(., 'PST')`). This is used
# internally before dropping time and going to a day, so I am
# accepting the risk.
# Shift POSIXct values between standard time (ST) and daylight time (DT)
# so that their clock time matches the variant named by `tz_desired`.
#
# Note that the output from this fxn will say 'PDT' but mean 'PST'
# because you can't have a timezone of 'PST' (it will convert to GMT,
# even when using `lubridate::force_tz(., 'PST')`). This is used
# internally before dropping time and going to a day, so I am
# accepting the risk.
#
# @param posix_dates POSIXct vector whose current tz label (via '%Z')
#   identifies whether each value is in ST or DT.
# @param tz_desired Timezone abbreviation whose last two characters are
#   "ST" or "DT", e.g. "EST" or "PDT".
# @return POSIXct vector shifted by +/- 3600 seconds (or 0) per element.
adjust_for_daylight_savings <- function(posix_dates, tz_desired) {

  # To go from daylight time (DT) to standard time
  # (ST), subtract an hour and vice versa. If the
  # `from` and `to` values are the same, don't
  # change anything about the dates.
  tz_conversion_xwalk <- tibble(
    from = c('DT', 'ST', 'ST', 'DT'),
    to = c('ST', 'DT', 'ST', 'DT'),
    conversion_sec = c(-3600, 3600, 0, 0)
  )

  # There could be more than one timezone if the date range spans across
  # the standard to daylight savings switch. Thus, we should be able to
  # convert each date independently (which happens in this piped sequence)
  tibble(
    in_dates = posix_dates,
    in_tz = format(posix_dates, "%Z")
  ) %>%
    mutate(
      # Use the last two characters in both the current and desired
      # timezones for matching with the conversion xwalk
      from = stringr::str_sub(in_tz, -2, -1),
      to = stringr::str_sub(tz_desired, -2, -1)
    ) %>%
    # Join in conversion xwalk; `by` is explicit so the join cannot
    # silently pick up extra columns and doesn't emit a "Joining, by ="
    # message on every call
    left_join(tz_conversion_xwalk, by = c("from", "to")) %>%
    # Alter the date values to match the desired timezone.
    mutate(out_dates = in_dates + conversion_sec) %>%
    # Pull out just the dates to return
    pull(out_dates)

}

combine_gw_fetches <- function(target_name, dv_fn, uv_fn, uv_addl_fn) {
read_csv(dv_fn, col_types = 'cDn') %>%
bind_rows(read_csv(uv_fn, col_types = 'cDn')) %>%
Expand Down
2 changes: 1 addition & 1 deletion gw-conditions/historic_gw_data_filtered.csv.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: 991b0a39bf0916d45c3e883dba0ab760
hash: 2975ba5bc35acc29f9a51d463594190c

2 changes: 1 addition & 1 deletion gw-conditions/historic_gw_data_unfiltered.csv.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: 5f41027b53594a6cff5007cfe775811a
hash: eda380bbd7f3f384c3b1719035bd80f9

2 changes: 1 addition & 1 deletion gw-conditions/historic_gw_quantiles.csv.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: 159c709725092b140fe0b8f3eb37d845
hash: bc49e934336aa5a31262e97675b3e8d5

2 changes: 1 addition & 1 deletion gw-conditions/historic_gw_site_info_filtered.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: 48114fcc5818e5c82ba45c06f0174308
hash: 2ee7ab3937551ebd1cfc8f5e45d2e54e

2 changes: 1 addition & 1 deletion gw-conditions/historic_gw_site_info_unfiltered.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: aab93b170f7889cbcf92f201b4b47165
hash: 674be56e47bc7020e9cbbb62f161a54f