2_observations.yml

target_default: 2_observations

packages:
  - sf
  - tidyverse
  - lwgeom
  - smoothr
  - igraph
  - purrr
  - dataRetrieval
  - readr
  - readxl
  - lubridate
  - purrr

sources:
  - 2_observations/src/crosswalk_functions.R
  - 2_observations/src/data_munge_functions.R
  - 2_observations/src/subset_closest.R
  - 2_observations/src/pull_ngwos_dat.R
  - 2_observations/src/munge_flow_dat.R
  - 2_observations/src/munge_diversion_data.R
  - 2_observations/src/munge_surface_area_data.R
  - 2_observations/src/flag_seg_matches.R

targets:
  2_observations:
    depends:
     - 2_observations/out/drb_temp_summary.csv
     - 2_observations/out/drb_flow_summary.csv
     - 2_observations/out/realsat_monthly_surface_area.csv.ind
     - 2_observations/out/interpolated_daily_reservoir_water_budget_components.csv.ind
     - 2_observations/out/reservoir_releases_by_type_drb.csv.ind
     - 2_observations/out/reservoir_diversions.csv.ind

  2_observations/out/NGWOS_sites_data.rds.ind:
    command: retrieve_ngwos(
      out_ind = target_name,
      sites_ind = '2_observations/in/NGWOS_site_metadata.xlsx.ind')

  # identify sites that we feel confident calling "NGWOS" sites for experiment purposes
  ngwos_sites:
    command: filter_ngwos(ngwos_ind = '2_observations/out/NGWOS_sites_data.rds.ind')

  holdout_water_years:
    command: c(I(1980:1984), I(2011:2015), I(2021))

  #seg_id_nats; Birch Run is a Christina basin headwater; Willowemoc is a northern site upstream of all reservoirs
  holdout_reach_ids:
    command: c('Willowemoc Creek' = 1455,
                'Callicoon' = 1578,
                 'Beltzville' = 1703,
                 'Beltzville' = 1697,
                 'Birch Run' = 2007,
                 'Trenton' = 1498,
                 'Schuylkill @PHL' = 2338,
                 'Maurice River' = 3570)

  # temperature data and sites info comes from the national NWIS temperature pull: https://github.com/USGS-R/2wp-temp-observations
  2_observations/out/crosswalk_site_reach.rds.ind:
    command: crosswalk_sites_to_reaches(
      out_ind = target_name,
      temp_sites_ind = '2_observations/in/stream_sites_us.rds.ind',
      flow_sites_ind = '2_observations/in/observation_summary.rds.ind',
      boundary_ind = '1_network/out/boundary.rds.ind',
      network_ind = '1_network/out/network.rds.ind')

  # do some QA on site-to-reach matches
  # NHDPlusV2 - NHM crosswalk, download and indicate
  2_observations/out/GFv1_NHDv2_xwalk.csv.ind:
    command: fetch_xwalk(
      out_ind = target_name,
      url = I("https://raw.githubusercontent.com/USGS-R/drb-network-prep/c45f469098dfb245a2d686b807a58afa00b9d0b2/2_process/out/GFv1_NHDv2_xwalk.csv"))

  # refgages - download and indicate
  2_observations/out/usgs_nldi_gages.geojson.ind:
    command: fetch_refgages(
      out_ind = target_name,
      url = I('https://github.com/internetofwater/ref_gages/releases/download/v0.6/usgs_nldi_gages.geojson'))

  # a subset of "useable" sites in DRB, munged by:
  # 1) filtering by bird and fish distance
  2_observations/out/drb_filtered_sites.rds.ind:
    command: subset_sites(
      out_ind = target_name,
      crosswalk_ind = '2_observations/out/crosswalk_site_reach.rds.ind',
      fish_dist = I(20000),
      bird_dist = I(250))

  # flag site-to-reach matches using a number of techniques,
  # including cross-referencing refgages, independently matching
  # to NHD reaches, looking at watershed size as it compares to reported
  # NWIS value, and look for "tributary" in site name
  2_observations/out/flagged_sitetoreach_matches.rds.ind:
    command: site_to_reach_flags(
      out_ind = target_name,
      sites_ind = '2_observations/out/drb_filtered_sites.rds.ind',
      cross_ind = '2_observations/out/GFv1_NHDv2_xwalk.csv.ind',
      refgages_ind = '2_observations/out/usgs_nldi_gages.geojson.ind')

  # compare the latest flagged sites to Lauren's manual/visual
  # inspection of the originally flagged sites. Write "new flagged sites"
  # that have not been manually inspected to a file and throw a warning.
  2_observations/out/check_flagged_sites.rds.ind:
    command: check_new_flags(
      out_ind = target_name,
      manualqa_ind = '2_observations/in/drb_filtered_sites_seg_match_QC.csv.ind',
      siteqa_ind = '2_observations/out/flagged_sitetoreach_matches.rds.ind')

  # use Lauren's "manual" decision about flagged sites to
  # keep them in/out of dataset
  2_observations/out/drb_filtered_sites_qa.rds.ind:
    command: remove_bad_matches(
      out_ind = target_name,
      manualqa_ind = '2_observations/in/drb_filtered_sites_seg_match_QC.csv.ind',
      sites_ind = '2_observations/out/drb_filtered_sites.rds.ind')

  # write a summary of matched sites to track changes
  2_observations/out/site_summary.csv:
    command: summarize_sites(out_file = target_name, in_ind = '2_observations/out/drb_filtered_sites_qa.rds.ind')
  # temperature observations that have been filtered to crosswalk
  # this represent all DRB dat
  2_observations/out/all_drb_temp_obs.rds.ind:
    command: filter_temp_data_to_crosswalk(
      dat_ind = '2_observations/in/daily_temperatures_qaqc.rds.ind',
      cross_ind = '2_observations/out/crosswalk_site_reach.rds.ind',
      out_ind = target_name)

  # keep only data in the filtered sites above
  2_observations/out/obs_temp_drb_raw.rds.ind:
    command: filter_temp_data_to_sites(
      sites_ind = '2_observations/out/drb_filtered_sites_qa.rds.ind',
      dat_ind = '2_observations/out/all_drb_temp_obs.rds.ind',
      out_ind = target_name)

  # take mean of multiple obs/reach-dat to reduce duplicate vals
  2_observations/out/obs_temp_drb.rds.ind:
    command: munge_split_temp_dat(
      dat_ind = '2_observations/out/obs_temp_drb_raw.rds.ind',
      holdout_water_years = holdout_water_years,
      holdout_reach_ids = holdout_reach_ids,
      out_ind = target_name,
      prioritize_nwis_sites = TRUE)


  priority_sites:
    command: c(I(c('01427207', '01417500', '01427000', '01426500', '01436690')))

  # for priority sites, repull from NWIS, append any
  # missing dates from "other" sources
  2_observations/out/obs_temp_priority_sites.csv.ind:
    command: get_priority_data(
      out_ind = target_name,
      dummy_date = I('2021-07-12'),
      sites = priority_sites,
      pcode = I('00010'),
      statcd = I(c('00001', '00002', '00003')),
      site_meta_ind = I('2_observations/out/crosswalk_site_reach.rds.ind'),
      other_dat_ind = '2_observations/out/obs_temp_drb.rds.ind',
      holdout_water_years = holdout_water_years,
      holdout_reach_ids = holdout_reach_ids)

  # ngwos reaches that should be withheld after 2017-10-01 for NGWOS experiment
  ngwos_reaches:
    command: get_ngwos_reaches(sites_ind = '2_observations/out/drb_filtered_sites.rds.ind', ngwos_sites)

  # discharge data
  2_observations/out/obs_flow_drb.rds.ind:
    command: munge_split_flow(
      out_ind = target_name,
      dat_ind = '2_observations/in/daily_flow.rds.ind',
      sites_ind = '2_observations/out/drb_filtered_sites_qa.rds.ind',
      holdout_water_years = holdout_water_years,
      holdout_reach_ids = holdout_reach_ids)

  # for priority sites, repull from NWIS, append any
  # missing dates from "other" sources
  2_observations/out/obs_flow_priority_sites.csv.ind:
    command: get_priority_data(
      out_ind = target_name,
      dummy_date = I('2021-06-17'),
      sites = priority_sites,
      pcode = I('00060'),
      statcd = I('00003'),
      site_meta_ind = I('2_observations/out/crosswalk_site_reach.rds.ind'),
      holdout_water_years = holdout_water_years,
      holdout_reach_ids = holdout_reach_ids)
  2_observations/out/drb_temp_summary.csv:
    command: summarize_dat(in_ind = '2_observations/out/obs_temp_drb.rds.ind', out_file = target_name)

  2_observations/out/drb_flow_summary.csv:
    command: summarize_dat(in_ind = '2_observations/out/obs_flow_drb.rds.ind', out_file = target_name)

  # reservoir release data provided by NYC DEP

  2_observations/out/reservoir_releases.csv.ind:
    command: clean_release_dat(
      in_ind = '2_observations/in/PCN_Spill_Release.xlsx.ind',
      out_ind = target_name,
      mgd_to_cms = I(0.0438125))

  # modern reservoir releases that can be retrieved from NWIS.
  # These are real-time releases that started in March 2021
  2_observations/out/modern_reservoir_releases.rds.ind:
    command: get_releases(
      out_ind = target_name,
      site_ids = I(c('01436499', '01417499', '01436599')),
      reservoir_names = I(c('Cannonsville', 'Pepacton', 'Neversink')))

  # reservoir diversion data
  # you must download reservoir data from here: 'https://doimspp-my.sharepoint.com/:f:/g/personal/soliver_usgs_gov/EiawwIR59vdApLCkeyob4t0Bx86d2SoTic8ahCm6MH5wRA?e=KEk1zE'
  # unzip downloaded folder to 2_observations/in
  # such that the structure becomes
  # 2_observations/in/reservoir_data
      # WY2009 data/
      # ...
      # 01416900.WY2008.xls
      # ...
  2_observations/out/observed_daily_reservoir_water_budget_components.csv.ind:
    command: read_munge_daily_diversions(out_ind = target_name)

  2_observations/out/observed_monthly_reservoir_water_budget_components.csv.ind:
    command: read_munge_monthly_diversions(
      in_file = '2_observations/in/reservoir data/manually_extracted_diversions.csv',
      out_ind = target_name)

  # interpolate to daily values
  # use daily observed values when available, populate same diversion value
  # over month when monthly values vailable
  2_observations/out/interpolated_daily_reservoir_water_budget_components.csv.ind:
    command: interpolate_diversions_to_daily(
      out_ind = target_name,
      daily_ind = '2_observations/out/observed_daily_reservoir_water_budget_components.csv.ind',
      monthly_ind = '2_observations/out/observed_monthly_reservoir_water_budget_components.csv.ind',
      end_date = I('2020-12-31'),
      mgd_to_cms = 0.0438, cfs_to_cms = 0.0283,
      mg_to_cm = 3785.41, ft_to_m = 0.3048)

 # reservoir surface area data
 # You must download reservoir Monthly Shapefiles from here: 'http://umnlcc.cs.umn.edu/realsat/'.
 # The reservoir reaLSAT ID can be found in the Base shapefile: ReaLSAT.shp.
 # ReaLSAT.shp has no attribute information other than id --> the best way to find the reservoir
 # id is to load the shapefile in ArcMap and zoom to where your reservoir is, visually id it by shape
 # and then select the polygon to identify its reaLSAT id.
 # The reservoir-specific shapefiles with surface-area data are zipped within larger monthly shapefiles.
 # For each reaLSAT id, the corresponding monthly shapefile folder is numbered according
 # to the LAST digit of the ID, e.g. the shapefile for 573567 is in the zipped monthly shapefile folder named '7'
 # Download and unzip the monthly shapefile folder and place the reservoir-specific folder with its shapefile
 # in 2_observations/in/realsat, such that the structure becomes:
 # 2_observations/in/realsat/ID:
      # in/realsat/573567/573567.shp
      # in/realsat/573536/573536.shp
  realsat_ids:
    command: c(Pepacton = I('573567'), Cannonsville = I('573536'), Cannonsville = I('556766'))

  2_observations/out/raw_realsat_reservoir_data.csv.ind:
    command: retrieve_realsat_reservoir_data(
      reservoir_ids = realsat_ids,
      out_ind = target_name)

  2_observations/out/realsat_monthly_surface_area.csv.ind:
    command: combine_realsat_reservoir_data(
      in_ind = '2_observations/out/raw_realsat_reservoir_data.csv.ind',
      out_ind = target_name)

  # combine all sources of releases to complete total release time series
  # the manually extracted ODRM data is being used to cover the gaps between
  # these data sources
  2_observations/out/complete_reservoir_releases.csv.ind:
    command: combine_release_sources(
      out_ind = target_name,
      hist_rel_ind = '2_observations/out/reservoir_releases.csv.ind',
      usgs_rel_ind = '2_observations/out/observed_daily_reservoir_water_budget_components.csv.ind',
      modern_rel_ind = '2_observations/out/modern_reservoir_releases.rds.ind',
      manual_rel_ind = '2_observations/in/manually_extracted_odrm_data.csv.ind')

  2_observations/out/reservoir_releases_by_type_drb.csv.ind:
    command: combine_releases_by_type(
      out_ind = target_name,
      hist_rel_ind = '2_observations/out/reservoir_releases.csv.ind',
      modern_rel_ind = '2_observations/out/modern_reservoir_releases.rds.ind')

  2_observations/out/reservoir_diversions.csv.ind:
    command: extract_diversions(
      out_ind = target_name,
      hist_ind = '2_observations/out/interpolated_daily_reservoir_water_budget_components.csv.ind',
      mod_ind = '2_observations/out/modern_reservoir_releases.rds.ind')