Merge pull request #177 from mrcaseb/vegas_wpa

Add vegas_wpa, out_of_bounds, fantasy player columns
nflverse · Feb 12, 2021 · ca4544d · ca4544d
2 parents 548dfc1 + 5398ea2
commit ca4544d
Show file tree

Hide file tree

Showing 15 changed files with 185 additions and 63 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: nflfastR
 Title: Functions to Efficiently Access NFL Play by Play Data
-Version: 3.2.0.9011
+Version: 3.2.0.9012
 Authors@R: 
     c(person(given = "Sebastian",
              family = "Carl",

diff --git a/NEWS.md b/NEWS.md
@@ -15,6 +15,10 @@ required for `calculate_win_probability()`
 * Fixed a bug related to the calculation of EPA on plays before a failed pass interference challenge in a few 2019 games (#175)
 * Fixed a bug related to lots of fields with `NA` on offsetting penalties (#44)
 * Switched data source for 2001-2010 to what is used for 2011 and on
+* Added columns `vegas_wpa` and `vegas_home_wpa` which contain Win Probability Added from the spread-adjusted WPA model
+* Fix bug in `epa` when possession team changes at end of 1st or 3rd quarter (#182)
+* Added column `out_of_bounds`
+* Added columns `fantasy`, `fantasy_id`, `fantasy_player_name`, and `fantasy_player_id` that indicate the rusher or receiver on the play
 
 # nflfastR 3.2.0
 

diff --git a/R/helper_add_ep_wp.R b/R/helper_add_ep_wp.R
@@ -797,13 +797,19 @@ add_wp_variables <- function(pbp_data) {
   pbp_data <- pbp_data %>%
     dplyr::mutate(
       wp = OffWinProb,
-      vegas_wp = OffWinProb_spread) %>%
+      vegas_wp = OffWinProb_spread,
+      # for figuring out posteam on NA posteam lines
+      tmp_posteam = .data$posteam
+      ) %>%
     tidyr::fill(
       .data$wp, .direction = "up"
     ) %>%
     tidyr::fill(
       .data$vegas_wp, .direction = "up"
     ) %>%
+    tidyr::fill(
+      .data$tmp_posteam, .direction = "up"
+    ) %>%
     dplyr::mutate(
       wp = dplyr::if_else(is.na(.data$posteam), NA_real_, .data$wp),
       def_wp = 1 - .data$wp,
@@ -813,9 +819,8 @@ add_wp_variables <- function(pbp_data) {
                                .data$wp, .data$def_wp),
 
       #add columns for WP taking into account spread
-      vegas_wp = dplyr::if_else(is.na(.data$posteam), NA_real_, .data$vegas_wp),
-      vegas_home_wp = dplyr::if_else(.data$posteam == .data$home_team,
-                                     .data$vegas_wp, 1 - .data$vegas_wp),
+      # vegas_wp = dplyr::if_else(is.na(.data$posteam), NA_real_, .data$vegas_wp),
+      vegas_home_wp = dplyr::if_else(.data$tmp_posteam == .data$home_team, .data$vegas_wp, 1 - .data$vegas_wp),
       #make 1 or 0 the final win prob
       vegas_home_wp = dplyr::if_else(
         stringr::str_detect(
@@ -827,8 +832,13 @@ add_wp_variables <- function(pbp_data) {
           .data$home_score == .data$away_score ~ .5
         ),
         .data$vegas_home_wp
+      ),
+      vegas_home_wpa = dplyr::lead(.data$vegas_home_wp) - .data$vegas_home_wp,
+      vegas_wpa = dplyr::if_else(.data$tmp_posteam == .data$home_team, .data$vegas_home_wpa, -.data$vegas_home_wpa),
+      vegas_wpa = dplyr::if_else(
+        stringr::str_detect(tolower(.data$desc), " kneels "), NA_real_, .data$vegas_wpa
+        )
       )
-    )
 
   # For now follow the code from before, will need to update later:
   # Create the possible WPA values

diff --git a/R/helper_add_nflscrapr_mutations.R b/R/helper_add_nflscrapr_mutations.R
@@ -36,43 +36,35 @@ add_nflscrapr_mutations <- function(pbp) {
     dplyr::arrange(.data$order_sequence, .data$quarter, !is.na(.data$quarter_seconds_remaining), -.data$quarter_seconds_remaining, !is.na(.data$drive), .data$drive, .data$index, .by_group = TRUE) %>%
     dplyr::ungroup() %>%
     dplyr::mutate(
-      # Fill in the rows with missing posteam with the lag:
+
+      # Make the possession team for kickoffs be the return team, since that is
+      # more intuitive from the EPA / WPA point of view:
+      posteam = dplyr::case_when(
+        # kickoff_finder is defined below
+        (.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$home_team ~ .data$away_team,
+        (.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$away_team ~ .data$home_team,
+        TRUE ~ .data$posteam
+      ),
+
+      # Fill in the rows with missing posteam with the lead:
       posteam = dplyr::if_else(
         (.data$quarter_end == 1 | .data$posteam == ""),
-        dplyr::lag(.data$posteam),
+        dplyr::lead(.data$posteam),
         .data$posteam),
       posteam_id = dplyr::if_else(
         (.data$quarter_end == 1 | .data$posteam_id == ""),
-        dplyr::lag(.data$posteam_id),
+        dplyr::lead(.data$posteam_id),
         .data$posteam_id),
+
       # Denote whether the home or away team has possession:
       posteam_type = dplyr::if_else(.data$posteam == .data$home_team, "home", "away"),
+
       # Column denoting which team is on defense:
       defteam = dplyr::if_else(
-        .data$posteam_type == "home",
+        .data$posteam == .data$home_team,
         .data$away_team, .data$home_team
       ),
-      # Make the possession team for kickoffs be the return team, since that is
-      # more intuitive from the EPA / WPA point of view:
-      posteam = dplyr::case_when(
-        # kickoff_finder is defined below
-        (.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam_type == "home" ~ .data$away_team,
-        (.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam_type == "away" ~ .data$home_team,
-        TRUE ~ .data$posteam
-      ),
-      defteam = dplyr::case_when(
-        # kickoff_finder is defined below
-        (.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam_type == "home" ~ .data$home_team,
-        (.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam_type == "away" ~ .data$away_team,
-        TRUE ~ .data$defteam
-      ),
-      # Now flip the posteam_type as well:
-      posteam_type = dplyr::case_when(
-        # kickoff_finder is defined below
-        (.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$home_team ~ "home",
-        (.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$away_team ~ "away",
-        TRUE ~ .data$posteam_type
-      ),
+
       yardline = dplyr::if_else(.data$yardline == "50", "MID 50", .data$yardline),
       yardline = dplyr::if_else(
         nchar(.data$yardline) == 0 | is.null(.data$yardline) | .data$yardline == "NULL" | is.na(.data$yardline),

diff --git a/R/helper_additional_functions.R b/R/helper_additional_functions.R
@@ -36,9 +36,14 @@
 #' \item{rusher_id}{ID of the player in the 'rusher' column (NOTE: ids vary pre and post 2011 but are consistent for each player. Please see details for further information)}
 #' \item{receiver_id}{ID of the player in the 'receiver' column (NOTE: ids vary pre and post 2011 but are consistent for each player. Please see details for further information)}
 #' \item{name}{Name of the 'passer' if it is not 'NA', or name of the 'rusher' otherwise.}
+#' \item{fantasy}{Name of the rusher on rush plays or receiver on pass plays.}
+#' \item{fantasy_id}{ID of the rusher on rush plays or receiver on pass plays.}
+#' \item{fantasy_player_name}{Name of the rusher on rush plays or receiver on pass plays (from official stats).}
+#' \item{fantasy_player_id}{ID of the rusher on rush plays or receiver on pass plays (from official stats).}
 #' \item{jersey_number}{Jersey number of the player listed in the 'name' column.}
 #' \item{id}{ID of the player in the 'name' column (NOTE: ids vary pre and post 2011 but are consistent for each player. Please see details for further information)}
 #' \item{qb_epa}{Gives QB credit for EPA for up to the point where a receiver lost a fumble after a completed catch and makes EPA work more like passing yards on plays with fumbles.}
+#' \item{out_of_bounds}{1 if play description contains ran ob, pushed ob, or sacked ob; 0 otherwise.}
 #' }
 #' @export
 #' @import dplyr
@@ -51,7 +56,7 @@ clean_pbp <- function(pbp, ...) {
     usethis::ui_info("Nothing to clean. Return passed data frame.")
     r <- pbp
   } else{
-    rlang::inform(paste0(crayon::red(cli::symbol$bullet), " Cleaning up play-by-play... (", crayon::yellow(cli::symbol$info), " If you run this with a lot of seasons this could take a few minutes.)"))
+    rlang::inform(paste0(crayon::red(cli::symbol$bullet), " Cleaning up play-by-play..."))
 
     # Load id map to standardize player ids for players that were active before 2011
     # and in or after 2011 meaning they appear with old gsis_ids and new ids
@@ -213,7 +218,35 @@ clean_pbp <- function(pbp, ...) {
         dplyr::vars(.data$passer_id, .data$rusher_id, .data$receiver_id, .data$id, ends_with("player_id")),
         update_ids, legacy_id_map) %>%
       dplyr::arrange(.data$index) %>%
-      dplyr::select(-"index")
+      dplyr::select(-"index") %>%
+      # add action player
+      dplyr::mutate(
+        fantasy_player_name = case_when(
+          !is.na(.data$rusher_player_name) ~ .data$rusher_player_name,
+          is.na(.data$rusher_player_name) & !is.na(.data$receiver_player_name) ~ .data$receiver_player_name,
+          TRUE ~ NA_character_
+        ),
+        fantasy_player_id = case_when(
+          !is.na(.data$rusher_player_id) ~ .data$rusher_player_id,
+          is.na(.data$rusher_player_id) & !is.na(.data$receiver_player_id) ~ .data$receiver_player_id,
+          TRUE ~ NA_character_
+        ),
+        fantasy = case_when(
+          !is.na(.data$rusher) ~ .data$rusher,
+          is.na(.data$rusher) & !is.na(.data$receiver) ~ .data$receiver,
+          .data$qb_scramble == 1 ~ .data$passer,
+          TRUE ~ NA_character_
+        ),
+        fantasy_id = case_when(
+          !is.na(.data$rusher_id) ~ .data$rusher_id,
+          is.na(.data$rusher_id) & !is.na(.data$receiver_id) ~ .data$receiver_id,
+          .data$qb_scramble == 1 ~ .data$passer_id,
+          TRUE ~ NA_character_
+        ),
+        out_of_bounds = dplyr::if_else(
+          stringr::str_detect(.data$desc, "(ran ob)|(pushed ob)|(sacked ob)"), 1, 0
+        )
+      )
   }
 
   message_completed("Cleaning completed", ...)

diff --git a/R/helper_decode_player_ids.R b/R/helper_decode_player_ids.R
@@ -4,7 +4,7 @@
 # Code Style Guide: styler::tidyverse_style()
 ################################################################################
 
-#' Decode the player IDs in nflfastR play-by-ply data
+#' Decode the player IDs in nflfastR play-by-play data
 #'
 #' @inheritParams clean_pbp
 #' @param fast If \code{TRUE} the IDs will be decoded with the high efficient
@@ -14,9 +14,9 @@
 #' which can take several days to fix on CRAN.)
 #'
 #' @description Takes all columns ending with \code{'player_id'} as well as the
-#' variables \code{'passer_id'}, \code{'rusher_id'}, \code{'receiver_id'} and \code{'id'}
-#' of an nflfastR play-by-play data set and decodes the player IDs to the commonly
-#' known GSIS ID format 00-00xxxxx.
+#' variables \code{'passer_id'}, \code{'rusher_id'}, \code{'fantasy_id'},
+#' \code{'receiver_id'}, and \code{'id'} of an nflfastR play-by-play data set
+#' and decodes the player IDs to the commonly known GSIS ID format 00-00xxxxx.
 #'
 #' The function uses by default the high efficient \link[gsisdecoder]{decode_ids}
 #' of the package \href{https://cran.r-project.org/package=gsisdecoder}{\code{gsisdecoder}}.
@@ -62,7 +62,7 @@ decode_player_ids <- function(pbp, ..., fast = TRUE) {
     ret <- pbp %>%
       dplyr::mutate_at(
         dplyr::vars(
-          tidyselect::any_of(c("passer_id", "rusher_id", "receiver_id", "id")),
+          tidyselect::any_of(c("passer_id", "rusher_id", "receiver_id", "id", "fantasy_id")),
           tidyselect::ends_with("player_id")
         ),
         decode_ids, pp

diff --git a/R/helper_variable_selector.R b/R/helper_variable_selector.R
@@ -22,7 +22,7 @@ select_variables <- function(pbp) {
 }
 
 # columns that are not in gamecenter that we created
-new_cols <- c("season", "cp", "cpoe", "series", "series_success", "series_result")
+new_cols <- c("season", "cp", "cpoe", "series", "series_success", "series_result", "vegas_home_wpa", "vegas_wpa")
 
 # original nflscrapr columns
 nflscrapr_cols <-

diff --git a/R/top-level_scraper.R b/R/top-level_scraper.R
@@ -119,6 +119,8 @@
 #' \item{home_wp}{Estimated win probability for the home team.}
 #' \item{away_wp}{Estimated win probability for the away team.}
 #' \item{wpa}{Win probability added (WPA) for the posteam.}
+#' \item{vegas_wpa}{Win probability added (WPA) for the posteam: spread_adjusted model.}
+#' \item{vegas_home_wpa}{Win probability added (WPA) for the home team: spread_adjusted model.}
 #' \item{home_wp_post}{Estimated win probability for the home team at the end of the play.}
 #' \item{away_wp_post}{Estimated win probability for the away team at the end of the play.}
 #' \item{vegas_wp}{Estimated win probabiity for the posteam given the current situation at the start of the given play, incorporating pre-game Vegas line.}

diff --git a/data-raw/create_field_descriptions.R b/data-raw/create_field_descriptions.R
@@ -9,4 +9,5 @@ field_descriptions <- tibble(x = x) %>%
   separate(x,"{",into = c(NA,"Field","Description")) %>%
   mutate_all(str_remove_all,"\\}")
 
-usethis::use_data(field_descriptions,overwrite = TRUE)
+# usethis::use_data(field_descriptions,overwrite = TRUE)
+save(field_descriptions, file = "vignettes/field_descriptions.rda")
diff --git a/data-raw/variable_list.txt b/data-raw/variable_list.txt
@@ -93,6 +93,8 @@
 #' \item{home_wp}{Estimated win probability for the home team.}
 #' \item{away_wp}{Estimated win probability for the away team.}
 #' \item{wpa}{Win probability added (WPA) for the posteam.}
+#' \item{vegas_wpa}{Win probability added (WPA) for the posteam: spread_adjusted model.}
+#' \item{vegas_home_wpa}{Win probability added (WPA) for the home team: spread_adjusted model.}
 #' \item{home_wp_post}{Estimated win probability for the home team at the end of the play.}
 #' \item{away_wp_post}{Estimated win probability for the away team at the end of the play.}
 #' \item{vegas_wp}{Estimated win probabiity for the posteam given the current situation at the start of the given play, incorporating pre-game Vegas line.}
@@ -332,9 +334,16 @@
 #' \item{name}{Name of the 'passer' if it is not 'NA', or name of the 'rusher' otherwise.}
 #' \item{jersey_number}{Jersey number of the player listed in the 'name' column.}
 #' \item{id}{ID of the player in the 'name' column (NOTE: ids vary pre and post 2011 but are consistent for each player. Please see details for further information)}
+#' \item{fantasy}{Name of the rusher on rush plays or receiver on pass plays.}
+#' \item{fantasy_id}{ID of the rusher on rush plays or receiver on pass plays.}
+#' \item{fantasy_player_name}{Name of the rusher on rush plays or receiver on pass plays (from official stats).}
+#' \item{fantasy_player_id}{ID of the rusher on rush plays or receiver on pass plays (from official stats).}
+#' \item{out_of_bounds}{1 if play description contains ran ob, pushed ob, or sacked ob; 0 otherwise.}
 #' \item{qb_epa}{Gives QB credit for EPA for up to the point where a receiver lost a fumble after a completed catch and makes EPA work more like passing yards on plays with fumbles.}
 #' \item{xyac_epa}{Expected value of EPA gained after the catch, starting from where the catch was made. Zero yards after the catch would be listed as zero EPA.}
 #' \item{xyac_success}{Probability play earns positive EPA (relative to where play started) based on where ball was caught.}
 #' \item{xyac_fd}{Probability play earns a first down based on where the ball was caught.}
 #' \item{xyac_mean_yardage}{Average expected yards after the catch based on where the ball was caught.}
 #' \item{xyac_median_yardage}{Median expected yards after the catch based on where the ball was caught.}
+#' \item{xpass}{Probability of dropback scaled from 0 to 1.}
+#' \item{pass_oe}{Dropback percent over expected on a given play scaled from 0 to 100.}
diff --git a/man/clean_pbp.Rd b/man/clean_pbp.Rd
diff --git a/man/decode_player_ids.Rd b/man/decode_player_ids.Rd
diff --git a/man/fast_scraper.Rd b/man/fast_scraper.Rd
diff --git a/vignettes/field_descriptions.rda b/vignettes/field_descriptions.rda