Skip to content

Commit

Permalink
Merge pull request #177 from mrcaseb/vegas_wpa
Browse files Browse the repository at this point in the history
Add vegas_wpa, out_of_bounds, fantasy player columns
  • Loading branch information
guga31bb authored Feb 12, 2021
2 parents 548dfc1 + 5398ea2 commit ca4544d
Show file tree
Hide file tree
Showing 15 changed files with 185 additions and 63 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: nflfastR
Title: Functions to Efficiently Access NFL Play by Play Data
Version: 3.2.0.9011
Version: 3.2.0.9012
Authors@R:
c(person(given = "Sebastian",
family = "Carl",
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ required for `calculate_win_probability()`
* Fixed a bug related to the calculation of EPA on plays before a failed pass interference challenge in a few 2019 games (#175)
* Fixed a bug related to lots of fields with `NA` on offsetting penalties (#44)
* Switched data source for 2001-2010 to what is used for 2011 and on
* Added columns `vegas_wpa` and `vegas_home_wpa` which contain Win Probability Added from the spread-adjusted WPA model
* Fix bug in `epa` when possession team changes at end of 1st or 3rd quarter (#182)
* Added column `out_of_bounds`
* Added columns `fantasy`, `fantasy_id`, `fantasy_player_name`, and `fantasy_player_id` that indicate the rusher or receiver on the play

# nflfastR 3.2.0

Expand Down
20 changes: 15 additions & 5 deletions R/helper_add_ep_wp.R
Original file line number Diff line number Diff line change
Expand Up @@ -797,13 +797,19 @@ add_wp_variables <- function(pbp_data) {
pbp_data <- pbp_data %>%
dplyr::mutate(
wp = OffWinProb,
vegas_wp = OffWinProb_spread) %>%
vegas_wp = OffWinProb_spread,
# for figuring out posteam on NA posteam lines
tmp_posteam = .data$posteam
) %>%
tidyr::fill(
.data$wp, .direction = "up"
) %>%
tidyr::fill(
.data$vegas_wp, .direction = "up"
) %>%
tidyr::fill(
.data$tmp_posteam, .direction = "up"
) %>%
dplyr::mutate(
wp = dplyr::if_else(is.na(.data$posteam), NA_real_, .data$wp),
def_wp = 1 - .data$wp,
Expand All @@ -813,9 +819,8 @@ add_wp_variables <- function(pbp_data) {
.data$wp, .data$def_wp),

#add columns for WP taking into account spread
vegas_wp = dplyr::if_else(is.na(.data$posteam), NA_real_, .data$vegas_wp),
vegas_home_wp = dplyr::if_else(.data$posteam == .data$home_team,
.data$vegas_wp, 1 - .data$vegas_wp),
# vegas_wp = dplyr::if_else(is.na(.data$posteam), NA_real_, .data$vegas_wp),
vegas_home_wp = dplyr::if_else(.data$tmp_posteam == .data$home_team, .data$vegas_wp, 1 - .data$vegas_wp),
#make 1 or 0 the final win prob
vegas_home_wp = dplyr::if_else(
stringr::str_detect(
Expand All @@ -827,8 +832,13 @@ add_wp_variables <- function(pbp_data) {
.data$home_score == .data$away_score ~ .5
),
.data$vegas_home_wp
),
vegas_home_wpa = dplyr::lead(.data$vegas_home_wp) - .data$vegas_home_wp,
vegas_wpa = dplyr::if_else(.data$tmp_posteam == .data$home_team, .data$vegas_home_wpa, -.data$vegas_home_wpa),
vegas_wpa = dplyr::if_else(
stringr::str_detect(tolower(.data$desc), " kneels "), NA_real_, .data$vegas_wpa
)
)
)

# For now follow the code from before, will need to update later:
# Create the possible WPA values
Expand Down
42 changes: 17 additions & 25 deletions R/helper_add_nflscrapr_mutations.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,43 +36,35 @@ add_nflscrapr_mutations <- function(pbp) {
dplyr::arrange(.data$order_sequence, .data$quarter, !is.na(.data$quarter_seconds_remaining), -.data$quarter_seconds_remaining, !is.na(.data$drive), .data$drive, .data$index, .by_group = TRUE) %>%
dplyr::ungroup() %>%
dplyr::mutate(
# Fill in the rows with missing posteam with the lag:

# Make the possession team for kickoffs be the return team, since that is
# more intuitive from the EPA / WPA point of view:
posteam = dplyr::case_when(
# kickoff_finder is defined below
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$home_team ~ .data$away_team,
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$away_team ~ .data$home_team,
TRUE ~ .data$posteam
),

# Fill in the rows with missing posteam with the lead:
posteam = dplyr::if_else(
(.data$quarter_end == 1 | .data$posteam == ""),
dplyr::lag(.data$posteam),
dplyr::lead(.data$posteam),
.data$posteam),
posteam_id = dplyr::if_else(
(.data$quarter_end == 1 | .data$posteam_id == ""),
dplyr::lag(.data$posteam_id),
dplyr::lead(.data$posteam_id),
.data$posteam_id),

# Denote whether the home or away team has possession:
posteam_type = dplyr::if_else(.data$posteam == .data$home_team, "home", "away"),

# Column denoting which team is on defense:
defteam = dplyr::if_else(
.data$posteam_type == "home",
.data$posteam == .data$home_team,
.data$away_team, .data$home_team
),
# Make the possession team for kickoffs be the return team, since that is
# more intuitive from the EPA / WPA point of view:
posteam = dplyr::case_when(
# kickoff_finder is defined below
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam_type == "home" ~ .data$away_team,
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam_type == "away" ~ .data$home_team,
TRUE ~ .data$posteam
),
defteam = dplyr::case_when(
# kickoff_finder is defined below
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam_type == "home" ~ .data$home_team,
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam_type == "away" ~ .data$away_team,
TRUE ~ .data$defteam
),
# Now flip the posteam_type as well:
posteam_type = dplyr::case_when(
# kickoff_finder is defined below
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$home_team ~ "home",
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$away_team ~ "away",
TRUE ~ .data$posteam_type
),

yardline = dplyr::if_else(.data$yardline == "50", "MID 50", .data$yardline),
yardline = dplyr::if_else(
nchar(.data$yardline) == 0 | is.null(.data$yardline) | .data$yardline == "NULL" | is.na(.data$yardline),
Expand Down
37 changes: 35 additions & 2 deletions R/helper_additional_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,14 @@
#' \item{rusher_id}{ID of the player in the 'rusher' column (NOTE: ids vary pre and post 2011 but are consistent for each player. Please see details for further information)}
#' \item{receiver_id}{ID of the player in the 'receiver' column (NOTE: ids vary pre and post 2011 but are consistent for each player. Please see details for further information)}
#' \item{name}{Name of the 'passer' if it is not 'NA', or name of the 'rusher' otherwise.}
#' \item{fantasy}{Name of the rusher on rush plays or receiver on pass plays.}
#' \item{fantasy_id}{ID of the rusher on rush plays or receiver on pass plays.}
#' \item{fantasy_player_name}{Name of the rusher on rush plays or receiver on pass plays (from official stats).}
#' \item{fantasy_player_id}{ID of the rusher on rush plays or receiver on pass plays (from official stats).}
#' \item{jersey_number}{Jersey number of the player listed in the 'name' column.}
#' \item{id}{ID of the player in the 'name' column (NOTE: ids vary pre and post 2011 but are consistent for each player. Please see details for further information)}
#' \item{qb_epa}{Gives QB credit for EPA for up to the point where a receiver lost a fumble after a completed catch and makes EPA work more like passing yards on plays with fumbles.}
#' \item{out_of_bounds}{1 if play description contains ran ob, pushed ob, or sacked ob; 0 otherwise.}
#' }
#' @export
#' @import dplyr
Expand All @@ -51,7 +56,7 @@ clean_pbp <- function(pbp, ...) {
usethis::ui_info("Nothing to clean. Return passed data frame.")
r <- pbp
} else{
rlang::inform(paste0(crayon::red(cli::symbol$bullet), " Cleaning up play-by-play... (", crayon::yellow(cli::symbol$info), " If you run this with a lot of seasons this could take a few minutes.)"))
rlang::inform(paste0(crayon::red(cli::symbol$bullet), " Cleaning up play-by-play..."))

# Load id map to standardize player ids for players that were active before 2011
# and in or after 2011 meaning they appear with old gsis_ids and new ids
Expand Down Expand Up @@ -213,7 +218,35 @@ clean_pbp <- function(pbp, ...) {
dplyr::vars(.data$passer_id, .data$rusher_id, .data$receiver_id, .data$id, ends_with("player_id")),
update_ids, legacy_id_map) %>%
dplyr::arrange(.data$index) %>%
dplyr::select(-"index")
dplyr::select(-"index") %>%
# add action player
dplyr::mutate(
fantasy_player_name = case_when(
!is.na(.data$rusher_player_name) ~ .data$rusher_player_name,
is.na(.data$rusher_player_name) & !is.na(.data$receiver_player_name) ~ .data$receiver_player_name,
TRUE ~ NA_character_
),
fantasy_player_id = case_when(
!is.na(.data$rusher_player_id) ~ .data$rusher_player_id,
is.na(.data$rusher_player_id) & !is.na(.data$receiver_player_id) ~ .data$receiver_player_id,
TRUE ~ NA_character_
),
fantasy = case_when(
!is.na(.data$rusher) ~ .data$rusher,
is.na(.data$rusher) & !is.na(.data$receiver) ~ .data$receiver,
.data$qb_scramble == 1 ~ .data$passer,
TRUE ~ NA_character_
),
fantasy_id = case_when(
!is.na(.data$rusher_id) ~ .data$rusher_id,
is.na(.data$rusher_id) & !is.na(.data$receiver_id) ~ .data$receiver_id,
.data$qb_scramble == 1 ~ .data$passer_id,
TRUE ~ NA_character_
),
out_of_bounds = dplyr::if_else(
stringr::str_detect(.data$desc, "(ran ob)|(pushed ob)|(sacked ob)"), 1, 0
)
)
}

message_completed("Cleaning completed", ...)
Expand Down
10 changes: 5 additions & 5 deletions R/helper_decode_player_ids.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Code Style Guide: styler::tidyverse_style()
################################################################################

#' Decode the player IDs in nflfastR play-by-ply data
#' Decode the player IDs in nflfastR play-by-play data
#'
#' @inheritParams clean_pbp
#' @param fast If \code{TRUE} the IDs will be decoded with the high efficient
Expand All @@ -14,9 +14,9 @@
#' which can take several days to fix on CRAN.)
#'
#' @description Takes all columns ending with \code{'player_id'} as well as the
#' variables \code{'passer_id'}, \code{'rusher_id'}, \code{'receiver_id'} and \code{'id'}
#' of an nflfastR play-by-play data set and decodes the player IDs to the commonly
#' known GSIS ID format 00-00xxxxx.
#' variables \code{'passer_id'}, \code{'rusher_id'}, \code{'fantasy_id'},
#' \code{'receiver_id'}, and \code{'id'} of an nflfastR play-by-play data set
#' and decodes the player IDs to the commonly known GSIS ID format 00-00xxxxx.
#'
#' The function uses by default the high efficient \link[gsisdecoder]{decode_ids}
#' of the package \href{https://cran.r-project.org/package=gsisdecoder}{\code{gsisdecoder}}.
Expand Down Expand Up @@ -62,7 +62,7 @@ decode_player_ids <- function(pbp, ..., fast = TRUE) {
ret <- pbp %>%
dplyr::mutate_at(
dplyr::vars(
tidyselect::any_of(c("passer_id", "rusher_id", "receiver_id", "id")),
tidyselect::any_of(c("passer_id", "rusher_id", "receiver_id", "id", "fantasy_id")),
tidyselect::ends_with("player_id")
),
decode_ids, pp
Expand Down
2 changes: 1 addition & 1 deletion R/helper_variable_selector.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ select_variables <- function(pbp) {
}

# columns that are not in gamecenter that we created
new_cols <- c("season", "cp", "cpoe", "series", "series_success", "series_result")
new_cols <- c("season", "cp", "cpoe", "series", "series_success", "series_result", "vegas_home_wpa", "vegas_wpa")

# original nflscrapr columns
nflscrapr_cols <-
Expand Down
2 changes: 2 additions & 0 deletions R/top-level_scraper.R
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@
#' \item{home_wp}{Estimated win probability for the home team.}
#' \item{away_wp}{Estimated win probability for the away team.}
#' \item{wpa}{Win probability added (WPA) for the posteam.}
#' \item{vegas_wpa}{Win probability added (WPA) for the posteam: spread_adjusted model.}
#' \item{vegas_home_wpa}{Win probability added (WPA) for the home team: spread_adjusted model.}
#' \item{home_wp_post}{Estimated win probability for the home team at the end of the play.}
#' \item{away_wp_post}{Estimated win probability for the away team at the end of the play.}
#' \item{vegas_wp}{Estimated win probabiity for the posteam given the current situation at the start of the given play, incorporating pre-game Vegas line.}
Expand Down
3 changes: 2 additions & 1 deletion data-raw/create_field_descriptions.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ field_descriptions <- tibble(x = x) %>%
separate(x,"{",into = c(NA,"Field","Description")) %>%
mutate_all(str_remove_all,"\\}")

usethis::use_data(field_descriptions,overwrite = TRUE)
# usethis::use_data(field_descriptions,overwrite = TRUE)
save(field_descriptions, file = "vignettes/field_descriptions.rda")
9 changes: 9 additions & 0 deletions data-raw/variable_list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@
#' \item{home_wp}{Estimated win probability for the home team.}
#' \item{away_wp}{Estimated win probability for the away team.}
#' \item{wpa}{Win probability added (WPA) for the posteam.}
#' \item{vegas_wpa}{Win probability added (WPA) for the posteam: spread_adjusted model.}
#' \item{vegas_home_wpa}{Win probability added (WPA) for the home team: spread_adjusted model.}
#' \item{home_wp_post}{Estimated win probability for the home team at the end of the play.}
#' \item{away_wp_post}{Estimated win probability for the away team at the end of the play.}
#' \item{vegas_wp}{Estimated win probabiity for the posteam given the current situation at the start of the given play, incorporating pre-game Vegas line.}
Expand Down Expand Up @@ -332,9 +334,16 @@
#' \item{name}{Name of the 'passer' if it is not 'NA', or name of the 'rusher' otherwise.}
#' \item{jersey_number}{Jersey number of the player listed in the 'name' column.}
#' \item{id}{ID of the player in the 'name' column (NOTE: ids vary pre and post 2011 but are consistent for each player. Please see details for further information)}
#' \item{fantasy}{Name of the rusher on rush plays or receiver on pass plays.}
#' \item{fantasy_id}{ID of the rusher on rush plays or receiver on pass plays.}
#' \item{fantasy_player_name}{Name of the rusher on rush plays or receiver on pass plays (from official stats).}
#' \item{fantasy_player_id}{ID of the rusher on rush plays or receiver on pass plays (from official stats).}
#' \item{out_of_bounds}{1 if play description contains ran ob, pushed ob, or sacked ob; 0 otherwise.}
#' \item{qb_epa}{Gives QB credit for EPA for up to the point where a receiver lost a fumble after a completed catch and makes EPA work more like passing yards on plays with fumbles.}
#' \item{xyac_epa}{Expected value of EPA gained after the catch, starting from where the catch was made. Zero yards after the catch would be listed as zero EPA.}
#' \item{xyac_success}{Probability play earns positive EPA (relative to where play started) based on where ball was caught.}
#' \item{xyac_fd}{Probability play earns a first down based on where the ball was caught.}
#' \item{xyac_mean_yardage}{Average expected yards after the catch based on where the ball was caught.}
#' \item{xyac_median_yardage}{Median expected yards after the catch based on where the ball was caught.}
#' \item{xpass}{Probability of dropback scaled from 0 to 1.}
#' \item{pass_oe}{Dropback percent over expected on a given play scaled from 0 to 100.}
5 changes: 5 additions & 0 deletions man/clean_pbp.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions man/decode_player_ids.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/fast_scraper.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified vignettes/field_descriptions.rda
Binary file not shown.
Loading

0 comments on commit ca4544d

Please sign in to comment.