Skip to content

Commit

Permalink
Merge pull request #230 from mrcaseb/wp
Browse files Browse the repository at this point in the history
fix wpa on end game line, fix pat wp and fantasy id decoding
  • Loading branch information
mrcaseb authored Mar 28, 2021
2 parents 66d0de8 + c4eef03 commit 5af7e30
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 122 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: nflfastR
Title: Functions to Efficiently Access NFL Play by Play Data
Version: 4.1.0
Version: 4.1.0.9001
Authors@R:
c(person(given = "Sebastian",
family = "Carl",
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# nflfastR (development version)

* All `wpa` variables are `NA` on end game line
* All `wp` variables are 0, 0.5, 1, or `NA` on end game line
* Fix bug where win prob on PATs assumed a PAT placed at 15 yard line, even in older seasons
* The function `decode_player_ids()` now really decodes the new variable `fantasy_id` (#229)
* Fixed a bug that caused slightly differing `wp` values depending on the first game in the data set (#183)

# nflfastR 4.1.0

## Breaking changes
Expand Down
175 changes: 64 additions & 111 deletions R/helper_add_ep_wp.R
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ add_ep_variables <- function(pbp_data) {
add_wp_variables <- function(pbp_data) {

#testing only
#pbp_data <- g
# pbp_data <- g

# Initialize the df to store predicted win probability
OffWinProb <- rep(NA_real_, nrow(pbp_data))
Expand Down Expand Up @@ -645,10 +645,14 @@ add_wp_variables <- function(pbp_data) {

## start PAT fix

make_pat_prob <- as.numeric(mgcv::predict.bam(fastrmodels::fg_model, newdata = pbp_data %>% mutate(yardline_100 = 15), type="response"))
make_pat_prob <- make_pat_prob[1]

pat_data <- pbp_data
make_pat_prob <- as.numeric(
mgcv::predict.bam(
fastrmodels::fg_model,
newdata = pbp_data %>%
mutate(
yardline_100 = ifelse(.data$season >= 2015, 15, 3)
), type="response")
)

# plays with 1 point PAT attempts
pat_i <- which(
Expand All @@ -672,6 +676,10 @@ add_wp_variables <- function(pbp_data) {
!is.na(pbp_data$two_point_conv_result)
)

# some rare 2 point PAT attempts have duplicated matches in 1 point PAT attempts
# so we remove them in the next line
pat_i <- pat_i[!pat_i %in% two_pt_i]

# make df of post-PAT plays
pat_data <- pbp_data %>%
dplyr::mutate(
Expand Down Expand Up @@ -780,112 +788,71 @@ add_wp_variables <- function(pbp_data) {
tidyr::fill(
.data$tmp_posteam, .direction = "up"
) %>%
dplyr::group_by(.data$game_id) %>%
dplyr::mutate(
wp = dplyr::if_else(is.na(.data$posteam), NA_real_, .data$wp),
def_wp = 1 - .data$wp,
home_wp = dplyr::if_else(.data$posteam == .data$home_team,
.data$wp, .data$def_wp),
away_wp = dplyr::if_else(.data$posteam == .data$away_team,
.data$wp, .data$def_wp),

#add columns for WP taking into account spread
# vegas_wp = dplyr::if_else(is.na(.data$posteam), NA_real_, .data$vegas_wp),
#add columns for home WP
home_wp = dplyr::if_else(.data$tmp_posteam == .data$home_team, .data$wp, 1 - .data$wp),
vegas_home_wp = dplyr::if_else(.data$tmp_posteam == .data$home_team, .data$vegas_wp, 1 - .data$vegas_wp),

# convenience to mark end of game
end_game = ifelse(
stringr::str_detect(tolower(.data$desc), "(end of game)|(end game)"),
1, 0
),

# convenience for marking home win prob on last line
final_value = dplyr::case_when(
.data$home_score > .data$away_score ~ 1,
.data$away_score > .data$home_score ~ 0,
.data$home_score == .data$away_score ~ .5
),

#make 1 or 0 the final win prob
vegas_home_wp = dplyr::if_else(
stringr::str_detect(
tolower(.data$desc), "(end of game)|(end game)"
),
dplyr::case_when(
.data$home_score > .data$away_score ~ 1,
.data$away_score > .data$home_score ~ 0,
.data$home_score == .data$away_score ~ .5
),
.data$end_game == 1,
.data$final_value,
.data$vegas_home_wp
),

# can we make this and the above into a function? feels like a lot of repitition
home_wp = dplyr::if_else(
.data$end_game == 1,
.data$final_value,
.data$home_wp
),

away_wp = 1 - .data$home_wp,

# make wp of posteam on last line NA because there's no posteam
vegas_wp = dplyr::if_else(
stringr::str_detect(
tolower(.data$desc), "(end of game)|(end game)"
),
.data$end_game == 1,
NA_real_,
.data$vegas_wp
),

wp = dplyr::if_else(
.data$end_game == 1,
NA_real_,
.data$wp
),

def_wp = 1 - .data$wp,

# make wpa
vegas_home_wpa = dplyr::lead(.data$vegas_home_wp) - .data$vegas_home_wp,
vegas_wpa = dplyr::if_else(.data$tmp_posteam == .data$home_team, .data$vegas_home_wpa, -.data$vegas_home_wpa),
vegas_wpa = dplyr::if_else(
stringr::str_detect(tolower(.data$desc), " kneels "), NA_real_, .data$vegas_wpa
)
)

# For now follow the code from before, will need to update later:
# Create the possible WPA values
pbp_data <- dplyr::mutate(pbp_data,
# Team keeps possession (most general case):
WPA_base = dplyr::lead(.data$wp) - .data$wp,
# Team keeps possession but either Timeout, Two Minute Warning,
# Quarter End is the following row
WPA_base_nxt = dplyr::lead(.data$wp,2) - .data$wp,
# Change of possession and no timeout,
# two minute warning, or quarter end follows:
WPA_change = (1 - dplyr::lead(.data$wp)) - .data$wp,
# Change of possession but either Timeout,
# Two Minute Warning, or
# Quarter End is the following row:
WPA_change_nxt = (1 - dplyr::lead(.data$wp, 2)) - .data$wp,
# End of quarter, half or end rows:
WPA_halfend_to = 0)
# Create a WPA column for the last play of the game:
pbp_data$WPA_final <- ifelse(pbp_data$score_differential_post > 0 & pbp_data$posteam == pbp_data$home_team,
1 - pbp_data$home_wp,
ifelse(pbp_data$score_differential_post > 0 & pbp_data$posteam == pbp_data$away_team,
1 - pbp_data$away_wp,
ifelse(pbp_data$score_differential_post <= 0 & pbp_data$posteam == pbp_data$home_team,
0 - pbp_data$home_wp,
ifelse(pbp_data$score_differential_post <= 0 & pbp_data$posteam == pbp_data$away_team,
0 - pbp_data$away_wp, 0))))

pbp_data$WPA_base_nxt_ind <- with(pbp_data,
ifelse(posteam == dplyr::lead(posteam, 2) &
(is.na(dplyr::lead(play_type)) |
(dplyr::lead(timeout) == 1 &
dplyr::lead(play_type) == "no_play")), 1, 0))

pbp_data$WPA_change_nxt_ind <- with(pbp_data,
ifelse(posteam != dplyr::lead(posteam, 2) &
(is.na(dplyr::lead(play_type)) |
(dplyr::lead(timeout) == 1 &
dplyr::lead(play_type) == "no_play")), 1, 0))

pbp_data$WPA_change_ind <- with(pbp_data,
ifelse(posteam != dplyr::lead(posteam) &
!is.na(dplyr::lead(play_type)) &
(dplyr::lead(timeout) == 0 |
(dplyr::lead(timeout) == 1 &
dplyr::lead(play_type) != "no_play")), 1, 0))
pbp_data$WPA_halfend_to_ind <- with(pbp_data,
ifelse(is.na(play_type) |
(timeout == 1 & play_type == "no_play"), 1, 0))
pbp_data$WPA_final_ind <- with(pbp_data, ifelse(stringr::str_detect(dplyr::lead(tolower(desc)),
"(end of game)|(end game)"), 1, 0))

# Replace the missings with 0 due to how ifelse treats missings
pbp_data$WPA_base_nxt_ind[is.na(pbp_data$WPA_base_nxt_ind)] <- 0
pbp_data$WPA_change_nxt_ind[is.na(pbp_data$WPA_change_nxt_ind)] <- 0
pbp_data$WPA_change_ind[is.na(pbp_data$WPA_change_ind)] <- 0
pbp_data$WPA_halfend_to_ind[is.na(pbp_data$WPA_halfend_to_ind)] <- 0
pbp_data$WPA_final_ind[is.na(pbp_data$WPA_final_ind)] <- 0


# Assign WPA using these indicator columns:
pbp_data$wpa <- with(pbp_data,
ifelse(WPA_final_ind == 1, WPA_final,
ifelse(WPA_halfend_to_ind == 1, WPA_halfend_to,
ifelse(WPA_change_nxt_ind == 1, WPA_change_nxt,
ifelse(WPA_base_nxt_ind == 1, WPA_base_nxt,
ifelse(WPA_change_ind == 1, WPA_change,
WPA_base))))))
stringr::str_detect(tolower(.data$desc), "( kneels )|(end of game)|(end game)"), NA_real_, .data$vegas_wpa
),

# home wpa isn't saved but needed for next line
home_wpa = dplyr::lead(.data$home_wp) - .data$home_wp,
wpa = dplyr::if_else(.data$tmp_posteam == .data$home_team, .data$home_wpa, -.data$home_wpa),
wpa = dplyr::if_else(
stringr::str_detect(tolower(.data$desc), "( kneels )|(end of game)|(end game)"), NA_real_, .data$wpa
)
) %>%
dplyr::ungroup()

# Home and Away post:

Expand Down Expand Up @@ -913,21 +880,11 @@ add_wp_variables <- function(pbp_data) {

# For plays with playtype of End of Game, use the previous play's WP_post columns
# as the pre and post, since those are already set to be 1 and 0:
pbp_data$home_wp <- with(pbp_data,
ifelse(stringr::str_detect(tolower(desc),
"(end of game)|(end game)"),
dplyr::lag(home_wp_post),
home_wp))

pbp_data$home_wp_post <- with(pbp_data,
ifelse(stringr::str_detect(tolower(desc),
"(end of game)|(end game)"), dplyr::lag(home_wp_post),
ifelse(dplyr::lag(play_type) == "no_play" & play_type == "no_play", dplyr::lag(home_wp_post),home_wp_post)))
pbp_data$away_wp <- with(pbp_data,
ifelse(stringr::str_detect(tolower(desc),
"(end of game)|(end game)"),
dplyr::lag(away_wp_post),
away_wp))

pbp_data$away_wp_post <- with(pbp_data,
ifelse(stringr::str_detect(tolower(desc),
Expand All @@ -937,11 +894,7 @@ add_wp_variables <- function(pbp_data) {


# Now drop the unnecessary columns, rename variables back, and return:
pbp_data %>% dplyr::select(-c("WPA_base","WPA_base_nxt","WPA_change_nxt","WPA_change",
"WPA_halfend_to", "WPA_final",
"WPA_base_nxt_ind", "WPA_change_nxt_ind",
"WPA_change_ind", "WPA_halfend_to_ind", "WPA_final_ind"
)) %>%
pbp_data %>%
dplyr::group_by(.data$game_id) %>%
dplyr::mutate(
# Generate columns to keep track of cumulative rushing and
Expand Down
2 changes: 1 addition & 1 deletion R/helper_decode_player_ids.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ decode_player_ids <- function(pbp, ..., fast = TRUE) {
ret <- pbp %>%
dplyr::mutate_at(
dplyr::vars(
tidyselect::any_of(c("passer_id", "rusher_id", "receiver_id", "id")),
tidyselect::any_of(c("passer_id", "rusher_id", "receiver_id", "id", "fantasy_id")),
tidyselect::ends_with("player_id")
),
gsisdecoder::decode_ids
Expand Down
25 changes: 16 additions & 9 deletions data-raw/compare_dfs.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ compare_pbp <- function(id, cols) {
mutate(
ep = round(ep, 2),
epa = round(epa, 2),
vegas_home_wp = round(vegas_home_wp, 2)
vegas_home_wp = round(vegas_home_wp, 2),
vegas_home_wpa = round(vegas_home_wpa, 2),
home_wp = round(home_wp, 2)
)

repo_pbp <- readRDS(url(glue::glue("https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_{s}.rds"))) %>%
Expand All @@ -30,18 +32,23 @@ compare_pbp <- function(id, cols) {
mutate(
ep = round(ep, 2),
epa = round(epa, 2),
vegas_home_wp = round(vegas_home_wp, 2)
vegas_home_wp = round(vegas_home_wp, 2),
vegas_home_wpa = round(vegas_home_wpa, 2),
home_wp = round(home_wp, 2)
)

sum <- arsenal::diffs(arsenal::comparedf(
new_pbp %>% select(-desc),
repo_pbp %>% select(-desc)
new_pbp %>% select(-desc, -game_id, -play_id),
repo_pbp %>% select(-desc, -game_id, -play_id)
))
dfs <- bind_cols(
new_pbp %>% select(-desc),
repo_pbp %>% select(-desc))
new_pbp %>% select(-desc, -game_id, -play_id),
repo_pbp %>% select(-desc, -game_id, -play_id))

dfs$desc <- new_pbp$desc
dfs$play_id <- new_pbp$play_id
dfs$game_id <- new_pbp$game_id


return(
list(sum, dfs)
Expand All @@ -52,10 +59,10 @@ compare_pbp <- function(id, cols) {

cols <- c(
# DO NOT REMOVE THESE ONES OR THE COMPARISON WILL BREAK
"game_id", "play_id", "desc", "ep", "epa", "vegas_home_wp",
"game_id", "play_id", "desc", "ep", "epa",
"vegas_home_wp", "vegas_home_wpa", "home_wp"

# here is stuff you can choose whether to include
"posteam", "home_team", "away_team", "name", "rusher"
# , "posteam_timeouts_remaining", "defteam_timeouts_remaining"
)

Expand All @@ -68,7 +75,7 @@ id <- "2019_01_SF_TB"
id <- "2017_12_JAX_ARI"

ids <- nflfastR::fast_scraper_schedules(2020) %>%
dplyr::slice(11:20) %>%
dplyr::slice(1:20) %>%
pull(game_id)

compared <- compare_pbp(
Expand Down

0 comments on commit 5af7e30

Please sign in to comment.