Skip to content

Commit

Permalink
Push nflfastR 2.2.1
Browse files Browse the repository at this point in the history
  • Loading branch information
mrcaseb committed Sep 1, 2020
1 parent a268308 commit 8381508
Show file tree
Hide file tree
Showing 22 changed files with 316 additions and 139 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: nflfastR
Title: Functions to Efficiently Scrape NFL Play by Play Data
Version: 2.2.0.9001
Title: Functions to Efficiently Access NFL Play by Play Data
Version: 2.2.1
Authors@R:
c(person(given = "Sebastian",
family = "Carl",
Expand All @@ -26,7 +26,8 @@ Authors@R:
family = "Ventura",
role = "ctb",
email = "samventura22@gmail.com"))
Description: A set of functions to efficiently scrape NFL play-by-play data.
Description: A set of functions to access National Football League play-by-play
data from <https://www.nfl.com/>.
License: MIT + file LICENSE
URL: https://mrcaseb.github.io/nflfastR/, https://github.com/mrcaseb/nflfastR
BugReports: https://github.com/mrcaseb/nflfastR/issues
Expand Down
11 changes: 10 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
# nflfastR (development version)
# nflfastR 2.2.1

* Fix `add_xyac()` breaking with some old packages
* Fix `add_xyac()` and `add_qb_epa()` calculations being wrong for some failed 4th downs
* Updated Readme with ep and cp model plots
* Updated `vignette("examples")` with the new `add_xyac()` function
* Added xYAC model to `vignette("nflfastR-models")`
* Added variables `fixed_drive` and `fixed_drive_result` to the output of
`fast_scraper()` because the NFL-provided drive info is extremely buggy
* Added variable `series_result`
* `clean_pbp()` now adds 4 new variables `passer_jersey_number`,
`rusher_jersey_number`, `receiver_jersey_number` and `jersey_number`. These can
be used to join rosters.
* Fixed incorrect `timeout_team`, `return_team`, `fumble_recovery_1_team` for JAX
games from 2011-2015
* Re-trained EPA model with `fixed_drive` and corrections to `timeout_team`

# nflfastR 2.2.0

Expand Down
14 changes: 8 additions & 6 deletions R/helper_add_ep_wp.R
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ add_ep_variables <- function(pbp_data) {
total_home_pass_epa = cumsum(.data$home_team_pass_epa),
total_away_pass_epa = cumsum(.data$away_team_pass_epa)) %>%
dplyr::ungroup() %>%
return
return()
}


Expand Down Expand Up @@ -832,13 +832,15 @@ add_wp_variables <- function(pbp_data) {
#because other team will have the ball so WP from their perspective
#this is for backfilling WP on PATs
wp =
dplyr::if_else((.data$kickoff_attempt == 0 & (stringr::str_detect(.data$desc, 'Kick formation') | stringr::str_detect(.data$desc, 'Pass formation')) & is.na(.data$down)) |
dplyr::if_else((.data$kickoff_attempt == 0 & !(stringr::str_detect(.data$desc, 'Onside Kick')) &
(stringr::str_detect(.data$desc, 'Kick formation') | stringr::str_detect(.data$desc, 'Pass formation')) & is.na(.data$down)) |
stringr::str_detect(.data$desc, 'extra point') |
!is.na(.data$two_point_conv_result) |
!is.na(.data$extra_point_result),
1 - .data$wp, .data$wp),
vegas_wp =
dplyr::if_else((.data$kickoff_attempt == 0 & (stringr::str_detect(.data$desc, 'Kick formation') | stringr::str_detect(.data$desc, 'Pass formation')) & is.na(.data$down)) |
dplyr::if_else((.data$kickoff_attempt == 0 & !(stringr::str_detect(.data$desc, 'Onside Kick')) &
(stringr::str_detect(.data$desc, 'Kick formation') | stringr::str_detect(.data$desc, 'Pass formation')) & is.na(.data$down)) |
stringr::str_detect(.data$desc, 'extra point') |
!is.na(.data$two_point_conv_result) |
!is.na(.data$extra_point_result),
Expand Down Expand Up @@ -1026,7 +1028,7 @@ add_wp_variables <- function(pbp_data) {
total_home_pass_wpa = cumsum(.data$home_team_pass_wpa),
total_away_pass_wpa = cumsum(.data$away_team_pass_wpa)) %>%
dplyr::ungroup() %>%
return
return()

}

Expand Down Expand Up @@ -1187,7 +1189,7 @@ add_air_yac_ep_variables <- function(pbp_data) {
total_home_raw_yac_epa = cumsum(.data$home_team_raw_yac_epa),
total_away_raw_yac_epa = cumsum(.data$away_team_raw_yac_epa)) %>%
dplyr::ungroup() %>%
return
return()
}


Expand Down Expand Up @@ -1501,6 +1503,6 @@ add_air_yac_wp_variables <- function(pbp_data) {
total_home_raw_yac_wpa = cumsum(.data$home_team_raw_yac_wpa),
total_away_raw_yac_wpa = cumsum(.data$away_team_raw_yac_wpa)) %>%
dplyr::ungroup() %>%
return
return()

}
64 changes: 64 additions & 0 deletions R/helper_add_fixed_drives.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
################################################################################
# Author: Sebastian Carl, Ben Baldwin
# Purpose: Function to add drive variables
# Code Style Guide: styler::tidyverse_style()
################################################################################

## fixed_drive =
## starts at 1, each new drive, numbers shared across both teams
## fixed_drive_result =
## result of given drive

#' @import dplyr
#' @importFrom rlang .data
#' @importFrom stats na.omit
add_drive_results <- function(d) {
drive_df <- d %>%
dplyr::group_by(.data$game_id, .data$game_half) %>%
dplyr::mutate(
row = 1:dplyr::n(),
new_drive = dplyr::if_else(
# change in posteam
.data$posteam != dplyr::lag(.data$posteam) |
# change in posteam in t-2 and na posteam in t-1
(.data$posteam != dplyr::lag(.data$posteam, 2) & is.na(dplyr::lag(.data$posteam))) |
# change in posteam in t-3 and na posteam in t-1 and t-2
(.data$posteam != dplyr::lag(.data$posteam, 3) & is.na(dplyr::lag(.data$posteam, 2)) & is.na(dplyr::lag(.data$posteam))),
1, 0
),
# first observation of a half is also a new drive
new_drive = dplyr::if_else(.data$row == 1, 1, .data$new_drive),
new_drive = dplyr::if_else(is.na(.data$new_drive), 0, .data$new_drive)
) %>%
dplyr::group_by(.data$game_id) %>%
dplyr::mutate(
fixed_drive = cumsum(.data$new_drive),
tmp_result = dplyr::case_when(
.data$touchdown == 1 & .data$posteam == .data$td_team ~ "Touchdown",
.data$touchdown == 1 & .data$posteam != .data$td_team ~ "Opp touchdown",
.data$field_goal_result == "made" ~ "Field goal",
.data$field_goal_result %in% c("blocked", "missed") ~ "Missed field goal",
.data$safety == 1 ~ "Safety",
.data$interception == 1 | .data$fumble_lost == 1 ~ "Turnover",
.data$play_type == "punt" | .data$punt_attempt == 1 ~ "Punt",
.data$down == 4 & .data$yards_gained < .data$ydstogo & .data$play_type != "no_play" ~ "Turnover on downs",
.data$desc %in% c("END GAME", "END QUARTER 2", "END QUARTER 4") ~ "End of half"
)
) %>%
dplyr::group_by(.data$game_id, .data$fixed_drive) %>%
dplyr::mutate(
fixed_drive_result =
dplyr::if_else(
# if it's end of half, take the first thing we see
dplyr::last(stats::na.omit(.data$tmp_result)) == "End of half",
dplyr::first(stats::na.omit(.data$tmp_result)),
# otherwise take the last
dplyr::last(stats::na.omit(.data$tmp_result))
)
) %>%
dplyr::ungroup() %>%
dplyr::select(-"row", -"new_drive", -"tmp_result")

message("added fixed drive variables")
return(drive_df)
}
15 changes: 10 additions & 5 deletions R/helper_add_nflscrapr_mutations.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ add_nflscrapr_mutations <- function(pbp) {
dplyr::group_by(.data$game_id) %>%
# the !is.na(drive), drive part is to make the initial GAME line show up first
# https://stackoverflow.com/questions/43343590/how-to-sort-putting-nas-first-in-dplyr
dplyr::arrange(.data$quarter, !is.na(.data$quarter_seconds_remaining), -.data$quarter_seconds_remaining, !is.na(.data$drive), .data$drive, .data$index, .by_group = TRUE) %>%
dplyr::arrange(.data$order_sequence, .data$quarter, !is.na(.data$quarter_seconds_remaining), -.data$quarter_seconds_remaining, !is.na(.data$drive), .data$drive, .data$index, .by_group = TRUE) %>%
dplyr::ungroup() %>%
dplyr::mutate(
# Fill in the rows with missing posteam with the lag:
Expand All @@ -55,15 +55,16 @@ add_nflscrapr_mutations <- function(pbp) {
# Make the possession team for kickoffs be the return team, since that is
# more intuitive from the EPA / WPA point of view:
posteam = dplyr::if_else(
.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, "Offside on Free Kick"),
# kickoff_finder is defined below
.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder),
dplyr::if_else(
.data$posteam_type == "home",
.data$away_team, .data$home_team
),
.data$posteam
),
defteam = dplyr::if_else(
.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, "Offside on Free Kick"),
.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder),
dplyr::if_else(
.data$posteam_type == "home",
.data$home_team, .data$away_team
Expand All @@ -72,7 +73,7 @@ add_nflscrapr_mutations <- function(pbp) {
),
# Now flip the posteam_type as well:
posteam_type = dplyr::if_else(
.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, "Offside on Free Kick"),
.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder),
dplyr::if_else(
.data$posteam_type == "home",
"away", "home"
Expand All @@ -82,7 +83,7 @@ add_nflscrapr_mutations <- function(pbp) {
yardline = dplyr::if_else(.data$yardline == "50", "MID 50", .data$yardline),
yardline = dplyr::if_else(
nchar(.data$yardline) == 0 | is.null(.data$yardline) | .data$yardline == "NULL" | is.na(.data$yardline),
dplyr::lag(.data$yardline), .data$yardline
dplyr::lead(.data$yardline), .data$yardline
),
yardline_number = dplyr::if_else(
.data$yardline == "MID 50", 50, .data$yardline_number
Expand Down Expand Up @@ -518,6 +519,10 @@ add_nflscrapr_mutations <- function(pbp) {
return(out)
}

# to help find kickoffs on plays with penalties
# otherwise win prob breaks down the road
kickoff_finder <- "(Offside on Free Kick)|(Delay of Kickoff)|(Onside Kick formation)|(kicks onside)|( kicks [:digit:]+ yards from)"


##some steps to prepare the data for the EP/WP/CP/FG models
#' @import dplyr
Expand Down
88 changes: 39 additions & 49 deletions R/helper_add_series_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,71 +9,61 @@
## NA: kickoffs, extra point/two point conversion attempts, non-plays, no posteam
## series_success =
## 1: scored touchdown, gained enough yards for first down
## 0: punt, interception, fumble lost, turnover on downs, 4th down FG attempt
## NA: series is NA, series contains QB spike/kneel, half ended with none of above

## 0: everything else
#' @import dplyr
#' @importFrom tidyr replace_na
#' @importFrom rlang .data
add_series_data <- function(pbp) {
out <-
pbp %>%
dplyr::group_by(.data$game_id) %>%
dplyr::group_by(.data$game_id, .data$game_half) %>%
dplyr::mutate(
# make down numeric
down = as.numeric(.data$down),
# create a first down indicator which marks first down for the offense
# AND first down after change of possesion (-> drivenumber increases)
# we don't want a first down being indicated for XP, 2P, KO
first_down = dplyr::if_else(
#earn first down
(.data$first_down_rush == 1 | .data$first_down_pass == 1 | .data$first_down_penalty == 1 |
#defensive TD
(.data$touchdown == 1 & .data$td_team != .data$posteam) |
# posteam changes
(
#change in posteam
.data$posteam != dplyr::lead(.data$posteam) |
#change in posteam in t+2 and na posteam in t+1
(.data$posteam != dplyr::lead(.data$posteam, 2) & is.na(dplyr::lead(.data$posteam))) |
#change in posteam in t+3 and na posteam in t+1 and t+2
(.data$posteam != dplyr::lead(.data$posteam, 3) & is.na(dplyr::lead(.data$posteam, 2)) & is.na(dplyr::lead(.data$posteam)))
)
) &
(.data$extra_point_attempt == 0 & .data$two_point_attempt == 0 & .data$kickoff_attempt == 0),
row = 1:dplyr::n(),
new_series = dplyr::if_else(
# a new drive
.data$fixed_drive != dplyr::lag(.data$fixed_drive) |
# or a first down on the prior play
dplyr::lag(.data$first_down_rush == 1) | dplyr::lag(.data$first_down_pass) == 1 | dplyr::lag(.data$first_down_penalty) == 1 |
# or the first play
.data$row == 1,
1, 0
),
# after setting the first down indicator we modify it for the end of a half
first_down = dplyr::if_else(.data$game_half != dplyr::lead(.data$game_half), 1, .data$first_down),
# the 'trigger' is being used for calculatung cumsum because we don't want the
# series number to increase in the play the first down occured but in the next play
trigger = dplyr::lag(.data$first_down, 1, 0)
new_series = dplyr::if_else(is.na(.data$new_series), 0, .data$new_series)
) %>%
# now compute series number with cumsum (for the calculation NA are being relaced with 0)
dplyr::mutate(series = cumsum(tidyr::replace_na(.data$trigger, 0)) + 1) %>%
dplyr::group_by(.data$game_id) %>%
dplyr::mutate(
# now modificated series number for special cases
series = dplyr::if_else(
.data$kickoff_attempt == 1 | .data$extra_point_attempt == 1 |
.data$two_point_attempt == 1 | is.na(.data$down) |
is.na(.data$posteam),
NA_real_,
.data$series
),
series_success = dplyr::case_when(
is.na(.data$series) | .data$qb_kneel == 1 | .data$qb_spike == 1 ~ NA_real_,
(.data$touchdown == 1 & .data$td_team == posteam) | .data$first_down_rush == 1 | .data$first_down_pass == 1 |
.data$first_down_penalty == 1 ~ 1,
.data$punt_attempt == 1 | .data$interception == 1 | .data$fumble_lost == 1 |
.data$fourth_down_failed == 1 | .data$field_goal_attempt == 1 ~ 0,
TRUE ~ 0
series = cumsum(.data$new_series),
tmp_result = dplyr::case_when(
(.data$first_down_penalty == 1 | .data$first_down_rush == 1 | .data$first_down_pass == 1) & touchdown == 0 ~ "First down",
.data$touchdown == 1 & .data$posteam == .data$td_team ~ "Touchdown",
.data$touchdown == 1 & .data$posteam != .data$td_team ~ "Opp touchdown",
.data$field_goal_result == "made" ~ "Field goal",
.data$field_goal_result %in% c("blocked", "missed") ~ "Missed field goal",
.data$safety == 1 ~ "Safety",
.data$interception == 1 | .data$fumble_lost == 1 ~ "Turnover",
.data$play_type == "punt" | .data$punt_attempt == 1 ~ "Punt",
.data$down == 4 & .data$yards_gained < .data$ydstogo & .data$play_type != "no_play" ~ "Turnover on downs",
.data$qb_kneel == 1 ~ "QB kneel",
.data$desc %in% c("END GAME", "END QUARTER 2", "END QUARTER 4") ~ "End of half"
)
) %>%
dplyr::group_by(.data$game_id, .data$series) %>%
# set series_success value for the whole series
dplyr::mutate(series_success = dplyr::last(.data$series_success)) %>%
dplyr::mutate(
series_result =
dplyr::if_else(
# if it's end of half, take the first thing we see
dplyr::last(stats::na.omit(.data$tmp_result)) == "End of half",
dplyr::first(stats::na.omit(.data$tmp_result)),
# otherwise take the last
dplyr::last(stats::na.omit(.data$tmp_result))
),
series_success = dplyr::if_else(
.data$series_result %in% c("Touchdown", "First down"), 1, 0
)
) %>%
dplyr::ungroup() %>%
dplyr::select(-"first_down", -"trigger")
dplyr::select(-"row", -"tmp_result", -"new_series")

message("added series variables")
return(out)
Expand Down
4 changes: 2 additions & 2 deletions R/helper_add_xyac.R
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ add_xyac <- function(pbp) {
message("No non-NA values for xyac calculation detected. xyac variables set to NA")
}

# on old versions of dplyr, a .group column is created, which we don't want
pbp <- pbp %>% dplyr::select(-tidyselect::any_of(".group"))
# on old versions of dplyr, a .groups column is created, which we don't want
pbp <- pbp %>% dplyr::select(-tidyselect::any_of(".groups"))

return(pbp)
}
Expand Down
Loading

0 comments on commit 8381508

Please sign in to comment.