Skip to content

Commit

Permalink
Push nflfastR 2.1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
mrcaseb committed Jul 25, 2020
1 parent a02e069 commit d5b52b8
Show file tree
Hide file tree
Showing 19 changed files with 113 additions and 59 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: nflfastR
Title: Functions to Efficiently Scrape NFL Play by Play Data
Version: 2.1.1
Version: 2.1.2
Authors@R:
c(person(given = "Sebastian",
family = "Carl",
Expand Down Expand Up @@ -48,7 +48,7 @@ Imports:
stringr (>= 1.3.0),
tibble (>= 3.0),
tidyr (>= 1.0.0),
tidyselect (>= 1.0.0),
tidyselect (>= 1.1.0),
xgboost (>= 1.1)
Suggests:
DBI,
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,5 @@ importFrom(tibble,tibble)
importFrom(tidyr,replace_na)
importFrom(tidyr,unnest)
importFrom(tidyr,unnest_wider)
importFrom(tidyselect,any_of)
importFrom(tidyselect,matches)
importFrom(tidyselect,one_of)
13 changes: 13 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# nflfastR 2.1.2

* Added the following columns that are available 2011 and later: `special_teams_play`, `st_play_type`, `time_of_day`, and `order_sequence`
* Added `old_game_id` column (useful for merging to external data that still uses this ID: format is YYYYMMDDxx)
* The `clean_pbp()` function now adds an `aborted_play` column
* Fixed a bug where pass plays with a penalty at end of play were classified as `play_type` = `no_play` rather than `pass`
* Fixed bug where EPA on defensive 2 point return was -0.95 instead of -2.95
* Fixed some remaining failed challenge plays that incorrectly had 0 for EPA
* Updated the included dataframe `teams_colors_logos` for the interim name of
the 'Washington Football Team' and the corresponding logo urls.
* Some internal code improvements causing the required `tidyselect` version
to be >= 1.1.0

# nflfastR 2.1.1

### Functions
Expand Down
8 changes: 4 additions & 4 deletions R/ep_wp_calculators.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,14 @@
#' }
#' @importFrom rlang .data
#' @importFrom dplyr select mutate bind_cols
#' @importFrom tidyselect one_of
#' @importFrom tidyselect any_of
#' @importFrom stats predict
#' @export
calculate_expected_points <- function(pbp_data) {
suppressWarnings(
model_data <- pbp_data %>%
# drop existing values of ep and the probs before making new ones
dplyr::select(-one_of(drop.cols)) %>%
dplyr::select(-any_of(drop.cols)) %>%
make_model_mutations() %>%
ep_model_select()
)
Expand Down Expand Up @@ -114,15 +114,15 @@ drop.cols <- c(
#' }
#' @importFrom rlang .data
#' @importFrom dplyr select mutate if_else rename bind_cols
#' @importFrom tidyselect one_of
#' @importFrom tidyselect any_of
#' @importFrom stats predict
#' @importFrom tibble as_tibble
#' @export
calculate_win_probability <- function(pbp_data) {
suppressWarnings(
model_data <- pbp_data %>%
# drop existing values of ep and the probs before making new ones
dplyr::select(-one_of(drop.cols.wp)) %>%
dplyr::select(-any_of(drop.cols.wp)) %>%
dplyr::mutate(
home = dplyr::if_else(.data$posteam == .data$home_team, 1, 0),
ExpScoreDiff = .data$ep + .data$score_differential,
Expand Down
52 changes: 47 additions & 5 deletions R/helper_add_ep_wp.R
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ add_ep_variables <- function(pbp_data) {
base_ep_preds$TwoPoint_Prob <- 0

# Find the indices for these types of plays:
extrapoint_i <- which(pbp_data$play_type == "extra_point")
extrapoint_i <- which(pbp_data$play_type == "extra_point" & pbp_data$play_type_nfl != "PAT2")
twopoint_i <- which(pbp_data$two_point_attempt == 1)

#new: special case for PAT or kickoff with penalty
Expand All @@ -364,8 +364,22 @@ add_ep_variables <- function(pbp_data) {

# ----------------------------------------------------------------------------------
# Insert NAs for timeouts and end of play rows:
missing_i <- which((pbp_data$timeout == 1 & pbp_data$play_type == "no_play" &
!stringr::str_detect(pbp_data$desc, ' pass ')) | is.na(pbp_data$play_type))
missing_i <- which(
(pbp_data$timeout == 1 &
pbp_data$play_type == "no_play" &
!stringr::str_detect(pbp_data$desc, ' pass ') &
!stringr::str_detect(pbp_data$desc, ' sacked ') &
!stringr::str_detect(pbp_data$desc, ' scramble ') &
!stringr::str_detect(pbp_data$desc, ' punts ') &
!stringr::str_detect(pbp_data$desc, ' up the middle ') &
!stringr::str_detect(pbp_data$desc, ' left end ') &
!stringr::str_detect(pbp_data$desc, ' left guard ') &
!stringr::str_detect(pbp_data$desc, ' left tackle ') &
!stringr::str_detect(pbp_data$desc, ' right end ') &
!stringr::str_detect(pbp_data$desc, ' right guard ') &
!stringr::str_detect(pbp_data$desc, ' right tackle ')
) |
is.na(pbp_data$play_type))

# Now update the probabilities for missing and PATs:
base_ep_preds$Field_Goal[c(missing_i, extrapoint_i, twopoint_i, st_penalty_i)] <- 0
Expand Down Expand Up @@ -444,6 +458,10 @@ add_ep_variables <- function(pbp_data) {
.data$two_point_pass_failed == 1 |
.data$two_point_pass_reception_failed == 1)),
0 - .data$ExpPts, .data$EPA),
# Opponent scores defensive 2 point:
EPA = dplyr::if_else(
.data$defensive_two_point_conv == 1, -2 - .data$ExpPts, .data$EPA
),
# Opponent safety:
EPA = dplyr::if_else(is.na(.data$td_team) & .data$field_goal_made == 0 &
.data$extra_point_good == 0 &
Expand Down Expand Up @@ -599,9 +617,33 @@ add_ep_variables <- function(pbp_data) {
extra_point_prob = "ExPoint_Prob",
two_point_conversion_prob = "TwoPoint_Prob") %>%
# Create columns with cumulative epa totals for both teams:
dplyr::mutate(ep = dplyr::if_else(.data$timeout == 1 & .data$play_type == "no_play" & !stringr::str_detect(.data$desc, ' pass '),
dplyr::mutate(ep = dplyr::if_else(.data$timeout == 1 & .data$play_type == "no_play" &
!stringr::str_detect(.data$desc, ' pass ') &
!stringr::str_detect(.data$desc, ' sacked ') &
!stringr::str_detect(.data$desc, ' scramble ') &
!stringr::str_detect(.data$desc, ' punts ') &
!stringr::str_detect(.data$desc, ' up the middle ') &
!stringr::str_detect(.data$desc, ' left end ') &
!stringr::str_detect(.data$desc, ' left guard ') &
!stringr::str_detect(.data$desc, ' left tackle ') &
!stringr::str_detect(.data$desc, ' right end ') &
!stringr::str_detect(.data$desc, ' right guard ') &
!stringr::str_detect(.data$desc, ' right tackle ')
,
dplyr::lead(.data$ep), .data$ep),
epa = dplyr::if_else(.data$timeout == 1 & .data$play_type == "no_play" & !stringr::str_detect(.data$desc, ' pass '),
epa = dplyr::if_else(.data$timeout == 1 & .data$play_type == "no_play" &
!stringr::str_detect(.data$desc, ' pass ') &
!stringr::str_detect(.data$desc, ' sacked ') &
!stringr::str_detect(.data$desc, ' scramble ') &
!stringr::str_detect(.data$desc, ' punts ') &
!stringr::str_detect(.data$desc, ' up the middle ') &
!stringr::str_detect(.data$desc, ' left end ') &
!stringr::str_detect(.data$desc, ' left guard ') &
!stringr::str_detect(.data$desc, ' left tackle ') &
!stringr::str_detect(.data$desc, ' right end ') &
!stringr::str_detect(.data$desc, ' right guard ') &
!stringr::str_detect(.data$desc, ' right tackle ')
,
0, .data$epa),
# Change epa for plays occurring at end of half with no scoring
# plays to be just the difference between 0 and starting ep:
Expand Down
2 changes: 1 addition & 1 deletion R/helper_add_game_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ add_game_data <- function(pbp) {
dplyr::left_join(
readRDS(url(url)) %>%
dplyr::select(
"game_id", "away_score", "home_score", "location", "result", "total",
"game_id", "old_game_id", "away_score", "home_score", "location", "result", "total",
"spread_line", "total_line", "div_game", "roof", "surface", "temp", "wind",
"home_coach", "away_coach", "stadium", "stadium_id", "gameday"
) %>%
Expand Down
16 changes: 8 additions & 8 deletions R/helper_add_nflscrapr_mutations.R
Original file line number Diff line number Diff line change
Expand Up @@ -293,23 +293,23 @@ add_nflscrapr_mutations <- function(pbp) {
),
play_type = dplyr::if_else(
(.data$penalty == 0 |
(.data$penalty == 1 & .data$penalty_fix == 1)) & .data$qb_spike == 1,
"qb_spike", .data$play_type
(.data$penalty == 1 & .data$penalty_fix == 1)) & .data$field_goal_attempt == 1,
"field_goal", .data$play_type
),
play_type = dplyr::if_else(
(.data$penalty == 0 |
(.data$penalty == 1 & .data$penalty_fix == 1)) & .data$qb_kneel == 1,
"qb_kneel", .data$play_type
(.data$penalty == 1 & .data$penalty_fix == 1)) & .data$extra_point_attempt == 1,
"extra_point", .data$play_type
),
play_type = dplyr::if_else(
(.data$penalty == 0 |
(.data$penalty == 1 & .data$penalty_fix == 1)) & .data$field_goal_attempt == 1,
"field_goal", .data$play_type
(.data$penalty == 1 & .data$penalty_fix == 1)) & .data$qb_spike == 1,
"qb_spike", .data$play_type
),
play_type = dplyr::if_else(
(.data$penalty == 0 |
(.data$penalty == 1 & .data$penalty_fix == 1)) & .data$extra_point_attempt == 1,
"extra_point", .data$play_type
(.data$penalty == 1 & .data$penalty_fix == 1)) & .data$qb_kneel == 1,
"qb_kneel", .data$play_type
),
# Indicator for QB dropbacks (exclude spikes and kneels):
qb_dropback = dplyr::if_else(
Expand Down
10 changes: 6 additions & 4 deletions R/helper_additional_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#' \item{rush}{Binary indicator if the play was a rushing play.}
#' \item{special}{Binary indicator if the play was a special teams play.}
#' \item{first_down}{Binary indicator if the play ended in a first down.}
#' \item{aborted_play}{Binary indicator if the play description indicates "Aborted".}
#' \item{play}{Binary indicator: 1 if the play was a 'normal' play (including penalties), 0 otherwise.}
#' \item{passer_id}{ID of the player in the 'passer' column (NOTE: ids vary pre and post 2011)}
#' \item{rusher_id}{ID of the player in the 'rusher' column (NOTE: ids vary pre and post 2011)}
Expand All @@ -39,7 +40,7 @@
#' @importFrom stringr str_detect str_extract str_replace_all
#' @importFrom glue glue
#' @importFrom rlang .data
#' @importFrom tidyselect one_of
#' @importFrom tidyselect any_of
clean_pbp <- function(pbp) {
message('Cleaning up play-by-play. If you run this with a lot of seasons this could take a few minutes.')

Expand All @@ -48,7 +49,7 @@ clean_pbp <- function(pbp) {
legacy_id_map <- readRDS(url("https://github.com/guga31bb/nflfastR-data/blob/master/roster-data/legacy_id_map.rds?raw=true"))

# drop existing values of clean_pbp
pbp <- pbp %>% dplyr::select(-tidyselect::one_of(drop.cols))
pbp <- pbp %>% dplyr::select(-tidyselect::any_of(drop.cols))

r <- pbp %>%
dplyr::mutate(
Expand Down Expand Up @@ -129,6 +130,7 @@ clean_pbp <- function(pbp) {
TRUE ~ receiver
),
first_down = dplyr::if_else(.data$first_down_rush == 1 | .data$first_down_pass == 1 | .data$first_down_penalty == 1, 1, 0),
aborted_play = dplyr::if_else(stringr::str_detect(.data$desc, 'Aborted'), 1, 0),
# easy filter: play is 1 if a "special teams" play, or 0 otherwise
# with thanks to Lee Sharpe for the code
special = dplyr::if_else(.data$play_type %in%
Expand Down Expand Up @@ -257,11 +259,11 @@ update_ids <- function(var, id_map) {
#' @export
#' @import dplyr
#' @importFrom rlang .data
#' @importFrom tidyselect one_of
#' @importFrom tidyselect any_of
add_qb_epa <- function(d) {

# drop existing values of clean_pbp
d <- d %>% dplyr::select(-tidyselect::one_of("qb_epa"))
d <- d %>% dplyr::select(-tidyselect::any_of("qb_epa"))

fumbles_df <- d %>%
dplyr::filter(.data$complete_pass == 1 & .data$fumble_lost == 1 & !is.na(.data$epa) & !is.na(.data$down)) %>%
Expand Down
6 changes: 5 additions & 1 deletion R/helper_scrape_gc.R
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,11 @@ get_pbp_gc <- function(gameId, dir = NULL) {
play_type_nfl = NA_character_,
drive_yards_penalized = NA_real_,
end_clock_time = NA_character_,
end_yard_line = NA_character_
end_yard_line = NA_character_,
order_sequence = NA_real_,
time_of_day = NA_character_,
special_teams_play = NA_real_,
st_play_type = NA_character_
) %>%
dplyr::group_by(.data$drive) %>%
dplyr::mutate(
Expand Down
20 changes: 0 additions & 20 deletions R/helper_scrape_nfl.R
Original file line number Diff line number Diff line change
Expand Up @@ -264,26 +264,6 @@ get_pbp_nfl <- function(id, dir = NULL) {
}


# otherwise scraping a lot of seasons breaks
save_cols <- c(
"game_id", "nfl_api_id", "home_team", "away_team",
"season", "game_month",
"game_year", "time", "down", "drive_net_yards",
"drive", "first_down", "goal_to_go", "order_sequence",
"play_description", "play_review_status",
"play_type_nfl", "quarter", "sp",
"scoring_play_type", "special_teams_play",
"time_of_day",
"yardline", "yards",
"yards_to_go", "latest_play",
"posteam",
"scoring_team_id",
"scoring_team_abbreviation", "scoring_team_nick_name",
"ydsnet", "drive_yards_penalized",
"posteam_id", "yardline_side",
"yardline_number", "quarter_end"
)

# hard coded 2020 regular season game ids to make sure the output of the
# schedule scraper is not named 'invalid' if the source file not yet exists
valid_games <- c(
Expand Down
2 changes: 2 additions & 0 deletions R/helper_tidy_play_stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ sum_play_stats <- function(play_Id, stats) {
row$pass_attempt <- 1
row$passer_player_id <- play_stats$player.esbId[index]
row$passer_player_name <- play_stats$player.displayName[index]
row$penalty_fix <- 1
} else if (stat_id == 15) {
row$pass_attempt <- 1
row$complete_pass <- 1
Expand All @@ -118,6 +119,7 @@ sum_play_stats <- function(play_Id, stats) {
row$pass_attempt <- 1
row$passer_player_id <- play_stats$player.esbId[index]
row$passer_player_name <- play_stats$player.displayName[index]
row$penalty_fix <- 1
} else if (stat_id == 20) {
row$pass_attempt <- 1
row$sack <- 1
Expand Down
10 changes: 5 additions & 5 deletions R/helper_variable_selector.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
################################################################################

#' @import dplyr
#' @importFrom tidyselect one_of
#' @importFrom tidyselect any_of
select_variables <- function(pbp) {

suppressWarnings(
out <-
pbp %>%
dplyr::select(
tidyselect::one_of(
tidyselect::any_of(
c(nflscrapr_cols, new_cols, api_cols)
)
)
Expand All @@ -27,7 +27,7 @@ new_cols <- c("season", "cp", "cpoe", "series", "series_success")
# original nflscrapr columns
nflscrapr_cols <-
c(
"play_id", "game_id", "home_team", "away_team",
"play_id", "game_id", "old_game_id", "home_team", "away_team",
#added these to new gc scraper
"season_type", "week",
"posteam", "posteam_type", "defteam", "side_of_field", "yardline_100",
Expand Down Expand Up @@ -124,10 +124,10 @@ rs_cols <- c(

# these are columns in the new API that aren't in nflscrapR
api_cols <- c(
"start_time",
"order_sequence", "start_time", "time_of_day",
"stadium", "weather", "nfl_api_id",
"play_clock", "play_deleted",
"play_type_nfl",
"play_type_nfl", "special_teams_play", "st_play_type",
"end_clock_time", "end_yard_line",
"drive_real_start_time",

Expand Down
11 changes: 8 additions & 3 deletions R/top-level_scraper.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#' \describe{
#' \item{play_id}{Numeric play id that when used with game_id and drive provides the unique identifier for a single play.}
#' \item{game_id}{Ten digit identifier for NFL game.}
#' \item{old_game_id}{Legacy NFL game ID.}
#' \item{home_team}{String abbreviation for the home team.}
#' \item{away_team}{String abbreviation for the away team.}
#' \item{season_type}{'REG' or 'POST' indicating if the game belongs to regular or post season.}
Expand Down Expand Up @@ -69,7 +70,7 @@
#' \item{two_point_conv_result}{String indicator for result of two point conversion attempt: success, failure, safety (touchback in defensive endzone is 1 point apparently), or return.}
#' \item{home_timeouts_remaining}{Numeric timeouts remaining in the half for the home team.}
#' \item{away_timeouts_remaining}{Numeric timeouts remaining in the half for the away team.}
#' \item{timeout}{Binary indicator for whether or not a timeout was called.}
#' \item{timeout}{Binary indicator for whether or not a timeout was called by either team.}
#' \item{timeout_team}{String abbreviation for which team called the timeout.}
#' \item{td_team}{String abbreviation for which team scored the touchdown.}
#' \item{posteam_timeouts_remaining}{Number of timeouts remaining for the possession team.}
Expand Down Expand Up @@ -116,8 +117,8 @@
#' \item{home_wp}{Estimated win probability for the home team.}
#' \item{away_wp}{Estimated win probability for the away team.}
#' \item{wpa}{Win probability added (WPA) for the posteam.}
#' \item{home_wp_post}{Estimated win probability for the home team at the start of the play.}
#' \item{away_wp_post}{Estimated win probability for the away team at the start of the play.}
#' \item{home_wp_post}{Estimated win probability for the home team at the end of the play.}
#' \item{away_wp_post}{Estimated win probability for the away team at the end of the play.}
#' \item{vegas_wp}{Estimated win probabiity for the posteam given the current situation at the start of the given play, incorporating pre-game Vegas line.}
#' \item{vegas_home_wp}{Estimated win probability for the home team incorporating pre-game Vegas line.}
#' \item{total_home_rush_wpa}{Cumulative total rushing WPA for the home team in the game so far.}
Expand Down Expand Up @@ -288,12 +289,16 @@
#' \item{series}{Starts at 1, each new first down increments, numbers shared across both teams NA: kickoffs, extra point/two point conversion attempts, non-plays, no posteam}
#' \item{series_success}{1: scored touchdown, gained enough yards for first down 0: punt, interception, fumble lost, turnover on downs, FG attempt NA: series is NA, series contains QB spike/kneel}
#' \item{start_time}{Kickoff time in eastern time zone.}
#' \item{order_sequence}{Column provided by NFL to fix out-of-order plays. Available 2011 and beyond.}
#' \item{time_of_day}{Time of day of play in UTC "HH:MM:SS" format. Available 2011 and beyond.}
#' \item{stadium}{Game site name.}
#' \item{weather}{String describing the weather including temperature, humidity and wind (direction and speed). Doesn't change during the game!}
#' \item{nfl_api_id}{UUID of the game in the new NFL API.}
#' \item{play_clock}{Time on the playclock when the ball was snapped.}
#' \item{play_deleted}{Binary indicator for deleted plays.}
#' \item{play_type_nfl}{Play type as listed in the NFL source. Slightly different to the regular play_type variable.}
#' \item{special_teams_play}{Binary indicator for whether play is special teams play from NFL source. Available 2011 and beyond.}
#' \item{st_play_type}{Type of special teams play from NFL source. Available 2011 and beyond.}
#' \item{end_clock_time}{Game time at the end of a given play.}
#' \item{end_yard_line}{String indicating the yardline at the end of the given play consisting of team half and yard line number.}
#' \item{drive_real_start_time}{Local day time when the drive started (currently not used by the NFL and therefore mostly 'NA').}
Expand Down
Binary file modified data/teams_colors_logos.rda
Binary file not shown.
Loading

0 comments on commit d5b52b8

Please sign in to comment.