Push nflfastR v2.1.1

nflverse · Jul 10, 2020 · 75a14cd · 75a14cd
1 parent 496f647
commit 75a14cd
Show file tree

Hide file tree

Showing 12 changed files with 124 additions and 81 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -16,5 +16,4 @@
 ^man/figures/card\.png$
 ^man/figures/header_github\.png$
 ^man/figures/header_twitter\.png$
-^man/figures/logo\.png$
 ^man/figures/nflfastR_logo_fillsize\.png$
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
diff --git a/.travis.yml b/.travis.yml
@@ -1,4 +1,7 @@
 # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
+os:
+  - linux
+  - osx
 
 language: r
 r:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: nflfastR
 Title: Functions to Efficiently Scrape NFL Play by Play Data
-Version: 2.1.0
+Version: 2.1.1
 Authors@R: 
     c(person(given = "Sebastian",
              family = "Carl",
@@ -26,8 +26,7 @@ Authors@R:
              family = "Ventura",
              role = "ctb",
              email = "samventura22@gmail.com"))
-Description: nflfastR is a set of functions to efficiently
-    scrape NFL play-by-play data.
+Description: A set of functions to efficiently scrape NFL play-by-play data.
 License: MIT + file LICENSE
 URL: https://mrcaseb.github.io/nflfastR/, https://github.com/mrcaseb/nflfastR
 BugReports: https://github.com/mrcaseb/nflfastR/issues
@@ -42,23 +41,22 @@ Imports:
     lubridate,
     magrittr,
     mgcv,
-    progressr,
-    purrr,
+    progressr (>= 0.6.0),
+    purrr (>= 0.3.0),
     stats,
     rlang,
-    stringr,
-    tibble,
-    tidyr,
-    tidyselect,
+    stringr (>= 1.3.0),
+    tibble (>= 3.0),
+    tidyr (>= 1.0.0),
+    tidyselect (>= 1.0.0),
     xgboost (>= 1.1)
 Suggests: 
     DBI,
     furrr,
     future,
     rmarkdown,
-    RSQLite,
-    knitr
+    RSQLite
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.1.1
-VignetteBuilder: knitr
+
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,10 +9,12 @@ export(fast_scraper_schedules)
 export(update_db)
 import(dplyr)
 importFrom(dplyr,bind_cols)
+importFrom(dplyr,case_when)
 importFrom(dplyr,filter)
 importFrom(dplyr,first)
 importFrom(dplyr,group_by)
 importFrom(dplyr,if_else)
+importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,rename)
 importFrom(dplyr,select)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,33 @@
+# nflfastR 2.1.1
+
+### Functions
+
+* `clean_pbp()` now standardizes player IDs across the old (1999-2010) and new 
+(2011+) data sources. Player IDs once again uniquely identify players, and each 
+unique player has one unique ID (as they did before the NFL data source change):
+    * For players whose careers finished before 2011, their IDs remain the same
+    * For players who played in both eras or only in the new era, their ID is 
+    the new ID
+    * For example, Akili Smith (ID: 00-0015082) and Alex Smith 
+    (ID: 32013030-2d30-3032-3334-3336b638d37d) are both abbreviated as "A.Smith" 
+    but can be distinguished by their IDs, with Akili showing what the old 
+    format ID looks like, and Alex the new one
+    * Standardization is realized by using an [ID map](https://github.com/guga31bb/nflfastR-data/blob/master/roster-data/legacy_id_map.csv)
+    available in the data repo created with [this script](https://github.com/guga31bb/nflfastR-data/blob/master/roster-data/legacy_id_map.R)
+
+* `clean_pbp()` now removes all variables it is about to create to make sure 
+nothing unexpected can happen
+
+### Miscellaneous
+
+* Added minimum version requirements to some package dependencies because 
+installation broke for some users with outdated packages
+
+* Made a minor bug fix to catch more out-of-order plays and fixed a bug where some
+plays were being incorrectly dropped in older seasons
+
+* Standardized team names (e.g. `SD` --> `LAC`) in some columns we had missed
+
 # nflfastR 2.1.0
 
 ### Models

diff --git a/R/helper_add_nflscrapr_mutations.R b/R/helper_add_nflscrapr_mutations.R
@@ -18,12 +18,13 @@ add_nflscrapr_mutations <- function(pbp) {
     dplyr::mutate(index = 1 : dplyr::n()) %>%
     # remove duplicate plays. can't do this with play_id because duplicate plays
     # sometimes have different play_ids
-    dplyr::group_by(.data$game_id, .data$quarter, .data$time, .data$play_description) %>%
+    dplyr::group_by(.data$game_id, .data$quarter, .data$time, .data$play_description, .data$down) %>%
     dplyr::slice(1) %>%
     dplyr::ungroup() %>%
     dplyr::mutate(
       # Modify the time column for the quarter end:
-      time = dplyr::if_else(.data$quarter_end == 1, "00:00", .data$time),
+      time = dplyr::if_else(.data$quarter_end == 1 |
+                              (.data$play_description == "END GAME" & is.na(.data$time)), "00:00", .data$time),
       time = dplyr::if_else(.data$play_description == 'GAME', "15:00", .data$time),
       # Create a column with the time in seconds remaining for the quarter:
       quarter_seconds_remaining = lubridate::period_to_seconds(lubridate::ms(.data$time))

diff --git a/R/helper_additional_functions.R b/R/helper_additional_functions.R
@@ -13,6 +13,8 @@
 #' the play description; e.g. 24-M.Lynch instead of M.Lynch.
 #' The function also standardizes team abbreviations so that, for example,
 #' the Chargers are always represented by 'LAC' regardless of which year it was.
+#' The function also standardizes player IDs for players appearing in both the
+#' older era (1999-2010) and the new era (2011+).
 #' @return The input Data Frame of the parameter 'pbp' with the following columns
 #' added:
 #' \describe{
@@ -37,8 +39,17 @@
 #' @importFrom stringr str_detect str_extract str_replace_all
 #' @importFrom glue glue
 #' @importFrom rlang .data
+#' @importFrom tidyselect one_of
 clean_pbp <- function(pbp) {
   message('Cleaning up play-by-play. If you run this with a lot of seasons this could take a few minutes.')
+
+  # Load id map to standardize player ids for players that were active before 2011
+  # and in or after 2011 meaning they appear with old gsis_ids and new ids
+  legacy_id_map <- readRDS(url("https://github.com/guga31bb/nflfastR-data/blob/master/roster-data/legacy_id_map.rds?raw=true"))
+
+  # drop columns that clean_pbp is about to create, in case they already exist
+  pbp <- pbp %>% dplyr::select(-tidyselect::one_of(drop.cols))
+
   r <- pbp %>%
     dplyr::mutate(
       #get rid of extraneous spaces that mess with player name finding
@@ -91,7 +102,8 @@ clean_pbp <- function(pbp) {
         passer == "R.Griffin" ~ "R.Griffin III",
         passer == "Randel El" ~ "A.Randle El",
         passer == "Randle El" ~ "A.Randle El",
-        passer == "Van Pelt" ~ "A.Van Pelt",
+        season <= 2003 & passer == "Van Pelt" ~ "A.Van Pelt",
+        season > 2003 & passer == "Van Pelt" ~ "B.Van Pelt",
         passer == "Dom.Davis" ~ "D.Davis",
         TRUE ~ .data$passer
       ),
@@ -107,7 +119,8 @@ clean_pbp <- function(pbp) {
         rusher == "R.Griffin" ~ "R.Griffin III",
         rusher == "Randel El" ~ "A.Randle El",
         rusher == "Randle El" ~ "A.Randle El",
-        rusher == "Van Pelt" ~ "A.Van Pelt",
+        season <= 2003 & rusher == "Van Pelt" ~ "A.Van Pelt",
+        season > 2003 & rusher == "Van Pelt" ~ "B.Van Pelt",
         rusher == "Dom.Davis" ~ "D.Davis",
         TRUE ~ rusher
       ),
@@ -122,7 +135,7 @@ clean_pbp <- function(pbp) {
                        c("extra_point","field_goal","kickoff","punt"), 1, 0),
       # easy filter: play is 1 if a "normal" play (including penalties), or 0 otherwise
       # with thanks to Lee Sharpe for the code
-      play=dplyr::if_else(!is.na(.data$epa) & !is.na(.data$posteam) &
+      play = dplyr::if_else(!is.na(.data$epa) & !is.na(.data$posteam) &
                             .data$desc != "*** play under review ***" &
                             substr(.data$desc,1,8) != "Timeout " &
                             .data$play_type %in% c("no_play","pass","run"),1,0)
@@ -133,7 +146,8 @@ clean_pbp <- function(pbp) {
       "side_of_field", "forced_fumble_player_1_team", "forced_fumble_player_2_team",
       "solo_tackle_1_team", "solo_tackle_2_team",
       "assist_tackle_1_team", "assist_tackle_2_team", "assist_tackle_3_team", "assist_tackle_4_team",
-      "fumbled_1_team", "fumbled_2_team", "fumble_recovery_1_team", "fumble_recovery_2_team"
+      "fumbled_1_team", "fumbled_2_team", "fumble_recovery_1_team", "fumble_recovery_2_team",
+      "yrdln", "end_yard_line", "drive_start_yard_line", "drive_end_yard_line"
       ), team_name_fn) %>%
 
     #Seb's stuff for fixing player ids
@@ -162,6 +176,9 @@ clean_pbp <- function(pbp) {
       name = dplyr::if_else(!is.na(.data$passer), .data$passer, .data$rusher),
       id = dplyr::if_else(!is.na(.data$passer_id), .data$passer_id, .data$rusher_id)
     ) %>%
+    dplyr::mutate_at(
+      dplyr::vars(.data$passer_id, .data$rusher_id, .data$receiver_id, .data$id, ends_with("player_id")),
+      update_ids, legacy_id_map) %>%
     dplyr::arrange(.data$index) %>%
     dplyr::select(-"index")
 
@@ -181,29 +198,56 @@ receiver_finder <- "(?<=((to)|(for))\\s[:digit:]{0,2}\\-{0,1})"
 # weird play finder
 abnormal_play <- "(Lateral)|(lateral)|(pitches to)|(Direct snap to)|(New quarterback for)|(Aborted)|(backwards pass)|(Pass back to)|(Flea-flicker)"
 
+# These columns are being generated by clean_pbp and the function tries to drop
+# them in case it is being used on a pbp dataset where the columns already exist
+drop.cols <- c(
+  "success", "passer", "rusher", "receiver", "pass", "rush", "special",
+  "first_down", "play", "passer_id", "rusher_id", "receiver_id", "name", "id"
+)
+
 # custom mode function from https://stackoverflow.com/questions/2547402/is-there-a-built-in-function-for-finding-the-mode/8189441
 # Returns the most frequent value of `x`.
 #   x:     vector whose mode is wanted
 #   na.rm: if TRUE (the default), NA values are dropped before counting;
 #          if FALSE, NA is counted like any other value
 # Ties go to the value that occurs first in `x`: unique() keeps
 # first-occurrence order and which.max() returns the first maximum.
 custom_mode <- function(x, na.rm = TRUE) {
   if(na.rm){x <- x[!is.na(x)]}
   # distinct values, in order of first appearance
   ux <- unique(x)
   # match() maps each element to its position in ux, tabulate() counts the
   # occurrences per position, which.max() picks the most frequent position
   return(ux[which.max(tabulate(match(x, ux)))])
 }
 
-#just a function to help with standardizing team abbreviations used in clean_pbp()
+# fixes team names on columns with yard line
+# example: 'SD 49' --> 'LAC 49'
+# thanks to awgymer for the contribution:
+# https://github.com/mrcaseb/nflfastR/issues/29#issuecomment-654592195
 team_name_fn <- function(var) {
-  dplyr::case_when(
-    var %in% "JAC" ~ "JAX",
-    var %in% "STL" ~ "LA",
-    var %in% "SL" ~ "LA",
-    var %in% "ARZ" ~ "ARI",
-    var %in% "BLT" ~ "BAL",
-    var %in% "CLV" ~ "CLE",
-    var %in% "HST" ~ "HOU",
-    var %in% "SD" ~ "LAC",
-    var %in% "OAK" ~ "LV",
-    TRUE ~ var
+  stringr::str_replace_all(
+    var,
+    c(
+      "JAC" = "JAX",
+      "STL" = "LA",
+      "SL" = "LA",
+      "ARZ" = "ARI",
+      "BLT" = "BAL",
+      "CLV" = "CLE",
+      "HST" = "HOU",
+      "SD" = "LAC",
+      "OAK" = "LV"
+    )
   )
 }
 
+#' @importFrom tibble tibble
+#' @importFrom rlang .data
+#' @importFrom dplyr left_join mutate case_when
+update_ids <- function(var, id_map) { # var: vector of player ids; id_map: data frame with columns gsis_id and new_id
+  join <- tibble::tibble(id = var) %>% # one row per element of var
+    dplyr::left_join(id_map, by = c("id" = "gsis_id")) %>% # look up each id among id_map$gsis_id
+    dplyr::mutate(
+      out_id = dplyr::case_when(
+        is.na(.data$new_id) ~ .data$id, # no new id available: keep the id unchanged
+        TRUE ~ .data$new_id # otherwise replace it with the mapped new id
+      )
+    )
+  return(join$out_id) # NOTE(review): has the same length as var only if gsis_id is unique in id_map - confirm
+}
+
 #' Compute QB epa
 #'
 #' @param d is a Data frame of play-by-play data scraped using \code{\link{fast_scraper}}.
@@ -213,8 +257,12 @@ team_name_fn <- function(var) {
 #' @export
 #' @import dplyr
 #' @importFrom rlang .data
+#' @importFrom tidyselect one_of
 add_qb_epa <- function(d) {
 
+  # drop any existing qb_epa column (this function creates it)
+  d <- d %>% dplyr::select(-tidyselect::one_of("qb_epa"))
+
   fumbles_df <- d %>%
     dplyr::filter(.data$complete_pass == 1 & .data$fumble_lost == 1 & !is.na(.data$epa) & !is.na(.data$down)) %>%
     dplyr::mutate(

diff --git a/R/helper_scrape_nfl.R b/R/helper_scrape_nfl.R
@@ -229,6 +229,15 @@ get_pbp_nfl <- function(id, dir = NULL) {
         ) %>%
         dplyr::mutate_all(dplyr::na_if, "")
 
+      # nfl didn't fill in first downs on this game
+      if (id == '2018_01_ATL_PHI') {
+        combined <- combined %>%
+          dplyr::mutate(
+            first_down_pass = dplyr::if_else(.data$pass_attempt == 1 & .data$first_down == 1, 1, .data$first_down_pass),
+            first_down_rush = dplyr::if_else(.data$rush_attempt == 1 & .data$first_down == 1, 1, .data$first_down_rush)
+          )
+      }
+
     },
     error = function(e) {
       message("The following error has occured:")

diff --git a/man/clean_pbp.Rd b/man/clean_pbp.Rd
diff --git a/nflfastR_2.1.0.pdf → nflfastR_2.1.1.pdf b/nflfastR_2.1.0.pdf → nflfastR_2.1.1.pdf
diff --git a/vignettes/examples.Rmd b/vignettes/examples.Rmd
@@ -200,7 +200,7 @@ nflfastR::calculate_win_probability(data) %>%
 
 Not surprisingly, `vegas_wp` increases with the amount by which a team was favored coming into the game. Weirdly, the model thinks home teams are more likely to win even when the spread is 0. I'm not sure how much to believe the model on that one, but leaving `home` in the model did make the model better at out-of-sample predictions, so who knows.
 
-# Example 8: Using the build-in database function
+# Example 8: Using the built-in database function
 
 If you're comfortable using `dplyr` functions to manipulate and tidy data, you're ready to use a database. Why should you use a database?