Push nflfastR v2.1.0

nflverse · Jul 3, 2020 · 496f647 · 496f647
1 parent 59bf0d4
commit 496f647
Show file tree

Hide file tree

Showing 35 changed files with 1,867 additions and 1,569 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -13,3 +13,8 @@
 ^vignettes/nflfastR-models\.Rmd$
 ^vignettes$
 ^\.travis\.yml$
+^man/figures/card\.png$
+^man/figures/header_github\.png$
+^man/figures/header_twitter\.png$
+^man/figures/logo\.png$
+^man/figures/nflfastR_logo_fillsize\.png$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: nflfastR
 Title: Functions to Efficiently Scrape NFL Play by Play Data
-Version: 2.0.6
+Version: 2.1.0
 Authors@R: 
     c(person(given = "Sebastian",
              family = "Carl",
@@ -45,17 +45,20 @@ Imports:
     progressr,
     purrr,
     stats,
+    rlang,
     stringr,
     tibble,
     tidyr,
     tidyselect,
     xgboost (>= 1.1)
 Suggests: 
+    DBI,
     furrr,
     future,
     rmarkdown,
+    RSQLite,
     knitr
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,14 +2,54 @@
 
 export(add_qb_epa)
 export(calculate_expected_points)
+export(calculate_win_probability)
 export(clean_pbp)
 export(fast_scraper)
 export(fast_scraper_schedules)
+export(update_db)
+import(dplyr)
+importFrom(dplyr,bind_cols)
 importFrom(dplyr,filter)
+importFrom(dplyr,first)
+importFrom(dplyr,group_by)
 importFrom(dplyr,if_else)
+importFrom(dplyr,mutate)
+importFrom(dplyr,rename)
+importFrom(dplyr,select)
+importFrom(dplyr,ungroup)
+importFrom(glue,glue)
+importFrom(httr,GET)
+importFrom(httr,HEAD)
+importFrom(httr,content)
+importFrom(janitor,clean_names)
+importFrom(jsonlite,fromJSON)
+importFrom(lubridate,ms)
+importFrom(lubridate,period_to_seconds)
 importFrom(magrittr,"%>%")
+importFrom(mgcv,predict.bam)
+importFrom(purrr,map_chr)
+importFrom(purrr,map_df)
+importFrom(purrr,map_dfr)
 importFrom(purrr,modify_at)
 importFrom(purrr,modify_if)
+importFrom(purrr,pluck)
 importFrom(purrr,prepend)
 importFrom(purrr,set_names)
+importFrom(rlang,.data)
+importFrom(stats,na.omit)
+importFrom(stats,predict)
+importFrom(stringr,str_detect)
+importFrom(stringr,str_extract)
+importFrom(stringr,str_remove_all)
+importFrom(stringr,str_replace_all)
+importFrom(stringr,str_split)
+importFrom(stringr,str_sub)
+importFrom(stringr,str_trim)
+importFrom(tibble,as_tibble)
 importFrom(tibble,as_tibble_row)
+importFrom(tibble,tibble)
+importFrom(tidyr,replace_na)
+importFrom(tidyr,unnest)
+importFrom(tidyr,unnest_wider)
+importFrom(tidyselect,matches)
+importFrom(tidyselect,one_of)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,33 @@
+# nflfastR 2.1.0
+
+### Models
+
+* Removed `week` from Expected Points models along with an update of
+`vignette("nflfastR-models")` and `vignette("examples")`
+
+### Functions
+
+* Added function `update_db()` which adds all completed games to a SQLite database
+* Added function `calculate_win_probability()` 
+* Added new examples to `vignette("examples")` demonstrating the usage of the
+above mentioned functions
+
+### Bugs
+
+* Fixed a problem with inconsistent data types of the variable
+`drive_real_start_time` pre and post 2011
+* Fixed a problem where some `game_id`s were overwritten during the play by play parsing
+* Fixed some more WP bugs on kickoffs with penalties and rare play descriptions
+
+### Miscellaneous
+
+* `fast_scraper()` now loads the raw game data from a separate raw data repo
+* Completely overhauled the entire code base to directly implement
+[tidy evaluation](https://dplyr.tidyverse.org/articles/programming.html) using 
+`.data` from the [rlang](https://rlang.r-lib.org/) package (this is a major 
+code change that takes some getting used to but we need it in preparation of 
+a future release)
+
 # nflfastR 2.0.6
 
 * Fixed a problem where defensive two point conversions were not counted

diff --git a/R/ep_calculator.R b/R/ep_calculator.R
diff --git a/R/ep_wp_calculators.R b/R/ep_wp_calculators.R
@@ -0,0 +1,154 @@
+#' Compute expected points
+#'
+#' Computes expected points for the provided plays and returns the data with
+#' the probability of each scoring event and EP added. The following columns
+#' must be present: season, home_team, posteam, roof (coded as 'outdoors',
+#' 'dome', or 'open' / 'closed' / NA for retractable roofs),
+#' half_seconds_remaining, yardline_100, down, ydstogo,
+#' posteam_timeouts_remaining, defteam_timeouts_remaining
+#'
+#' @param pbp_data Play-by-play dataset to estimate expected points for.
+#' @details Computes expected points for provided plays. Returns the data with
+#' probabilities of each scoring event and EP added. Any of the output columns
+#' listed below that already exist in \code{pbp_data} are dropped and
+#' recomputed. The following columns must be present:
+#' \itemize{
+#' \item{season}
+#' \item{home_team}
+#' \item{posteam}
+#' \item{roof (coded as 'outdoors', 'dome', or {'open' / 'closed' / NA} (retractable))}
+#' \item{half_seconds_remaining}
+#' \item{yardline_100}
+#' \item{down}
+#' \item{ydstogo}
+#' \item{posteam_timeouts_remaining}
+#' \item{defteam_timeouts_remaining}
+#' }
+#' @return The input pbp_data (without any pre-existing versions of these
+#' columns) bound together with the following columns, which are placed first:
+#' \describe{
+#' \item{ep}{expected points.}
+#' \item{no_score_prob}{probability of no more scoring this half.}
+#' \item{opp_fg_prob}{probability next score opponent field goal this half.}
+#' \item{opp_safety_prob}{probability next score opponent safety this half.}
+#' \item{opp_td_prob}{probability of next score opponent touchdown this half.}
+#' \item{fg_prob}{probability next score field goal this half.}
+#' \item{safety_prob}{probability next score safety this half.}
+#' \item{td_prob}{probability next score touchdown this half.}
+#' }
+#' @importFrom rlang .data
+#' @importFrom dplyr select mutate bind_cols
+#' @importFrom tidyselect one_of
+#' @importFrom stats predict
+#' @export
+calculate_expected_points <- function(pbp_data) {
+  # Drop stale EP outputs from the input itself, not only from the model
+  # frame: the input is bound back onto the predictions below, and leftover
+  # columns would collide with the freshly computed ones.
+  # one_of() warns about names that are not present, hence suppressWarnings().
+  pbp_data <- suppressWarnings(
+    dplyr::select(pbp_data, -one_of(drop.cols))
+  )
+
+  suppressWarnings(
+    model_data <- pbp_data %>%
+      make_model_mutations() %>%
+      ep_model_select()
+  )
+
+  # the flat prediction vector is reshaped row-wise into 7 columns
+  preds <- as.data.frame(
+    matrix(stats::predict(ep_model, as.matrix(model_data)), ncol = 7, byrow = TRUE)
+  )
+
+  colnames(preds) <- c(
+    "td_prob", "opp_td_prob", "fg_prob", "opp_fg_prob",
+    "safety_prob", "opp_safety_prob", "no_score_prob"
+  )
+
+  # EP is the probability-weighted point value of each scoring event
+  # (no_score_prob contributes 0 points)
+  preds <- preds %>%
+    dplyr::mutate(
+      ep =
+        (-3 * .data$opp_fg_prob) +
+        (-2 * .data$opp_safety_prob) +
+        (-7 * .data$opp_td_prob) +
+        (3 * .data$fg_prob) +
+        (2 * .data$safety_prob) +
+        (7 * .data$td_prob)
+    ) %>%
+    dplyr::bind_cols(pbp_data)
+
+  return(preds)
+}
+
+# Names of the columns written by calculate_expected_points(); existing
+# copies are removed from the input before new values are computed.
+drop.cols <- c(
+  "ep",
+  "td_prob",
+  "opp_td_prob",
+  "fg_prob",
+  "opp_fg_prob",
+  "safety_prob",
+  "opp_safety_prob",
+  "no_score_prob"
+)
+
+
+#' Compute win probability
+#'
+#' Computes win probability for the provided plays and returns the data with
+#' the probabilities of winning the game added. The following columns
+#' must be present: receive_2h_ko (1 if game is in 1st half and possession
+#' team will receive 2nd half kickoff, 0 otherwise), ep (expected points),
+#' score_differential, home_team, posteam, half_seconds_remaining,
+#' game_seconds_remaining, spread_line (how many points home team was favored
+#' by), down, ydstogo, posteam_timeouts_remaining, defteam_timeouts_remaining
+#'
+#' @param pbp_data Play-by-play dataset to estimate win probability for.
+#' @details Computes win probability for provided plays. Returns the data with
+#' win probabilities added. Any of the output columns listed below that
+#' already exist in \code{pbp_data} are dropped and recomputed. The following
+#' columns must be present:
+#' \itemize{
+#' \item{receive_2h_ko (1 if game is in 1st half and possession team will receive 2nd half kickoff, 0 otherwise)}
+#' \item{ep (expected points)}
+#' \item{score_differential}
+#' \item{home_team}
+#' \item{posteam}
+#' \item{half_seconds_remaining}
+#' \item{game_seconds_remaining}
+#' \item{spread_line (how many points home team was favored by)}
+#' \item{down}
+#' \item{ydstogo}
+#' \item{posteam_timeouts_remaining}
+#' \item{defteam_timeouts_remaining}
+#' }
+#' @return The input pbp_data (without any pre-existing versions of these
+#' columns) with the following columns appended to it:
+#' \describe{
+#' \item{wp}{win probability.}
+#' \item{vegas_wp}{win probability taking into account pre-game spread.}
+#' }
+#' @importFrom rlang .data
+#' @importFrom dplyr select mutate if_else rename bind_cols
+#' @importFrom tidyselect one_of
+#' @importFrom stats predict
+#' @importFrom tibble as_tibble
+#' @export
+calculate_win_probability <- function(pbp_data) {
+  # Drop stale WP outputs from the input itself, not only from the model
+  # frame: the input is bound together with the new predictions below, and
+  # leftover wp / vegas_wp columns would collide with the fresh ones.
+  # one_of() warns about names that are not present, hence suppressWarnings().
+  pbp_data <- suppressWarnings(
+    dplyr::select(pbp_data, -one_of(drop.cols.wp))
+  )
+
+  suppressWarnings(
+    model_data <- pbp_data %>%
+      dplyr::mutate(
+        home = dplyr::if_else(.data$posteam == .data$home_team, 1, 0),
+        ExpScoreDiff = .data$ep + .data$score_differential,
+        # spread_line is from the home team's view; flip the sign for the away team
+        posteam_spread = dplyr::if_else(.data$home == 1, .data$spread_line, -1 * .data$spread_line),
+        spread_time = .data$posteam_spread * log(3600 / (50 + (3600 - .data$game_seconds_remaining))),
+        # + 1 keeps the denominator non-zero when no time is left
+        ExpScoreDiff_Time_Ratio = .data$ExpScoreDiff / (.data$game_seconds_remaining + 1)
+      )
+  )
+
+  # as_tibble() on the prediction vector yields a single column named "value"
+  wp <- get_preds_wp(model_data) %>%
+    tibble::as_tibble() %>%
+    dplyr::rename(wp = "value")
+  wp_spread <- get_preds_wp_spread(model_data) %>%
+    tibble::as_tibble() %>%
+    dplyr::rename(vegas_wp = "value")
+
+  preds <- dplyr::bind_cols(
+    pbp_data,
+    wp,
+    wp_spread
+  )
+
+  return(preds)
+}
+
+# Names of the columns written by calculate_win_probability(); existing
+# copies are removed before new values are computed.
+drop.cols.wp <- c("wp", "vegas_wp")