From 57422f524323a4a8f063d0b77207ab0fb7d83fe5 Mon Sep 17 00:00:00 2001 From: hubcad25 Date: Wed, 2 Oct 2024 16:11:54 -0400 Subject: [PATCH] scraped f*** cap hit --- extractors/e_capwages.R | 79 +++++++++++ loaders/l_create_puck_pedia_ids.R | 173 +++++++++++++++++++++++ loaders/l_fill_in_missing_capwages_ids.R | 29 ++++ refiners/r_train_random_forest.R | 2 +- 4 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 extractors/e_capwages.R create mode 100644 loaders/l_create_puck_pedia_ids.R create mode 100644 loaders/l_fill_in_missing_capwages_ids.R diff --git a/extractors/e_capwages.R b/extractors/e_capwages.R new file mode 100644 index 0000000..e8cee47 --- /dev/null +++ b/extractors/e_capwages.R @@ -0,0 +1,79 @@ +## This script loads the player id / capwages id pairs from data/warehouse/player_capwages_ids_final.rds and extracts the 2024-25 cap hit +## for each player from https://capwages.com/. It then saves the result into warehouse/players_caphits.rds, which contains two columns: +## 1. player_id (NHL player id) +## 2. cap hit + +# Packages --------------------------------------------------------------- +library(dplyr) +library(rvest) +options(scipen = 999) + +# Load capwages ids -------------------------------------------------------------- +df_capwages_ids <- readRDS("data/warehouse/player_capwages_ids_final.rds") + +# Function to scrape cap hit --------------------------------------------- +get_cap_hit <- function(player_id) { + # Build the URL + url <- paste0("https://capwages.com/players/", player_id) + # Read the page content + page <- rvest::read_html(url) + # Extraire le contenu JSON à partir de la balise