Merge pull request #2 from hubcad25/test_random_forest_predict_points…
…_directly scraped f*** cap hit
Showing 4 changed files with 282 additions and 1 deletion.
@@ -0,0 +1,79 @@
## This script loads the player_id / capwages_id combinations from warehouse/player_capwages_ids_final and extracts the 2024-25 cap hit
## for each player from https://capwages.com/. It then saves the result in warehouse/players_caphits.rds, which contains three columns:
## 1. player_id (NHL player id)
## 2. capwages_id
## 3. cap_hit

# Packages ---------------------------------------------------------------
library(dplyr)
library(rvest)
options(scipen = 999)

# Load capwages ids --------------------------------------------------------------
df_capwages_ids <- readRDS("data/warehouse/player_capwages_ids_final.rds")

# Function to scrape cap hit ---------------------------------------------
get_cap_hit <- function(player_id) {
  # Build the URL
  url <- paste0("https://capwages.com/players/", player_id)
  # Read the page content
  page <- rvest::read_html(url)
  # Extract the JSON content from the <script type="application/ld+json"> tag
  json_data <- page %>%
    html_nodes('script[type="application/ld+json"]') %>%
    html_text()
  # Replace HTML entities manually with gsub
  decoded_json <- gsub("&quot;", '"', json_data)
  # Convert the JSON text into an R list
  parsed_data <- jsonlite::fromJSON(decoded_json)
  # Extract the cap hit text
  cap_hit_text <- parsed_data$mainEntity$contract$capHit
  # Check whether cap_hit_text is missing (is.null() first, since is.na(NULL) has length zero)
  if (is.null(cap_hit_text) || is.na(cap_hit_text)) {
    warning(paste("Cap hit text is missing for player:", player_id))
    return(NA)
  }
  # Clean cap_hit_text to keep only the digits
  cap_hit <- as.numeric(gsub("[^0-9]", "", cap_hit_text))
  # Check whether cap_hit is missing
  if (is.na(cap_hit)) {
    warning(paste("Cap hit value could not be converted for player:", player_id))
  }
  # Return cap_hit
  return(cap_hit)
}
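
# Optional usage sketch: "connor-mcdavid" below is an assumed capwages slug, shown
# purely for illustration; swap in a real capwages_id from df_capwages_ids before running.
# get_cap_hit("connor-mcdavid")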

# Loop through all players -----------------------------------------------
for (i in 1:nrow(df_capwages_ids)) {
  player_idi <- df_capwages_ids$capwages_id[i]
  message(player_idi, " ", round(i / nrow(df_capwages_ids) * 100, 1), "%")
  # Initialize cap_hit to NA
  cap_hit <- NA
  # Retry loop in case of an error
  repeat {
    result <- tryCatch({
      cap_hit <- get_cap_hit(player_idi)
      message(" ", cap_hit)
      break # If the request succeeds, exit the loop
    }, error = function(e) {
      message("Error fetching data for player: ", player_idi)
      message("Sleeping for 5 seconds before retrying...")
      Sys.sleep(5)
    })
  }
  # Build a one-row data frame for this player
  df_cap_hiti <- data.frame(
    player_id = df_capwages_ids$player_id[i],
    capwages_id = player_idi,
    cap_hit = cap_hit
  )
  # Append this row to the results dataframe
  if (i == 1) {
    df_cap_hit <- df_cap_hiti
  } else {
    df_cap_hit <- rbind(df_cap_hit, df_cap_hiti)
  }
  Sys.sleep(0.5)
}

saveRDS(df_cap_hit, "data/warehouse/players_caphits.rds")
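
# Optional sanity check: distribution of the scraped cap hits and how many players
# ended up with a missing value.
# summary(df_cap_hit$cap_hit)
# sum(is.na(df_cap_hit$cap_hit))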
@@ -0,0 +1,173 @@
# This script creates the warehouse table *player_capwages_ids*
## It loads a list of player_ids and their info.
## It checks whether each player is already in *player_capwages_ids*.
## For the players that are not in *player_capwages_ids*, it:
#### - Tests different possible capwages_ids
#### - Keeps the one that works
#### - Creates a dataframe with the result
#### - Adds the dataframe to the existing *player_capwages_ids*
#### - Players for which no id is found are stored with an NA value

# Packages ---------------------------------------------------------------
library(dplyr)
library(gemini.R)

# Data -------------------------------------------------------------------
all_players <- readRDS("data/marts/projections/final_points_predictions.RDS")$player_id

## Check if players are already in the warehouse table --------------------

### Load warehouse table
df_player_capwages_ids <- readRDS("data/warehouse/player_capwages_ids.rds")
##### Save a backup
saveRDS(df_player_capwages_ids, "data/warehouse/player_capwages_ids_backup.rds")

players_to_extract <- tryCatch(
  {
    # Try to keep only the players that are not already in the warehouse table
    all_players[!(all_players %in% df_player_capwages_ids$player_id)]
  },
  error = function(e) {
    # If that fails (e.g. the table does not exist yet), process every player
    all_players
  }
)

# Create functions to generate different possible variants of a name ----------------

gemini.R::setAPI(api_key = Sys.getenv("GEMINI_API_KEY2"))

# Function to generate the prompt for Gemini to return R vector style output
generate_gemini_prompt <- function(first_name, last_name) {
  # Context and rules to provide to Gemini
  prompt <- paste0(
    "Hello, glad to have you with me.\n",
    "Context:\n",
    "I want to generate URL-friendly name variants, also known as slugs, based on first and last names. ",
    "These slugs will be used in URLs and should be formatted in lowercase, without accents, and follow certain rules.\n\n",

    "Rules:\n",
    "0. First and last names are ALWAYS separated by a hyphen.\n",
    "1. Spaces in first and last names should be replaced with hyphens to make the name URL-friendly.\n",
    "2. If there are initials in the first or last name (e.g., 'T.J.' or 'A.B.'), create one variant where the dots are replaced with hyphens (e.g., 'T.J.' becomes 't-j') and another variant where the dots are removed (e.g., 'T.J.' becomes 'tj').\n",
    "3. Accents should be removed from names to make them ASCII-compliant for URLs. Additionally, umlauts and other special characters should be converted to their English equivalents (e.g., 'Stützle' becomes 'stuetzle').\n",
    "4. If a first or last name contains hyphens, they should be preserved in the slug. Generate a variant with the hyphens intact, and another where the parts are concatenated without the hyphens (e.g., 'Ekman-Larsson' becomes 'ekman-larsson' and 'ekmanlarsson').\n",
    "5. For ambiguous first or last names (such as 'Alex', 'Anderson', 'Karlsson'), generate additional variants by transforming the name into common variants. For example:\n",
    "   - 'Alex' could be 'Alexandre' or 'Alexander'.\n",
    "   - 'Anderson' could also be 'Andersen' or 'Andersson'.\n",
    "   - 'Karlsson' could be 'Carlson', 'Karlson', or 'Carlsson'.\n",
    "   - **'Josh' could be expanded to 'Joshua'.**\n",
    "   - Jake could be expanded to Jacob, Jakub, etc.\n",
    "   This rule should apply to any ambiguous name, not just the examples. Really put emphasis on using multiple different variants of first and last names.\n",
    "   Don't be scared to use variants that are different from the original name, but it is really important **that the variant is a credible name in real life**.\n\n",

    "Please return only a valid R vector with the variants. Do not return any code or additional explanations.\n\n",

    "Now, generate between 2 and 10 URL-friendly slugs for the following name:\n",
    "First Name: ", first_name, "\n",
    "Last Name: ", last_name, "\n",
    "Return the slugs as a valid R vector in this form: c('slug1', 'slug2', ...).\n\n",
    "Put the most credible names first.\n",
    "Don't forget that first and last names are ALWAYS separated by a hyphen. Good luck!"
  )
  return(prompt)
}

## Clean the output from gemini
clean_and_eval_gemini_output <- function(output) {
  # Remove the backticks and "r" from the start and end of the text
  cleaned_output <- gsub("```r\\n|```", "", output)
  # Remove any extra line breaks or leading/trailing spaces
  cleaned_output <- trimws(cleaned_output)
  output <- eval(parse(text = cleaned_output))
  return(output)
}
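
# Note: eval(parse(...)) above executes whatever text Gemini returns as R code, so this
# relies on the model respecting the "return only a valid R vector" instruction in the
# prompt; malformed output will make the call error rather than return variants.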

# Example usage
first_name <- "Nate"
last_name <- "Smith"
prompt <- generate_gemini_prompt(first_name, last_name)

test <- gemini.R::gemini(
  prompt = prompt,
  maxOutputTokens = 10000
)

clean_and_eval_gemini_output(test)

## Load df_player_infos and only keep players to extract ------------------
df_player_infos <- readRDS("data/warehouse/player_infos.rds") |>
  filter(player_id %in% players_to_extract)

# Loop through the player_ids and check if an url is found ---------------

retry_count <- 0
max_retries <- 5 # Maximum number of attempts for each player

for (index in 1:length(players_to_extract)) {
  i <- players_to_extract[index]
  first_name <- df_player_infos$first_name[df_player_infos$player_id == i]
  last_name <- df_player_infos$last_name[df_player_infos$player_id == i]
  prompt <- generate_gemini_prompt(first_name, last_name)
  # Retry several times in case of a Gemini error
  repeat {
    gemini_output <- tryCatch({
      setTimeLimit(elapsed = 10)
      suppressMessages(
        gemini.R::gemini(prompt = prompt, maxOutputTokens = 10000)
      )
    }, error = function(e) {
      if (retry_count < max_retries) {
        retry_count <- retry_count + 1
        #cat("\nGemini API error. Retrying in 10 seconds... (Attempt: ", retry_count, ")")
        setTimeLimit() # Reset the time limit before Sys.sleep()
        Sys.sleep(10)  # Wait 10 seconds before retrying
        return(NULL)
      } else {
        stop("Max retries reached for Gemini API. Moving to next player.")
      }
    }, finally = {
      setTimeLimit() # Reset the time limit after each attempt
    })

    # If Gemini returns a valid result, exit the retry loop
    if (!is.null(gemini_output)) break
    if (is.null(gemini_output)) {
      # Fallback slug while waiting for a retry (overwritten below once Gemini succeeds)
      variants <- paste0(first_name, "-", last_name)
    }
  }

  variants <- clean_and_eval_gemini_output(gemini_output)
  capwages_id <- NA # By default, no valid variant has been found

  for (j in variants) {
    url <- paste0("https://capwages.com/players/", j)

    # Use tryCatch to handle errors and add a timeout to the requests
    response <- tryCatch({
      httr::GET(url, httr::timeout(10), httr::config(ssl_verifypeer = FALSE))
    }, error = function(e) {
      return(NULL)
    })

    # If the response is non-null and the status is 200, the URL is valid
    if (!is.null(response) && response$status_code == 200) {
      capwages_id <- j
      break # Exit the loop as soon as a valid URL is found
    }
  }

  # Add the result to the dataframe
  df_player_capwages_ids <- rbind(df_player_capwages_ids, data.frame(player_id = i, capwages_id = capwages_id, stringsAsFactors = FALSE))

  # Compute the progress percentage
  progress <- round((index / length(players_to_extract)) * 100, 2)

  # Display progress, overwriting the previous line
  cat("\rProgress:", progress, "%")
  saveRDS(df_player_capwages_ids, "data/warehouse/player_capwages_ids.rds")

  # Add a short delay between requests to avoid overloading the server
  Sys.sleep(4) # 4-second delay between requests
}
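
# Optional follow-up check: how many players still have no capwages_id after the loop,
# i.e. the ones that will need the manual CSV fill step.
# sum(is.na(df_player_capwages_ids$capwages_id))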
@@ -0,0 +1,29 @@
## This script takes warehouse/player_capwages_ids and creates a csv
## used to manually input the missing capwages ids.
## This csv is then loaded to replace the ids of the missing players, and
## the result is saved in warehouse/player_capwages_ids_final.rds

get_name <- function(player_id) {
  list <- httr::content(httr::GET(paste0("https://api-web.nhle.com/v1/player/", player_id, "/landing")))
  return(paste0(list$firstName, " ", list$lastName))
}
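
# Optional usage sketch: the id below is assumed to be a valid NHL player id; the call
# hits the live NHL API landing endpoint.
# get_name(8478402)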

# Load data --------------------------------------------------------------
data <- readRDS("data/warehouse/player_capwages_ids.rds")

df_missing_players <- data |>
  dplyr::filter(is.na(capwages_id))

# Write missing players to csv -------------------------------------------
#write.csv(df_missing_players, "data/warehouse/missing_player_capwages_ids.csv")

#################################################
### MANUALLY INPUT MISSING PLAYERS IN THE CSV ###
#################################################

df_missing_players_filled <- read.csv("data/warehouse/missing_player_capwages_ids.csv")

output <- rbind(data, df_missing_players_filled) |>
  tidyr::drop_na()

saveRDS(output, "data/warehouse/player_capwages_ids_final.rds")