Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scraped f*** cap hit #2

Merged
merged 1 commit into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions extractors/e_capwages.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
## This script loads the player_id / capwages_id pairs from warehouse/player_capwages_ids_final.rds and scrapes the 2024-25 cap hit
## of each player from https://capwages.com/. It then saves the result to warehouse/players_caphits.rds, which contains three columns:
## 1. player_id (NHL player id)
## 2. capwages_id (capwages.com player slug)
## 3. cap_hit

# Packages ---------------------------------------------------------------
library(dplyr)
library(rvest)
options(scipen = 999)

# Load capwages ids --------------------------------------------------------------
df_capwages_ids <- readRDS("data/warehouse/player_capwages_ids_final.rds")

# Function to scrape cap hit ---------------------------------------------
#' Scrape a player's cap hit from capwages.com
#'
#' Downloads `https://capwages.com/players/<player_id>`, reads the JSON-LD
#' payload(s) embedded in the page and returns the value stored under
#' `mainEntity$contract$capHit` as a number.
#'
#' @param player_id Capwages identifier of the player (last URL segment).
#' @return The cap hit as a numeric scalar, or `NA` (with a warning) when the
#'   page exposes no usable cap hit. Errors raised by `rvest::read_html()`
#'   (network / HTTP failures) are not caught here so the caller can retry.
get_cap_hit <- function(player_id) {
  url <- paste0("https://capwages.com/players/", player_id)
  page <- rvest::read_html(url)

  # A page may embed several <script type="application/ld+json"> blocks;
  # each one is parsed separately because fromJSON() expects a single document.
  json_blocks <- rvest::html_text(
    rvest::html_nodes(page, 'script[type="application/ld+json"]')
  )
  # Replace HTML entities manually
  json_blocks <- gsub("&quot;", '"', json_blocks)

  cap_hit_text <- NULL
  for (block in json_blocks) {
    # A block that is not valid JSON or lacks the expected path is skipped
    # instead of raising, otherwise the caller would retry a deterministic
    # failure.
    candidate <- tryCatch(
      jsonlite::fromJSON(block)$mainEntity$contract$capHit,
      error = function(e) NULL
    )
    if (length(candidate) > 0) {
      cap_hit_text <- candidate[[1]]
      break
    }
  }

  # The length/NULL check must come first: is.na(NULL) is logical(0), which is
  # an error inside `||` on R >= 4.3.
  if (is.null(cap_hit_text) || length(cap_hit_text) != 1 ||
      is.na(cap_hit_text)) {
    warning(paste("Cap hit text is missing for player:", player_id))
    return(NA)
  }

  if (is.numeric(cap_hit_text)) {
    # Already a number: avoid a round trip through as.character(), whose
    # output depends on options(scipen).
    cap_hit <- as.numeric(cap_hit_text)
  } else {
    # Keep digits only (drops currency symbols and thousands separators)
    cap_hit <- as.numeric(gsub("[^0-9]", "", cap_hit_text))
  }

  if (is.na(cap_hit)) {
    warning(paste("Cap hit value could not be converted for player:", player_id))
  }
  cap_hit
}


# Loop through all players -----------------------------------------------
# For every row of df_capwages_ids, scrape the cap hit (with a bounded number
# of retries) and collect one result row per player. The rows are bound
# together once at the end and saved to the warehouse.

max_attempts <- 5   # attempts per player before giving up and storing NA
retry_wait_sec <- 5 # pause between two attempts for the same player
n_players <- nrow(df_capwages_ids)
cap_hit_rows <- vector("list", n_players)

for (i in seq_len(n_players)) {
  player_idi <- df_capwages_ids$capwages_id[i]
  message(player_idi, " ", round(i / n_players * 100, 1), "%")

  # Stays NA when every attempt fails
  cap_hit <- NA
  for (attempt in seq_len(max_attempts)) {
    # The value is wrapped in a list so that a legitimate NA result can be
    # told apart from a failed request (NULL).
    fetched <- tryCatch(
      list(value = get_cap_hit(player_idi)),
      error = function(e) {
        message("Error fetching data for player: ", player_idi)
        NULL
      }
    )
    if (!is.null(fetched)) {
      cap_hit <- fetched$value
      message(" ", cap_hit)
      break
    }
    if (attempt < max_attempts) {
      message("Sleeping for 5 seconds before retrying...")
      Sys.sleep(retry_wait_sec)
    } else {
      message("Giving up on player: ", player_idi, " after ", max_attempts,
              " attempts")
    }
  }

  # One result row for this player
  cap_hit_rows[[i]] <- data.frame(
    player_id = df_capwages_ids$player_id[i],
    capwages_id = player_idi,
    cap_hit = cap_hit
  )

  # Be gentle with the remote server
  Sys.sleep(0.5)
}

if (n_players > 0) {
  df_cap_hit <- do.call(rbind, cap_hit_rows)
} else {
  # Empty input: still write a well-formed, zero-row table
  df_cap_hit <- data.frame(
    player_id = df_capwages_ids$player_id[0],
    capwages_id = df_capwages_ids$capwages_id[0],
    cap_hit = numeric(0)
  )
}

saveRDS(df_cap_hit, "data/warehouse/players_caphits.rds")
173 changes: 173 additions & 0 deletions loaders/l_create_puck_pedia_ids.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# This script creates the warehouse table *player_capwages_ids*
## It loads a list of player_ids and their infos.
## It checks if the players are already in *player_capwages_ids*
## For the players that are not in *player_capwages_ids*, it:
#### - Tests different possible capwages_ids
#### - Keeps the one that works
#### - Creates a dataframe with the result
#### - Adds the dataframe to the existing *player_capwages_ids*
#### - The ones that did not work are stored with an NA value

# Packages ---------------------------------------------------------------
library(dplyr)
library(gemini.R)

# Data -------------------------------------------------------------------
all_players <- readRDS("data/marts/projections/final_points_predictions.RDS")$player_id

## Check if players are already in the warehouse table --------------------

### Load warehouse table
# On a first run the warehouse table does not exist yet. In that case start
# from an empty table so that every player is extracted. (The previous
# tryCatch could never take its fallback branch: `%in%` does not raise, and
# readRDS() failed before reaching it.)
capwages_ids_path <- "data/warehouse/player_capwages_ids.rds"

if (file.exists(capwages_ids_path)) {
  df_player_capwages_ids <- readRDS(capwages_ids_path)
  ##### Save a backup
  saveRDS(df_player_capwages_ids, "data/warehouse/player_capwages_ids_backup.rds")
} else {
  df_player_capwages_ids <- data.frame(
    player_id = all_players[0], # zero-length, same type as the incoming ids
    capwages_id = character(0),
    stringsAsFactors = FALSE
  )
}

# Players not yet present in the warehouse table (each id only once)
players_to_extract <- unique(
  all_players[!(all_players %in% df_player_capwages_ids$player_id)]
)

# Create functions to create different possible variants of name ----------------

# Register the Gemini API key for this session; the key is read from the
# GEMINI_API_KEY2 environment variable (Sys.getenv() yields "" when unset).
gemini.R::setAPI(api_key = Sys.getenv("GEMINI_API_KEY2"))

# Build the Gemini prompt asking for URL-slug variants of a player's name.
# The reply is requested in the form c('slug1', 'slug2', ...).
#
# first_name, last_name: name parts inserted verbatim into the prompt.
# Returns the prompt as a character string.
generate_gemini_prompt <- function(first_name, last_name) {
  # Greeting and task description
  context_section <- paste0(
    c(
      "Hello, glad to have you with me.\n",
      "Context:\n",
      "I want to generate URL-friendly name variants, also known as slugs, based on first and last names. ",
      "These slugs will be used in URLs and should be formatted in lowercase, without accents, and follow certain rules.\n\n"
    ),
    collapse = ""
  )

  # Slug-formatting rules
  rules_section <- paste0(
    c(
      "Rules:\n",
      "0. First and last names are ALWAYS separated by a hyphen.\n",
      "1. Spaces in first and last names should be replaced with hyphens to make the name URL-friendly.\n",
      "2. If there are initials in the first or last name (e.g., 'T.J.' or 'A.B.'), create one variant where the dots are replaced with hyphens (e.g., 'T.J.' becomes 't-j') and another variant where the dots are removed (e.g., 'T.J.' becomes 'tj').\n",
      "3. Accents should be removed from names to make them ASCII-compliant for URLs. Additionally, umlauts and other special characters should be converted to their English equivalents (e.g., 'Stützle' becomes 'stuetzle').\n",
      "4. If a first or last name contains hyphens, they should be preserved in the slug. Generate a variant with the hyphens intact, and another where the parts are concatenated without the hyphens (e.g., 'Ekman-Larsson' becomes 'ekman-larsson' and 'ekmanlarsson').\n",
      "5. For ambiguous first or last names (such as 'Alex', 'Anderson', 'Karlsson'), generate additional variants by transforming the name into common variants. For example:\n",
      " - 'Alex' could be 'Alexandre' or 'Alexander'.\n",
      " - 'Anderson' could also be 'Andersen' or 'Andersson'.\n",
      " - 'Karlsson' could be 'Carlson', 'Karlson', or 'Carlsson'.\n",
      " - **'Josh' could be expanded to 'Joshua'.**\n",
      " - Jake could be expanded to Jacob, Jakub, etc.\n",
      " This rule should apply to any ambiguous name, not just the examples. Really put emphasis on using multiple different variant of first and last names.\n",
      " Don't be scared to use variants that are different from the original name, but it is really important **that the variant is a credible name in real life**.\n\n"
    ),
    collapse = ""
  )

  # Output-format instructions placed before and after the name itself
  request_header <- paste0(
    "Please return only a valid R vector with the variants. Do not return any code or additional explanations.\n\n",
    "Now, generate between 2 and 10 URL-friendly slugs for the following name:\n"
  )
  request_footer <- paste0(
    "Return the slugs as a valid R vector in this form: c('slug1', 'slug2', ...).\n\n",
    "Put the most credible names first.\n",
    "Don't forget that first and last names are ALWAYS separated by a hyphen. Good luck!"
  )

  paste0(
    context_section,
    rules_section,
    request_header,
    "First Name: ", first_name, "\n",
    "Last Name: ", last_name, "\n",
    request_footer
  )
}

## Clean the output from gemini
# Extract the slug strings from a Gemini reply such as
# "```r\nc('nate-smith', 'nathan-smith')\n```".
#
# The reply is untrusted model output, so it is NOT evaluated as R code.
# Instead, every single- or double-quoted literal found in the text is taken
# as one slug.
#
# output: character reply from Gemini (NULL / empty is tolerated).
# Returns a character vector of unique, non-empty slugs in order of
# appearance; character(0) (with a warning) when none is found.
clean_and_eval_gemini_output <- function(output) {
  if (is.null(output) || length(output) == 0) {
    return(character(0))
  }
  text <- paste(as.character(output), collapse = "\n")

  # Quoted literals: '...' or "..."
  quoted <- regmatches(text, gregexpr("'[^']*'|\"[^\"]*\"", text))[[1]]
  # Strip the surrounding quotes and stray whitespace
  slugs <- trimws(substr(quoted, 2L, nchar(quoted) - 1L))
  slugs <- unique(slugs[nzchar(slugs)])

  if (length(slugs) == 0) {
    warning("No slug could be extracted from the Gemini output", call. = FALSE)
  }
  slugs
}

# Example usage
# Manual smoke test of the prompt / parsing helpers. It performs a live
# Gemini request, so it only runs in an interactive session; batch runs of
# this loader skip it (no extra API call, no abort if the request fails).
if (interactive()) {
  first_name <- "Nate"
  last_name <- "Smith"
  prompt <- generate_gemini_prompt(first_name, last_name)

  test <- gemini.R::gemini(
    prompt = prompt,
    maxOutputTokens = 10000
  )

  print(clean_and_eval_gemini_output(test))
}

## Load df_player_infos and only keep players to extract ------------------
df_player_infos <- readRDS("data/warehouse/player_infos.rds") |>
  filter(player_id %in% players_to_extract)

# Loop through the player_ids and check if an url is found ---------------

max_retries <- 5 # Maximum number of Gemini attempts for each player

# Ask Gemini for a reply to `prompt`, retrying on error or empty reply.
# Each attempt is limited to `timeout_sec` seconds of elapsed time; the limit
# is always cleared again before sleeping or returning.
# Returns the raw Gemini output, or NULL when all attempts failed.
request_gemini_output <- function(prompt, max_retries = 5, timeout_sec = 10,
                                  wait_sec = 10) {
  for (attempt in seq_len(max_retries)) {
    reply <- tryCatch(
      {
        setTimeLimit(elapsed = timeout_sec)
        suppressMessages(
          gemini.R::gemini(prompt = prompt, maxOutputTokens = 10000)
        )
      },
      error = function(e) NULL,
      finally = setTimeLimit() # reset the time limit after every attempt
    )
    if (!is.null(reply)) {
      return(reply)
    }
    if (attempt < max_retries) {
      Sys.sleep(wait_sec)
    }
  }
  NULL
}

# Return the first slug whose capwages.com player page answers with HTTP 200,
# or NA when none does. Request errors and timeouts count as "not found".
find_capwages_id <- function(variants) {
  variants <- variants[!is.na(variants) & nzchar(variants)]
  for (slug in variants) {
    url <- paste0("https://capwages.com/players/", slug)
    response <- tryCatch(
      # NOTE(review): ssl_verifypeer = FALSE disables TLS certificate
      # verification. Kept as in the original; confirm it is still needed.
      httr::GET(url, httr::timeout(10), httr::config(ssl_verifypeer = FALSE)),
      error = function(e) NULL
    )
    if (!is.null(response) && response$status_code == 200) {
      return(slug)
    }
  }
  NA_character_
}

n_to_extract <- length(players_to_extract)

for (index in seq_len(n_to_extract)) {
  i <- players_to_extract[index]
  info_rows <- which(df_player_infos$player_id == i)
  first_name <- df_player_infos$first_name[info_rows][1]
  last_name <- df_player_infos$last_name[info_rows][1]
  has_name <- length(first_name) == 1 && length(last_name) == 1 &&
    !is.na(first_name) && !is.na(last_name)

  # Players without a usable name get no candidate slug and are stored as NA
  variants <- character(0)
  if (has_name) {
    prompt <- generate_gemini_prompt(first_name, last_name)
    gemini_output <- request_gemini_output(prompt, max_retries = max_retries)

    if (!is.null(gemini_output)) {
      # A malformed reply must not abort the whole run
      variants <- tryCatch(
        clean_and_eval_gemini_output(gemini_output),
        error = function(e) character(0)
      )
    }
    if (!is.character(variants) || length(variants) == 0) {
      # Fallback when Gemini gave nothing usable: plain "first-last" slug
      variants <- tolower(
        gsub("\\s+", "-", paste0(first_name, "-", last_name))
      )
    }
  }

  capwages_id <- find_capwages_id(variants)

  # Append the result to the warehouse table
  df_player_capwages_ids <- rbind(
    df_player_capwages_ids,
    data.frame(player_id = i, capwages_id = capwages_id, stringsAsFactors = FALSE)
  )

  # Progress, rewritten in place on the same console line
  progress <- round((index / n_to_extract) * 100, 2)
  cat("\rProgression :", progress, "%")

  # Checkpoint after every player so an interrupted run loses nothing
  saveRDS(df_player_capwages_ids, "data/warehouse/player_capwages_ids.rds")

  # 4-second pause between players to avoid overloading the servers
  Sys.sleep(4)
}

29 changes: 29 additions & 0 deletions loaders/l_fill_in_missing_capwages_ids.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## This script takes the warehouse/player_capwages_ids and creates a csv
## used to manually input the missing capwages ids.
## This csv is then loaded to replace the ids of the missing players and saves
## the result in warehouse/player_capwages_ids_final.rds

# Fetch the NHL web API "landing" record of a player and return the
# `firstName` and `lastName` fields pasted together with a space.
#
# player_id: NHL player id inserted into the request URL.
get_name <- function(player_id) {
  landing_url <- paste0("https://api-web.nhle.com/v1/player/", player_id, "/landing")
  landing <- httr::content(httr::GET(landing_url))
  paste0(landing$firstName, " ", landing$lastName)
}

# Load data --------------------------------------------------------------
data <- readRDS("data/warehouse/player_capwages_ids.rds")

# Players for which no capwages id was found automatically
df_missing_players <- data |>
  dplyr::filter(is.na(capwages_id))

# Write missing players to csv -------------------------------------------
#write.csv(df_missing_players, "data/warehouse/missing_player_capwages_ids.csv")

#################################################
### MANUALLY INPUT MISSING PLAYERS IN THE CSV ###
#################################################

# Cells left blank in the manually edited csv are read as NA (not ""), so
# players that are still unresolved are removed by drop_na() below.
df_missing_players_filled <- read.csv(
  "data/warehouse/missing_player_capwages_ids.csv",
  na.strings = c("NA", ""),
  stringsAsFactors = FALSE
)
# Keep only the warehouse columns: a csv produced by write.csv() carries an
# extra row-name column that would make rbind() fail.
df_missing_players_filled <- df_missing_players_filled[, names(data), drop = FALSE]

# Rows that still have no capwages id (from either source) are dropped
output <- rbind(data, df_missing_players_filled) |>
  tidyr::drop_na()

saveRDS(output, "data/warehouse/player_capwages_ids_final.rds")
2 changes: 1 addition & 1 deletion refiners/r_train_random_forest.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ model_d <- ptspredictR::train_random_forest(

randomForest::varImpPlot(model_d)

saveRDS(model_d, "apis/points_predictor_api/models/points_d.rds")
saveRDS(model_d, "apis/points_predictor_api/models/points_d.rds")