Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test random forest predict points directly #1

Merged
merged 2 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
261 changes: 27 additions & 234 deletions refiners/r_predict_random_forest_on_projections.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,255 +16,48 @@ df_projections_f <- df_projections |>
df_projections_d <- df_projections |>
filter(position == "D")

model_goals_f <- readRDS("apis/points_predictor_api/models/goals_f.rds")
model_assists_f <- readRDS("apis/points_predictor_api/models/assists_f.rds")
model_goals_d <- readRDS("apis/points_predictor_api/models/goals_d.rds")
model_assists_d <- readRDS("apis/points_predictor_api/models/assists_d.rds")
model_f <- readRDS("apis/points_predictor_api/models/points_f.rds")
model_d <- readRDS("apis/points_predictor_api/models/points_d.rds")

# Create function to scale predicted values on a credible distribution ---------------------------------------

## The models underestimate by a lot. We will manually adjust them
scale_value <- function(value, old_max = 35, new_max = 65, exponent = 2) {
# Applique une transformation non linéaire en utilisant un exposant pour donner plus de poids aux grandes valeurs
scaled_value <- (value / old_max)^exponent * new_max
return(scaled_value)
}

## We need to find the optimal exponent to use in the scale_value function.
find_optimal_exponent <- function(
exponents_to_test = seq(1, 20, by = 0.1),
unscaled_values,
real_distribution,
old_max,
new_max
) {
centiles_to_check <- c(0, 0.05, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 0.95, 0.975, 0.99, 0.995, 0.9995, 1)
real_centiles <- quantile(real_distribution, centiles_to_check)
# Définir des poids plus élevés pour les centiles du milieu
weights <- c(0.5, 0.8, 1, 2, 2.5, 3, 2.5, 2, 1.5, 0.8, 0.6, 0.4, 0.3, 0.2, 0.1)
best_exponent <- NULL
min_diff <- Inf
for (exponent in exponents_to_test) {
# Scaling les valeurs avec l'exposant actuel
scaled_values <- (unscaled_values / old_max)^exponent * new_max
# Calcul des centiles prévus
predicted_centiles <- quantile(scaled_values, centiles_to_check)
# Différence absolue totale entre les centiles réels et prévus
diff <- sum(weights * abs(predicted_centiles - real_centiles))
# Vérifie si cet exposant donne une meilleure approximation
if (diff < min_diff) {
min_diff <- diff
best_exponent <- exponent
}
}
return(best_exponent)
}

# Predict models and scale -----------------------------------------------


predict_adjusted_points <- function(
model_goals,
model_assists,
df_projections,
real_data_goals,
real_data_assists,
real_data_points,
goals_max = 65,
assists_max = 95
) {

# 1. Prédictions des goals
df_projections$goals <- predict(model_goals, newdata = df_projections)

# Scale des goals
df_projections$goals <- scale_value(
df_projections$goals,
old_max = max(df_projections$goals),
new_max = goals_max,
exponent = find_optimal_exponent(
unscaled_values = df_projections$goals,
real_distribution = real_data_goals,
old_max = max(df_projections$goals),
new_max = goals_max
)
)

# 2. Prédictions des assists
df_projections$assists <- predict(model_assists, newdata = df_projections)

# Scale des assists
df_projections$assists <- scale_value(
df_projections$assists,
old_max = max(df_projections$assists),
new_max = assists_max,
exponent = find_optimal_exponent(
unscaled_values = df_projections$assists,
real_distribution = real_data_assists,
old_max = max(df_projections$assists),
new_max = assists_max
)
)

# 3. Calcul des points basés sur goals + assists
df_projections$points <- df_projections$goals + df_projections$assists

# 4. Rescaling des points pour correspondre aux centiles des vraies données de points
centiles_to_check <- c(0, 0.05, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 0.95, 0.975, 0.99, 0.995, 0.9995, 1)

projected_points_centiles <- quantile(df_projections$points, centiles_to_check)
real_points_centiles <- quantile(real_data_points, centiles_to_check)

# Fonction de rescaling des points
rescale_points_to_real_distribution <- function(predicted_points, real_points_centiles, projected_points_centiles) {
return(approx(x = projected_points_centiles, y = real_points_centiles, xout = predicted_points)$y)
}

# Appliquer le rescaling aux points
df_projections$points_rescaled <- rescale_points_to_real_distribution(
df_projections$points,
real_points_centiles,
projected_points_centiles
)

# 5. Réajuster les goals et assists pour correspondre aux points rescalés
df_projections$goals_weight <- df_projections$goals / df_projections$points
df_projections$assists_weight <- df_projections$assists / df_projections$points

df_projections$goals_adjusted <- df_projections$goals_weight * df_projections$points_rescaled
df_projections$assists_adjusted <- df_projections$assists_weight * df_projections$points_rescaled

# 6. Remplacer les colonnes originales par les valeurs ajustées
df_projections$goals <- df_projections$goals_adjusted
df_projections$assists <- df_projections$assists_adjusted
df_projections$points <- df_projections$goals + df_projections$assists

df_projections <- df_projections |>
select(-goals_weight, -assists_weight, -goals_adjusted, -assists_adjusted, -points_rescaled)
# Retourner le dataframe mis à jour
return(df_projections)
}

# Exemple d'utilisation
df_projections_f_adjusted <- predict_adjusted_points(
model_goals = model_goals_f,
model_assists = model_assists_f,
df_projections = df_projections_f,
real_data_goals = real_data_f$goals,
real_data_assists = real_data_f$assists,
real_data_points = real_data_f$points
)





# END OF FUNCTION TEST ---------------------------------------------------

## Forwards
df_projections_f$goals <- predict(model_goals_f, newdata = df_projections_f)
df_projections_f$goals <- scale_value(
df_projections_f$goals,
old_max = max(df_projections_f$goals),
new_max = 65,
exponent = find_optimal_exponent(
unscaled_values = df_projections_f$goals,
real_distribution = real_data_f$goals,
old_max = max(df_projections_f$goals),
new_max = 65
)
)

hist(df_projections_f$goals)
hist(real_data_f$goals)

df_projections_f$assists <- predict(model_assists_f, newdata = df_projections_f)
df_projections_f$assists <- scale_value(
df_projections_f$assists,
old_max = max(df_projections_f$assists),
new_max = 95,
exponent = find_optimal_exponent(
unscaled_values = df_projections_f$assists,
real_distribution = real_data_f$assists,
old_max = max(df_projections_f$assists),
new_max = 95
)
)

hist(df_projections_f$assists)
hist(real_data_f$assists)

df_projections_f$points <- df_projections_f$goals + df_projections_f$assists

hist(df_projections_f$points)

centiles_to_check <- c(0, 0.05, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 0.95, 0.975, 0.99, 0.995, 0.9995, 1)
projected_points_centiles <- quantile(df_projections_f$points, centiles_to_check)
real_points_centiles <- quantile(real_data_f$points, centiles_to_check)

# Fonction pour rescaler les points prédits en fonction des centiles réels
rescale_points_to_real_distribution <- function(predicted_points, real_points_centiles, projected_points_centiles) {
return(approx(x = projected_points_centiles, y = real_points_centiles, xout = predicted_points)$y)
}

# Appliquer la fonction de rescaling aux points projetés
df_projections_f$points_rescaled <- rescale_points_to_real_distribution(df_projections_f$points, real_points_centiles, projected_points_centiles)

# Calcul des proportions de "goals" et "assists" dans les points originaux
df_projections_f$goals_weight <- df_projections_f$goals / df_projections_f$points
df_projections_f$assists_weight <- df_projections_f$assists / df_projections_f$points

# Réajustement des goals et assists basés sur les poids et le nouveau total de points
df_projections_f$goals_adjusted <- df_projections_f$goals_weight * df_projections_f$points_rescaled
df_projections_f$assists_adjusted <- df_projections_f$assists_weight * df_projections_f$points_rescaled
# Predict models and scale -----------------------------------------------

# Vérification de la somme des points réajustés
df_projections_f$points_adjusted <- df_projections_f$goals_adjusted + df_projections_f$assists_adjusted
centiles_to_check <- c(0, 0.05, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 0.95, 0.975, 0.99, 0.995, 0.9995, 1)

## Forwards ---------------------------------------------------------------
unscaled_projections <- predict(model_f, newdata = df_projections_f)
projected_points_centiles <- quantile(unscaled_projections, centiles_to_check)
real_points_centiles <- quantile(real_data_f$points, centiles_to_check)

## Defensemen
df_projections_d$goals <- predict(model_goals_d, newdata = df_projections_d)
df_projections_d$goals <- scale_value(
df_projections_d$goals,
old_max = max(df_projections_d$goals),
new_max = 25,
exponent = find_optimal_exponent(
unscaled_values = df_projections_d$goals,
real_distribution = real_data_d$goals,
old_max = max(df_projections_d$goals),
new_max = 25
)
df_projections_f$points <- rescale_points_to_real_distribution(
predicted_points = unscaled_projections,
real_points_centiles = real_points_centiles,
projected_points_centiles = projected_points_centiles
)

# Histogrammes pour les buts (défenseurs)
hist(df_projections_d$goals)
hist(real_data_d$goals)
## Defensemen -------------------------------------------------------------
unscaled_projections <- predict(model_d, newdata = df_projections_d)
projected_points_centiles <- quantile(unscaled_projections, centiles_to_check)
real_points_centiles <- quantile(real_data_d$points, centiles_to_check)

# Prédiction et scaling des passes décisives pour les défenseurs
df_projections_d$assists <- predict(model_assists_d, newdata = df_projections_d)
df_projections_d$assists <- scale_value(
df_projections_d$assists,
old_max = max(df_projections_d$assists),
new_max = 75,
exponent = find_optimal_exponent(
unscaled_values = df_projections_d$assists,
real_distribution = real_data_d$assists,
old_max = max(df_projections_d$assists),
new_max = 75
)
df_projections_d$points <- rescale_points_to_real_distribution(
predicted_points = unscaled_projections,
real_points_centiles = real_points_centiles,
projected_points_centiles = projected_points_centiles
)

# Histogrammes pour les passes décisives (défenseurs)
hist(df_projections_d$assists)
hist(real_data_d$assists)

# Calcul des points (buts + passes décisives) pour les défenseurs
df_projections_d$points <- df_projections_d$goals + df_projections_d$assists

# Histogramme des points (défenseurs)
hist(df_projections_d$points)


# Bind it all together ---------------------------------------------------

df_predictions <- rbind(
df_projections_f,
df_projections_d
) |>
relocate(player_id, first_name, last_name, points, age)


saveRDS(df_predictions, "data/marts/projections/final_points_predictions.RDS")
45 changes: 12 additions & 33 deletions refiners/r_train_random_forest.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,49 +4,28 @@

# Data -------------------------------------------------------------------
data_f <- readRDS("data/marts/data_random_forest_forwards.rds") |>
dplyr::select(-points, -player_id, -games_played)
dplyr::select(-goals, -assists, -player_id, -games_played)
data_d <- readRDS("data/marts/data_random_forest_defensemen.rds") |>
dplyr::select(-points, -player_id, -games_played)
dplyr::select(-goals, -assists, -player_id, -games_played)

# Forwards ---------------------------------------------------------------

model_goals <- ptspredictR::train_random_forest(
data_f |> dplyr::select(-assists),
"goals"
model_f <- ptspredictR::train_random_forest(
data_f,
"points"
)

randomForest::varImpPlot(model_goals)
randomForest::varImpPlot(model_f)

saveRDS(model_goals, "apis/points_predictor_api/models/goals_f.rds")

model_assists <- ptspredictR::train_random_forest(
data_f |> dplyr::select(-goals),
"assists"
)

randomForest::varImpPlot(model_assists)
hist(predict(model_assists))

saveRDS(model_assists, "apis/points_predictor_api/models/assists_f.rds")
saveRDS(model_f, "apis/points_predictor_api/models/points_f.rds")

# Defensemen ---------------------------------------------------------------

model_goals_d <- ptspredictR::train_random_forest(
data_d |> dplyr::select(-assists),
"goals"
)

randomForest::varImpPlot(model_goals_d)

saveRDS(model_goals_d, "apis/points_predictor_api/models/goals_d.rds")

model_assists_d <- ptspredictR::train_random_forest(
data_d |> dplyr::select(-goals),
"assists"
model_d <- ptspredictR::train_random_forest(
data_d,
"points"
)

randomForest::varImpPlot(model_assists_d)

saveRDS(model_assists_d, "apis/points_predictor_api/models/assists_d.rds")

randomForest::varImpPlot(model_d)

saveRDS(model_d, "apis/points_predictor_api/models/points_d.rds")