Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quiz 2 Review Code #33

Open
chendaniely opened this issue Jun 15, 2022 · 0 comments
Open

Quiz 2 Review Code #33

chendaniely opened this issue Jun 15, 2022 · 0 comments

Comments

@chendaniely
Copy link
Contributor

See the rendered notebook here: https://gist.github.com/chendaniely/b5a4c9834289457837005f94cb3f1ff7

Otherwise, you can use the code below to study and run it on your own.

library(tidyverse)
library(tidymodels)

# pros + cons of each model we use
# git is the version control software
# github is the cloud repository hosting service


# Initial Data Processing -----

# Goal: classify a car's transmission (`am`) in the mtcars data set
# from three predictors: mpg, hp, and wt.

# Show the data before any changes.
mtcars

# Recode `am` as a factor. Study question: why do you need this?
# (Hint: the model specs further down are set to "classification" mode.)
# The result is stored back under the name `mtcars`, so every later
# section works with the factor version.
mtcars <- mutate(mtcars, am = factor(am))

# Show the data again after the conversion.
mtcars


# Fit raw model on everything -----

# K-nearest-neighbours classifier on the kknn engine:
# k = 5 with "rectangular" weighting.
knn_spec <- nearest_neighbor(
  neighbors = 5,
  weight_func = "rectangular"
) %>%
  set_engine("kknn") %>%
  set_mode("classification")
knn_spec


# Recipe with no preprocessing steps: the predictors are used as-is.
car_recipe_raw <- recipe(am ~ mpg + hp + wt, data = mtcars)
car_recipe_raw

# Bundle model + recipe into a workflow and fit it on the full data set.
knn_fit_1 <- workflow() %>%
  add_model(knn_spec) %>%
  add_recipe(car_recipe_raw) %>%
  fit(data = mtcars)
knn_fit_1

# Cross-tabulate the true class against the predicted class. The
# predictions are made on the same rows the model was fit on.
bind_cols(mtcars, predict(knn_fit_1, mtcars)) %>%
  count(am, .pred_class)


# Fit scaled model on everything -----

# Same formula as the raw recipe, but every predictor is scaled and then
# centered first. Study question: why do you need these steps?
car_recipe_processed <- recipe(am ~ mpg + hp + wt, data = mtcars) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())
car_recipe_processed


# Reuse the current `knn_spec` with the preprocessing recipe and fit on
# the full data set.
knn_fit_2 <- workflow() %>%
  add_model(knn_spec) %>%
  add_recipe(car_recipe_processed) %>%
  fit(data = mtcars)
knn_fit_2


# True vs. predicted class counts, again on the rows used for fitting.
bind_cols(mtcars, predict(knn_fit_2, mtcars)) %>%
  count(am, .pred_class)


# what is the optimal value of k? -----

# Candidate values of k to evaluate: 1, 2, ..., 10.
ks <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

# Same model as before, but `neighbors` is left as a tuning placeholder
# that tune_grid() fills in from `ks`.
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# vfold_cv() assigns rows to folds at random; fix the seed so the folds,
# and therefore the accuracy estimates below, are the same on every run.
set.seed(4242)

# 5-fold cross-validation on all of mtcars, stratified by the outcome.
data_vfold <- vfold_cv(mtcars, v = 5, strata = am)

# Fit the scaled + centered workflow once per (fold, k) combination and
# collect the resampled performance metrics.
results <- workflow() %>%
  add_recipe(car_recipe_processed) %>%
  add_model(knn_spec) %>%
  tune_grid(resamples = data_vfold, grid = ks) %>%
  collect_metrics()

# Keep only the accuracy rows of the collected metrics.
accuracies <- results %>%
  filter(.metric == "accuracy")

# Mean cross-validated accuracy for each k. The x-axis breaks are pinned
# to the tuned values so every tick is an actual k from `ks`.
accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = ks$neighbors) +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 12))

accuracy_vs_k



# How do we make sure we can predict well? -----

# what can go wrong here?

# Splitting data

# Seed the RNG so the random split below is reproducible.
set.seed(4242)

# 75% / 25% split, stratified by the outcome `am`.
car_split <- initial_split(mtcars, strata = am, prop = 0.75)
car_test <- testing(car_split)
car_train <- training(car_split)

# Create recipe

# Scale, then center, every predictor.
car_recipe <- recipe(am ~ mpg + hp + wt, data = mtcars) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# Fit the model

# KNN classifier with k fixed at 3.
knn_spec <- nearest_neighbor(
  neighbors = 3,
  weight_func = "rectangular"
) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# NOTE(review): this workflow is fit on all of mtcars, which includes the
# rows held out in car_test, whereas the later section fits on car_train
# only. That looks deliberate for the accuracy comparison at the end of
# the script -- confirm before changing it.
knn_fit <- workflow() %>%
  add_model(knn_spec) %>%
  add_recipe(car_recipe) %>%
  fit(data = mtcars)

knn_fit

# Predicted class for each test row, placed in front of the test columns.
car_predictions_1 <- bind_cols(predict(knn_fit, car_test), car_test)
car_predictions_1

# Compare predictions against the true `am` values on the test rows.
metrics(car_predictions_1, truth = am, estimate = .pred_class)


# What happens when we use the wrong training/testing data -----

# NOTE(review): this section fits on car_train only and evaluates on
# car_test, so no test rows are used for fitting. The heading above may
# have been meant for the previous section (which fits on all of
# mtcars) -- confirm.

# Recipe declared against the training rows: scale, then center, every
# predictor.
car_recipe_train <- recipe(am ~ mpg + hp + wt, data = car_train) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

car_recipe_train

# Same `knn_spec` as currently defined, fit on the training rows only.
knn_fit_train <- workflow() %>%
  add_model(knn_spec) %>%
  add_recipe(car_recipe_train) %>%
  fit(data = car_train)

knn_fit_train

# Predicted class for each test row, placed in front of the test columns.
car_predictions_2 <- bind_cols(predict(knn_fit_train, car_test), car_test)
car_predictions_2

# Compare predictions against the true `am` values on the test rows.
metrics(car_predictions_2, truth = am, estimate = .pred_class)


# compare the 2 accuracy scores from the models we just fit?
# why is one higher than the other?

# Metrics for the workflow that was fit on all of mtcars.
metrics(car_predictions_1, truth = am, estimate = .pred_class)

# Metrics for the workflow that was fit on car_train only.
metrics(car_predictions_2, truth = am, estimate = .pred_class)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant