Merge pull request #232 from tidymodels/more-fig-updates

juliasilge · web-flow · commit f681d9ff4679 · 2022-02-03T19:16:07.000-07:00
More figure updates
diff --git a/01-software-modeling.Rmd b/01-software-modeling.Rmd
@@ -89,7 +89,7 @@ plm_plot <-
 ames_plot <- 
   ggplot(ames, aes(x = Latitude, y = Sale_Price)) + 
   geom_point(alpha = .2) + 
-  geom_smooth(se = FALSE, method = stats::loess, method.args = list(span = .3), col = "red") + 
+  geom_smooth(se = FALSE, method = stats::loess, method.args = list(span = .3), color = "red") + 
   scale_y_log10() + 
   ylab("House Sale Price ($US)") + 
   ggtitle("(b) Using a model-based smoother to discover trends.")
@@ -200,7 +200,7 @@ This iterative process is especially true for modeling. Figure \@ref(fig:softwar
 
 * **Feature engineering:** The understanding gained from EDA results in the creation of specific model terms that make it easier to accurately model the observed data. This can include complex methodologies (e.g., PCA) or simpler features (using the ratio of two predictors). Chapter \@ref(recipes) focuses entirely on this important step.
 
-* **Model tuning and selection (circles with blue and yellow segments):** A variety of models are generated and their performance is compared. Some models require _parameter tuning_ where some structural parameters are required to be specified or optimized. The colored segments within the circles signify the repeated data splitting used during resampling (see Chapter \@ref(resampling)). 
+* **Model tuning and selection (circles with alternating segments):** A variety of models are generated and their performance is compared. Some models require _parameter tuning_ where some structural parameters are required to be specified or optimized. The colored segments within the circles signify the repeated data splitting used during resampling (see Chapter \@ref(resampling)). 
 
 * **Model evaluation:** During this phase of model development, we assess the model's performance metrics, examine residual plots, and conduct other EDA-like analyses to understand how well the models work. In some cases, formal between-model comparisons (Chapter \@ref(compare)) help you to understand whether any differences in models are within the experimental noise.   
 
diff --git a/03-base-r.Rmd b/03-base-r.Rmd
@@ -23,11 +23,13 @@ names(crickets)
 
 # Plot the temperature on the x-axis, the chirp rate on the y-axis. The plot
 # elements will be colored differently for each species:
-ggplot(crickets, aes(x = temp, y = rate, col = species)) + 
+ggplot(crickets, 
+       aes(x = temp, y = rate, color = species, pch = species, lty = species)) + 
   # Plot points for each data point and color by species
-  geom_point() + 
+  geom_point(size = 2) + 
   # Show a simple linear model fit created separately for each species:
-  geom_smooth(method = lm, se = FALSE) + 
+  geom_smooth(method = lm, se = FALSE, alpha = 0.5) + 
+  scale_color_brewer(palette = "Paired") +
   labs(x = "Temperature (C)", y = "Chirp Rate (per minute)")
 ```
 
diff --git a/08-feature-engineering.Rmd b/08-feature-engineering.Rmd
@@ -255,7 +255,7 @@ After exploring the Ames training set, we might find that the regression slopes
 ggplot(ames_train, aes(x = Gr_Liv_Area, y = 10^Sale_Price)) + 
   geom_point(alpha = .2) + 
   facet_wrap(~ Bldg_Type) + 
-  geom_smooth(method = lm, formula = y ~ x, se = FALSE, col = "red") + 
+  geom_smooth(method = lm, formula = y ~ x, se = FALSE, color = "lightblue") + 
   scale_x_log10() + 
   scale_y_log10() + 
   labs(x = "Gross Living Area", y = "Sale Price (USD)")
@@ -330,7 +330,7 @@ plot_smoother <- function(deg_free) {
     geom_smooth(
       method = lm,
       formula = y ~ ns(x, df = deg_free),
-      col = "red",
+      color = "lightblue",
       se = FALSE
     ) +
     labs(title = paste(deg_free, "Spline Terms"),
diff --git a/10-resampling.Rmd b/10-resampling.Rmd
@@ -468,7 +468,7 @@ Since this analysis used 10-fold cross-validation, there is one unique predictio
 assess_res %>% 
   ggplot(aes(x = Sale_Price, y = .pred)) + 
   geom_point(alpha = .15) +
-  geom_abline(col = "red") + 
+  geom_abline(color = "red") + 
   coord_obs_pred() + 
   ylab("Predicted")
 ```
diff --git a/11-comparing-models.Rmd b/11-comparing-models.Rmd
@@ -136,7 +136,7 @@ These high correlations indicate that, across models, there are large within-res
 ```{r compare-rsq-plot, eval=FALSE}
 rsq_indiv_estimates %>% 
   mutate(wflow_id = reorder(wflow_id, .estimate)) %>% 
-  ggplot(aes(x = wflow_id, y = .estimate, group = id, col = id)) + 
+  ggplot(aes(x = wflow_id, y = .estimate, group = id, color = id)) + 
   geom_line(alpha = .5, lwd = 1.25) + 
   theme(legend.position = "none")
 ```
@@ -151,7 +151,7 @@ y_lab <- expression(R^2 ~ statistics)
 
 rsq_indiv_estimates %>% 
   mutate(wflow_id = reorder(wflow_id, .estimate)) %>% 
-  ggplot(aes(x = wflow_id, y = .estimate, group = id, col = id)) + 
+  ggplot(aes(x = wflow_id, y = .estimate, group = id, color = id)) + 
   geom_line(alpha = .5, lwd = 1.25) + 
   theme(legend.position = "none") + 
   labs(x = NULL, y = y_lab)
@@ -350,7 +350,7 @@ The four posterior distributions are visualized in Figure \@ref(fig:four-posteri
 model_post %>% 
   mutate(model = forcats::fct_inorder(model)) %>%
   ggplot(aes(x = posterior)) + 
-  geom_histogram(bins = 50, col = "white", fill = "blue", alpha = 0.4) + 
+  geom_histogram(bins = 50, color = "white", fill = "blue", alpha = 0.4) + 
   facet_wrap(~ model, ncol = 1)
 ```
 
@@ -364,8 +364,8 @@ x_lab <- expression(Posterior ~ "for" ~ mean ~ R^2)
 model_post %>% 
   mutate(model = forcats::fct_inorder(model)) %>%
   ggplot(aes(x = posterior)) + 
-  geom_histogram(bins = 50, col = "white", fill = "blue", alpha = 0.4) + 
-  facet_wrap(~ model, ncol = 1) + 
+  geom_histogram(bins = 50, color = "white", fill = "blue", alpha = 0.4) + 
+  facet_wrap(~ model, ncolor = 1) + 
   labs(x = x_lab)
 ```
 
@@ -398,7 +398,7 @@ rqs_diff %>%
   as_tibble() %>% 
   ggplot(aes(x = difference)) + 
   geom_vline(xintercept = 0, lty = 2) + 
-  geom_histogram(bins = 50, col = "white", fill = "red", alpha = 0.4)
+  geom_histogram(bins = 50, color = "white", fill = "red", alpha = 0.4)
 ```
 
 ```{r posterior-difference}
@@ -419,7 +419,7 @@ rqs_diff %>%
   as_tibble() %>% 
   ggplot(aes(x = difference)) + 
   geom_vline(xintercept = 0, lty = 2) + 
-  geom_histogram(bins = 50, col = "white", fill = "red", alpha = 0.4) + 
+  geom_histogram(bins = 50, color = "white", fill = "red", alpha = 0.4) + 
   labs(x = x_lab)
 ```
 
diff --git a/12-tuning-parameters.Rmd b/12-tuning-parameters.Rmd
@@ -112,10 +112,10 @@ To demonstrate, consider the classification data shown in Figure \@ref(fig:two-c
 #| echo = FALSE,
 #| fig.cap = "An example two-class classification data set with two predictors.",
 #| fig.alt = "An example two-class classification data set with two predictors. The two predictors have a moderate correlation and there is some locations of separation between the classes."
-ggplot(training_set, aes(x = A, y = B, col = Class)) + 
-  geom_point(alpha = .5) + 
+ggplot(training_set, aes(x = A, y = B, color = Class, pch = Class)) + 
+  geom_point(alpha = 0.7) + 
   coord_equal()  + 
-  labs(x = "Predictor A", y = "Predictor B", col = NULL) + 
+  labs(x = "Predictor A", y = "Predictor B", color = NULL, pch = NULL) +  
   scale_color_manual(values = c("#CC6677", "#88CCEE"))
 ```
 
@@ -266,9 +266,9 @@ link_grids <-
 
 link_grids %>% 
   ggplot(aes(x = A, y = B)) + 
-  geom_point(data = testing_set, aes(col = Class, pch = Class), 
-             alpha = .5, show.legend = FALSE) + 
-  geom_contour(aes( z = .pred_Class1, lty = link), breaks = 0.5, col = "black") + 
+  geom_point(data = testing_set, aes(color = Class, pch = Class), 
+             alpha = 0.7, show.legend = FALSE) + 
+  geom_contour(aes( z = .pred_Class1, lty = link), breaks = 0.5, color = "black") + 
   scale_color_manual(values = c("#CC6677", "#88CCEE")) + 
   coord_equal() + 
   labs(x = "Predictor A", y = "Predictor B")
@@ -352,9 +352,9 @@ te_plot <-
     label = ifelse(label == " 1 units", " 1 unit", label)
   ) %>% 
   ggplot(aes(x = A, y = B)) + 
-  geom_point(data = testing_set, aes(col = Class, pch = Class), 
-             alpha = .5, show.legend = FALSE) + 
-  geom_contour(aes( z = .pred_Class1), breaks = 0.5, col = "black") + 
+  geom_point(data = testing_set, aes(color = Class, pch = Class), 
+             alpha = 0.5, show.legend = FALSE) + 
+  geom_contour(aes( z = .pred_Class1), breaks = 0.5, color = "black") + 
   scale_color_manual(values = c("#CC6677", "#88CCEE")) + 
   facet_wrap(~ label, nrow = 1) + 
   coord_equal() + 
@@ -374,9 +374,9 @@ tr_plot <-
     label = ifelse(label == " 1 units", " 1 unit", label)
   ) %>% 
   ggplot(aes(x = A, y = B)) +
-  geom_point(data = training_set, aes(col = Class, pch = Class), 
-             alpha = .5, show.legend = FALSE) + 
-  geom_contour(aes( z = .pred_Class1), breaks = 0.5, col = "black") + 
+  geom_point(data = training_set, aes(color = Class, pch = Class), 
+             alpha = 0.5, show.legend = FALSE) + 
+  geom_contour(aes( z = .pred_Class1), breaks = 0.5, color = "black") + 
   scale_color_manual(values = c("#CC6677", "#88CCEE")) + 
   facet_wrap(~ label, nrow = 1) + 
   coord_equal() + 
diff --git a/13-grid-search.Rmd b/13-grid-search.Rmd
@@ -501,7 +501,7 @@ load("extras/parallel_times/resamples_times.RData")
 resamples_times %>%
   dplyr::rename(operation = label) %>% 
   ggplot(aes(y = id_alt, x = duration, fill = operation)) +
-  geom_bar(stat = "identity", col = "black") +
+  geom_bar(stat = "identity", color = "black") +
   labs(y = NULL, x = "Elapsed Time") + 
   scale_fill_brewer(palette = "Paired") +
   theme(legend.position = "top")
@@ -615,7 +615,7 @@ start_stop_dat %>%
    ymax = id_stop,
    fill = operation
   ),
-  col = "black"
+  color = "black"
  ) +
  facet_wrap(~ pid, nrow = 2) +
  labs(y = NULL, x = "Elapsed Time") + 
@@ -653,8 +653,8 @@ First, let's consider the raw execution times in Figure \@ref(fig:parallel-times
 #| fig.alt = "Execution times for model tuning versus the number of workers using different delegation schemes. The diagonal black line indicates a linear speedup where the addition of a new worker process has maximal effect. The 'everything' scheme shows that the benefits decrease after three or four workers, especially when there is expensive preprocessing. The 'resamples' scheme has almost linear speedups across all tasks."
 
 load("extras/parallel_times/xgb_times.RData")
-ggplot(times, aes(x = num_cores, y = elapsed, col = parallel_over, shape = parallel_over)) + 
-  geom_point() + 
+ggplot(times, aes(x = num_cores, y = elapsed, color = parallel_over, shape = parallel_over)) + 
+  geom_point(size = 2) + 
   geom_line() +
   facet_wrap(~ preprocessing) + 
   labs(x = "Number of Workers", y = "Execution Time (s)") + 
@@ -684,9 +684,9 @@ We can also view these data in terms of speed-ups in Figure \@ref(fig:parallel-s
 #| fig.cap = "Speed-ups for model tuning versus the number of workers using different delegation schemes.",
 #| fig.alt = "Speed-ups for model tuning versus the number of workers using different delegation schemes."
 
-ggplot(times, aes(x = num_cores, y = speed_up, col = parallel_over, shape = parallel_over)) + 
+ggplot(times, aes(x = num_cores, y = speed_up, color = parallel_over, shape = parallel_over)) + 
   geom_abline(lty = 1) + 
-  geom_point() + 
+  geom_point(size = 2) + 
   geom_line() +
   facet_wrap(~ preprocessing) + 
   coord_obs_pred() + 
@@ -778,9 +778,9 @@ iter_three <- race_details %>% dplyr::filter(iter == 3)
 
 iter_three %>% 
   ggplot(aes(x = -estimate, y = .config)) + 
-  geom_vline(xintercept = 0, lty = 2, col = "green") +
-  geom_point(size = 2, aes(col = decision)) +
-  geom_errorbarh(aes(xmin = -estimate, xmax = -upper, col = decision), height = .3, show.legend = FALSE) + 
+  geom_vline(xintercept = 0, lty = 2, color = "green") +
+  geom_point(size = 2, aes(color = decision)) +
+  geom_errorbarh(aes(xmin = -estimate, xmax = -upper, color = decision), height = .3, show.legend = FALSE) + 
   labs(x = "Loss of ROC AUC", y = NULL) + 
   scale_colour_manual(values = race_cols)
 ```
@@ -801,8 +801,8 @@ race_ci_plots <- function(x, iters = max(x$iter)) {
     p <-
       x %>% 
       dplyr::filter(iter == i) %>% 
-      ggplot(aes(x = -estimate, y = .config, col = decision)) +
-      geom_vline(xintercept = 0, col = "green", lty = 2) +
+      ggplot(aes(x = -estimate, y = .config, color = decision)) +
+      geom_vline(xintercept = 0, color = "green", lty = 2) +
       geom_point(size = 2) +
       labs(title = ttl, y = "", x = "Loss of ROC AUC") +
       scale_color_manual(values = c(best = "blue", retain = "black", discard = "grey"), 
diff --git a/14-iterative-search.Rmd b/14-iterative-search.Rmd
@@ -241,7 +241,7 @@ To demonstrate, let's look at a toy example with a single parameter that has val
 y_lab <- expression(Estimated ~ R^2)
 
 ggplot(grid, aes(x = x, y = y)) + 
-  geom_line(col = "red", alpha = .5, lwd = 1.25) + 
+  geom_line(color = "red", alpha = .5, lwd = 1.25) + 
   labs(y = y_lab, x = "Tuning Parameter") +
   geom_point(data = current_iter)
 ```
@@ -317,7 +317,7 @@ small_pred %>%
   group_by(value) %>% 
   do(get_density(.)) %>% 
   ungroup() %>% 
-  ggplot(aes(x = x, y = density, col = `Parameter Value`, lty = `Parameter Value`)) + 
+  ggplot(aes(x = x, y = density, color = `Parameter Value`, lty = `Parameter Value`)) + 
   geom_line() +
   geom_vline(xintercept = max(current_iter$y), lty = 3) +  
   labs(x = x_lab) + 
@@ -736,7 +736,7 @@ The process starts with initial values of `penalty = 0.025` and `mixture = 0.050
 #| fig.alt = "An illustration of how simulated annealing determines what is the local neighborhood for two numeric tuning parameters. The clouds of points show possible next values where one would be selected at random. The candidate points are small circular clouds surrounding the current best point."
 
 ggplot(neighbors_values, aes(x = penalty, y = mixture)) + 
-  geom_point(alpha = .3, size = 3/4, aes(col = factor(Iteration)), show.legend = FALSE) + 
+  geom_point(alpha = .3, size = 3/4, aes(color = factor(Iteration)), show.legend = FALSE) + 
   scale_x_continuous(trans = "log10", limits = pen_rng) + 
   scale_y_continuous(limits = mix_rng) + 
   geom_point(data = best_values) + 
diff --git a/15-workflow-sets.Rmd b/15-workflow-sets.Rmd
@@ -483,7 +483,7 @@ collect_metrics(boosting_test_results)
 boosting_test_results %>% 
    collect_predictions() %>% 
    ggplot(aes(x = compressive_strength, y = .pred)) + 
-   geom_abline(col = "green", lty = 2) + 
+   geom_abline(color = "gray50", lty = 2) + 
    geom_point(alpha = 0.5) + 
    coord_obs_pred() + 
    labs(x = "observed", y = "predicted")
diff --git a/16-dimensionality-reduction.Rmd b/16-dimensionality-reduction.Rmd
@@ -231,13 +231,13 @@ library(patchwork)
 p1 <- 
   bean_validation %>% 
   ggplot(aes(x = area)) + 
-  geom_histogram(bins = 30, col = "white", fill = "blue", alpha = 1/3) + 
+  geom_histogram(bins = 30, color = "white", fill = "blue", alpha = 1/3) + 
   ggtitle("Original validation set data")
 
 p2 <- 
   bean_val_processed %>% 
   ggplot(aes(x = area)) + 
-  geom_histogram(bins = 30, col = "white", fill = "red", alpha = 1/3) + 
+  geom_histogram(bins = 30, color = "white", fill = "red", alpha = 1/3) + 
   ggtitle("Processed validation set data")
 
 p1 + p2
@@ -278,7 +278,7 @@ plot_validation_results <- function(recipe, dat = assessment(bean_val$splits[[1]
     # Process the data (the validation set by default)
     bake(new_data = dat) %>%
     # Create the scatterplot matrix
-    ggplot(aes(x = .panel_x, y = .panel_y, col = class, fill = class)) +
+    ggplot(aes(x = .panel_x, y = .panel_y, color = class, fill = class)) +
     geom_point(alpha = 0.4, size = 0.5) +
     geom_autodensity(alpha = .3) +
     facet_matrix(vars(-class), layer.diag = 2) + 
@@ -319,6 +319,7 @@ bean_rec_trained %>%
   step_pca(all_numeric_predictors(), num_comp = 4) %>% 
   prep() %>% 
   plot_top_loadings(component_number <= 4, n = 5) + 
+  scale_fill_brewer(palette = "Paired") +
   ggtitle("Principal Component Analysis")
 ```
 
@@ -357,6 +358,7 @@ bean_rec_trained %>%
   step_pls(all_numeric_predictors(), outcome = "class", num_comp = 4) %>%
   prep() %>% 
   plot_top_loadings(component_number <= 4, n = 5, type = "pls") + 
+  scale_fill_brewer(palette = "Paired") +
   ggtitle("Partial Least Squares")
 ```
 
@@ -530,7 +532,7 @@ Figure \@ref(fig:dimensionality-rankings) illustrates this ranking.
 #| fig.alt = "Area under the ROC curve from the validation set. The three best model configurations use PLS together with regularized discriminant analysis, a multi-layer perceptron, and a naive Bayes model."
 
 rankings %>% 
-  ggplot(aes(x = rank, y = mean, pch = method, col = model)) + 
+  ggplot(aes(x = rank, y = mean, pch = method, color = model)) + 
   geom_point(cex = 3.5) + 
   theme(legend.position = "right") +
   labs(y = "ROC AUC")  +
diff --git a/17-encoding-categorical-data.Rmd b/17-encoding-categorical-data.Rmd
@@ -198,7 +198,7 @@ glm_estimates %>%
       mutate(level = as.character(Neighborhood))
   ) %>%
   ggplot(aes(`no pooling`, pooling, size = sqrt(n))) +
-  geom_abline(color = "red", lty = 2) +
+  geom_abline(color = "gray50", lty = 2) +
   geom_point(alpha = 0.7) +
   coord_fixed()
 ```
diff --git a/18-explaining-models-and-predictions.Rmd b/18-explaining-models-and-predictions.Rmd
@@ -57,7 +57,7 @@ bind_rows(
   augment(rf_fit, ames_train) %>% mutate(model = "random forest")
 ) %>%
   ggplot(aes(Sale_Price, .pred, color = model)) +
-  geom_abline(col = "gray50", lty = 2) + 
+  geom_abline(color = "gray50", lty = 2) + 
   geom_point(alpha = 0.3, show.legend = FALSE) +
   facet_wrap(vars(model)) +
   scale_color_brewer(palette = "Paired") +
diff --git a/19-when-should-you-trust-predictions.Rmd b/19-when-should-you-trust-predictions.Rmd
diff --git a/20-ensemble-models.Rmd b/20-ensemble-models.Rmd
diff --git a/21-inferential-analysis.Rmd b/21-inferential-analysis.Rmd