From d38a5ab5b4674b726c33b6bc20feb01097f26d6f Mon Sep 17 00:00:00 2001 From: Rob J Hyndman Date: Sun, 15 Dec 2024 14:20:35 +1100 Subject: [PATCH] Ran flint and styler across whole package --- R/dist_kde.R | 1 - R/gg_density.R | 12 +++++++---- R/hdr.R | 34 ++++++++++++++++-------------- R/mvscale.R | 8 ++++--- R/show_data.R | 30 ++++++++++++++------------ R/surprisal_prob.R | 31 +++++++++++++++------------ R/surprisals.R | 27 +++++++++++++----------- man/surprisals.Rd | 2 +- tests/testthat/surprisals.R | 4 ++-- tests/testthat/test_dist_density.R | 2 +- tests/testthat/test_dist_kde1.R | 10 ++++----- tests/testthat/test_dist_kde2.R | 4 ++-- 12 files changed, 90 insertions(+), 75 deletions(-) diff --git a/R/dist_kde.R b/R/dist_kde.R index 1367d9b..c8630fd 100644 --- a/R/dist_kde.R +++ b/R/dist_kde.R @@ -201,4 +201,3 @@ kurtosis.dist_kde <- function(x, ..., na.rm = FALSE) { (mean((x$kde$x - mean(x$kde$x))^4) + 6 * h^2 * v - 3 * h^4) / v^2 - 3 } } - diff --git a/R/gg_density.R b/R/gg_density.R index ef8904a..4a05c14 100644 --- a/R/gg_density.R +++ b/R/gg_density.R @@ -71,9 +71,9 @@ gg_density <- function( # Set up data frame containing densities df <- make_density_df(object, ngrid = ngrid) # Repeat colors - if(length(object) > length(colors)) { + if (length(object) > length(colors)) { warning("Insufficient colors. Some densities will be plotted in the same color.") - colors <- rep(colors, 1 + round(length(object)/length(colors)))[seq(length(object))] + colors <- rep(colors, 1 + round(length(object) / length(colors)))[seq_along(object)] } # HDR thresholds if needed @@ -85,7 +85,9 @@ gg_density <- function( # HDR color palette hdr_colors <- lapply( colors, - function(u) { hdr_palette(color = u, prob = c(prob, 0.995)) } + function(u) { + hdr_palette(color = u, prob = c(prob, 0.995)) + } ) names(hdr_colors) <- names_dist(object, unique = TRUE) } else { @@ -231,7 +233,9 @@ gg_density1 <- function( } # Color scale and legend - colors <- unlist(lapply(hdr_colors, function(u) { u[1] })) + colors <- unlist(lapply(hdr_colors, function(u) { + u[1] + })) p <- p + ggplot2::scale_color_manual(breaks = dist_names, values = colors, labels = dist_names) # Don't show color legend if only one density diff --git a/R/hdr.R b/R/hdr.R index 4ae3544..1bab94c 100644 --- a/R/hdr.R +++ b/R/hdr.R @@ -82,7 +82,7 @@ gg_hdrboxplot <- function(data, var1, var2 = NULL, prob = c(0.5, 0.99), # Data to plot show_x <- show_data(dist, prob, threshold, anomalies = show_anomalies) - if(NROW(show_x) != NROW(data)) { + if (NROW(show_x) != NROW(data)) { stop("Something has gone wrong here!") } @@ -177,22 +177,24 @@ hdr_table <- function(object, prob) { return(df) }) } else { - output <- lapply(as.list(object), + output <- lapply( + as.list(object), function(u) { - # If u is a kde, we can use the data - # Otherwise we need to generate a random sample - if (stats::family(u) == "kde") { - x <- lapply(vctrs::vec_data(u), function(u) u$kde$x)[[1]] - } else { - x <- distributional::generate(u, times = 1e5)[[1]] - } - fi <- density(u, at = as.matrix(x))[[1]] - tibble( - distribution = names_dist(object), - prob = prob, - density = quantile(fi, prob = 1 - prob, type = 8) - ) - }) + # If u is a kde, we can use the data + # Otherwise we need to generate a random sample + if (stats::family(u) == "kde") { + x <- lapply(vctrs::vec_data(u), function(u) u$kde$x)[[1]] + } else { + x <- distributional::generate(u, times = 1e5)[[1]] + } + fi <- density(u, at = as.matrix(x))[[1]] + tibble( + distribution = names_dist(object), + prob = prob, + density = quantile(fi, prob = 1 - prob, type = 8) + ) + } + ) } purrr::list_rbind(output) |> dplyr::arrange(distribution, prob) diff --git a/R/mvscale.R b/R/mvscale.R index 2900f2b..4bb26e6 100644 --- a/R/mvscale.R +++ b/R/mvscale.R @@ -60,7 +60,7 @@ mvscale <- function(object, center = stats::median, scale = robustbase::s_Qn, mat <- object } else { # It must be a data frame. So let's find the numeric columns numeric_col <- unlist(lapply(object, is.numeric)) - if (any(!numeric_col) & warning) { + if (!all(numeric_col) & warning) { warning( "Ignoring non-numeric columns: ", paste(names(object)[!numeric_col], collapse = ", ") @@ -74,14 +74,16 @@ mvscale <- function(object, center = stats::median, scale = robustbase::s_Qn, mat <- sweep(mat, 2L, med) } # Create more resilient version of scale function - if(!is.null(scale)) { + if (!is.null(scale)) { my_scale <- function(x, ..., na.rm = TRUE) { s <- scale(x, ..., na.rm = na.rm) s[s == 0] <- 1 # Avoid division by zero return(s) } } else { - my_scale <- function(x, ..., na.rm = TRUE) {1} + my_scale <- function(x, ..., na.rm = TRUE) { + 1 + } } # Scale if (d == 1L) { diff --git a/R/show_data.R b/R/show_data.R index 3c680f8..db4927b 100644 --- a/R/show_data.R +++ b/R/show_data.R @@ -27,21 +27,21 @@ show_data <- function(object, prob, threshold, anomalies = FALSE) { ) # Compute density values show_x <- mapply( - function(u, dist) { - d <- NCOL(u) - 1 - u$den <- unlist(density(dist, at = as.matrix(u[, seq(d)]))) - return(u) - }, - u = show_x, dist = as.list(object), - SIMPLIFY = FALSE - ) + function(u, dist) { + d <- NCOL(u) - 1 + u$den <- unlist(density(dist, at = as.matrix(u[, seq(d)]))) + return(u) + }, + u = show_x, dist = as.list(object), + SIMPLIFY = FALSE + ) # Compute surprisal probabilities - if(anomalies) { + if (anomalies) { show_x <- mapply( function(u, dist) { d <- NCOL(u) - 2 data <- as.matrix(u[, seq(d)]) - u$prob <- surprisals_from_den(data, den= log(u$den), distribution = dist, loo = TRUE, probablity = TRUE) + u$prob <- surprisals_from_den(data, den = log(u$den), distribution = dist, loo = TRUE, probablity = TRUE) return(u) }, u = show_x, dist = as.list(object), @@ -49,7 +49,7 @@ show_data <- function(object, prob, threshold, anomalies = FALSE) { ) } # Divide into HDR groups - if(!is.null(threshold)) { + if (!is.null(threshold)) { show_x <- mapply( function(u, threshold) { u$group <- cut(u$den, breaks = c(0, threshold$threshold, Inf), labels = FALSE) @@ -66,13 +66,15 @@ show_data <- function(object, prob, threshold, anomalies = FALSE) { ) } # Mark anomalies - if(anomalies) { + if (anomalies) { # Anomalies are points with p < 0.005 - show_x <- lapply(show_x, + show_x <- lapply( + show_x, function(u) { u$anomaly <- u$prob < 0.005 & u$group == "Outside" return(u) - }) + } + ) } # Combine into a single tibble purrr::list_rbind(show_x) diff --git a/R/surprisal_prob.R b/R/surprisal_prob.R index df55ea0..8ec0b73 100644 --- a/R/surprisal_prob.R +++ b/R/surprisal_prob.R @@ -10,12 +10,12 @@ surprisal_prob <- function( y = NULL) { approximation <- match.arg(approximation) n <- length(s) - if(all(is.na(s))) { + if (all(is.na(s))) { return(rep(NA_real_, n)) } - if(approximation == "none") { - if(dimension_dist(distribution) > 1) { + if (approximation == "none") { + if (dimension_dist(distribution) > 1) { warning("Using an empirical approximation for multivariate data") approximation <- "empirical" } else if (identical(unique(stats::family(distribution)), "normal")) { @@ -24,21 +24,23 @@ surprisal_prob <- function( approximation <- "symmetric" } } - if(approximation == "none") { + if (approximation == "none") { # Univariate, not normal, not symmetric - if(length(unique(distribution)) == 1L) { + if (length(unique(distribution)) == 1L) { distribution <- unique(distribution) } else { # Need to compute probabilities one by one dd <- length(distribution) - if(dd != n) { + if (dd != n) { stop("Length of distribution must be 1 or equal to length of s") } p <- numeric(n) - for(i in seq(n)) { - p[i] <- surprisal_prob(s[i], distribution[i], y = y[i], + for (i in seq(n)) { + p[i] <- surprisal_prob(s[i], distribution[i], + y = y[i], approximation = approximation, - threshold_probability = threshold_probability) + threshold_probability = threshold_probability + ) } return(p) } @@ -63,7 +65,7 @@ surprisal_prob <- function( dist_x <- unique(unlist(dist_x)) dist_y <- -unlist(density(distribution, dist_x, log = TRUE)) prob <- (rank(dist_y) - 1) / length(dist_y) - if(all(is.na(dist_y)) | all(is.na(prob))) { + if (all(is.na(dist_y)) | all(is.na(prob))) { return(rep(NA_real_, n)) } p <- 1 - approx(dist_y, prob, xout = s, rule = 2, ties = mean)$y @@ -104,10 +106,11 @@ surprisal_normal_prob <- function(s, distribution) { is_symmetric <- function(dist) { dist <- unique(dist) fam <- stats::family(dist) - if(length(fam) > 1) { - for(i in seq_along(fam)) { - if(!is_symmetric(dist[i])) + if (length(fam) > 1) { + for (i in seq_along(fam)) { + if (!is_symmetric(dist[i])) { return(FALSE) + } } return(TRUE) } else if (fam %in% c("student_t", "cauchy", "logistic", "triangular", "uniform")) { @@ -118,7 +121,7 @@ is_symmetric <- function(dist) { q1 <- q1 - q1[1] q2 <- q2 - q2[1] out <- sum(abs(q1 + q2) / max(abs(c(q1, q2)))) - if(is.na(out)) { + if (is.na(out)) { return(FALSE) } else { return(out < 1e-8) diff --git a/R/surprisals.R b/R/surprisals.R index 35dd1e5..8105fda 100644 --- a/R/surprisals.R +++ b/R/surprisals.R @@ -71,7 +71,7 @@ surprisals <- function( #' y = c(5, rnorm(49)), #' p_kde = surprisals(y, loo = TRUE), #' p_normal = surprisals(y, distribution = dist_normal()), -#' p_zscore = 2*(1-pnorm(abs(y))) +#' p_zscore = 2 * (1 - pnorm(abs(y))) #' ) #' tibble( #' y = n01$v1, @@ -101,17 +101,19 @@ surprisals.default <- function( if (NCOL(object) == 1L) { object <- c(object) } - if(length(distribution) > 1 & length(object) > 1) { - if(length(distribution) != length(object)) + if (length(distribution) > 1 & length(object) > 1) { + if (length(distribution) != length(object)) { stop("Length of distribution and object must be the same or equal to 1") + } } - if(length(distribution) == NROW(object)) { + if (length(distribution) == NROW(object)) { den <- mapply(density, distribution, object, log = TRUE) } else { den <- density(distribution, at = object, log = TRUE) - if(is.list(den)) { - if(length(den) > 1) + if (is.list(den)) { + if (length(den) > 1) { stop("What's going on?") + } den <- den[[1]] } } @@ -133,9 +135,10 @@ surprisals_from_den <- function( if (NCOL(object) == 1L) { object <- c(object) } - if(length(distribution) > 1 & length(object) > 1) { - if(length(distribution) != length(object)) + if (length(distribution) > 1 & length(object) > 1) { + if (length(distribution) != length(object)) { stop("Length of distribution and object must be the same or equal to 1") + } } scores <- -den if (loo & all(stats::family(distribution) == "kde")) { @@ -155,10 +158,10 @@ surprisals_from_den <- function( } if (probability) { surprisal_prob(scores, - distribution = distribution, - approximation = approximation, - threshold_probability = threshold_probability, - y = y + distribution = distribution, + approximation = approximation, + threshold_probability = threshold_probability, + y = y ) |> suppressWarnings() } else { diff --git a/man/surprisals.Rd b/man/surprisals.Rd index 1f3b9eb..e6704bf 100644 --- a/man/surprisals.Rd +++ b/man/surprisals.Rd @@ -93,7 +93,7 @@ tibble( y = c(5, rnorm(49)), p_kde = surprisals(y, loo = TRUE), p_normal = surprisals(y, distribution = dist_normal()), - p_zscore = 2*(1-pnorm(abs(y))) + p_zscore = 2 * (1 - pnorm(abs(y))) ) tibble( y = n01$v1, diff --git a/tests/testthat/surprisals.R b/tests/testthat/surprisals.R index 2819dfd..c079a10 100644 --- a/tests/testthat/surprisals.R +++ b/tests/testthat/surprisals.R @@ -1,8 +1,8 @@ test_that("multiplication works", { set.seed(2) y <- c(rnorm(10), 100) - expect_equal(max(surprisals(y, dist_normal())), Inf) - expect_equal(max(surprisals(y, h = 1, loo = TRUE)), Inf) + expect_identical(max(surprisals(y, dist_normal())), Inf) + expect_identical(max(surprisals(y, h = 1, loo = TRUE)), Inf) expect_equal(max(surprisals(y, h = 1, loo = FALSE)), log(11 / dnorm(0, 0, 1)), tolerance = 0.01 diff --git a/tests/testthat/test_dist_density.R b/tests/testthat/test_dist_density.R index d27cce5..da74381 100644 --- a/tests/testthat/test_dist_density.R +++ b/tests/testthat/test_dist_density.R @@ -30,7 +30,7 @@ test_that("dist_density", { ) # CDF expect_equal(distributional::cdf(dist, at), - list(ifelse(at < 2, 0, 1), pnorm(at), pexp(at)), + list(as.integer(!(at < 2)), pnorm(at), pexp(at)), tolerance = 0.001 ) # Quantiles diff --git a/tests/testthat/test_dist_kde1.R b/tests/testthat/test_dist_kde1.R index 917fd93..c9cff53 100644 --- a/tests/testthat/test_dist_kde1.R +++ b/tests/testthat/test_dist_kde1.R @@ -5,19 +5,19 @@ test_that("dist_kde1", { y <- c(rnorm(100), rnorm(100, 5)) dist <- dist_kde(list(x, y)) # Mean - expect_equal(mean(dist), c(mean(x), mean(y))) + expect_identical(mean(dist), c(mean(x), mean(y))) # Median expect_equal(median(dist), quantile(dist, 0.5)) # Variance - expect_equal(distributional::variance(dist) > 0, c(TRUE, TRUE)) + expect_identical(distributional::variance(dist) > 0, c(TRUE, TRUE)) # Density at <- seq(-4, 10, by = 1) - expect_equal(lengths(density(dist, at)), c(15L, 15L)) + expect_identical(lengths(density(dist, at)), c(15L, 15L)) # CDF - expect_equal(lengths(distributional::cdf(dist, at)), c(15L, 15L)) + expect_identical(lengths(distributional::cdf(dist, at)), c(15L, 15L)) # Quantiles p <- (1:19) / 20 - expect_equal(lengths(quantile(dist, p = p)), c(19L, 19L)) + expect_identical(lengths(quantile(dist, p = p)), c(19L, 19L)) # Generate rand_dist <- distributional::generate(dist, times = 1e6) expect_equal(lapply(rand_dist, mean) |> unlist(), diff --git a/tests/testthat/test_dist_kde2.R b/tests/testthat/test_dist_kde2.R index 12d0066..4ab5feb 100644 --- a/tests/testthat/test_dist_kde2.R +++ b/tests/testthat/test_dist_kde2.R @@ -5,7 +5,7 @@ test_that("dist_kde2", { y <- c(rnorm(100), rnorm(100, 5)) dist <- dist_kde(cbind(x, y)) # Mean - expect_equal(mean(dist), matrix(c(x = mean(x), y = mean(y)), nrow = 1)) + expect_identical(mean(dist), matrix(c(x = mean(x), y = mean(y)), nrow = 1)) # Median expect_error(median(dist)) # Variance @@ -13,7 +13,7 @@ test_that("dist_kde2", { # Density at <- expand.grid(x = seq(-3, 3, by = 0.5), y = seq(-2, 10, by = 2)) |> as.matrix() - expect_equal(all(density(dist, at)[[1]] >= 0), TRUE) + expect_true(all(density(dist, at)[[1]] >= 0)) # CDF expect_error(distributional::cdf(dist, at)) # Quantiles