diff --git a/DESCRIPTION b/DESCRIPTION index 4386fae..44785ef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: aum Type: Package Title: Area Under Minimum of False Positives and Negatives -Version: 2023.4.4 +Version: 2023.5.12 Authors@R: c( person("Toby Dylan", "Hocking", email="toby.hocking@r-project.org", diff --git a/NEWS b/NEWS index 7668054..7f21749 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,11 @@ +Changes in version 2023.5.12 + +- aum_linear_model/cv defaults to maxIterations="min.aum". + +Changes in version 2023.4.7 + +- aum_line_search now accepts maxIterations="min.aum" or "max.auc". + Changes in version 2023.4.4 - removed some code to check faster on CRAN. diff --git a/R/aum_diffs.R b/R/aum_diffs.R index ef90617..e433cd7 100644 --- a/R/aum_diffs.R +++ b/R/aum_diffs.R @@ -40,7 +40,7 @@ plot.aum_diffs <- function ggplot2::geom_segment(ggplot2::aes( min.pred, value, xend=max.pred, yend=value, - color=variable, size=variable), + color=variable, linewidth=variable), data=err.tall)+ ggplot2::geom_vline(ggplot2::aes( xintercept=pred), @@ -280,7 +280,7 @@ aum_diffs_penalty <- structure(function geom_segment(aes( -log(min.lambda), value, xend=-log(max.lambda), yend=value, - color=variable, size=variable), + color=variable, linewidth=variable), data=fn.not.zero.tall)+ geom_point(aes( -log(min.lambda), value, diff --git a/R/aum_line_search.R b/R/aum_line_search.R index aeab5f0..f476ee6 100644 --- a/R/aum_line_search.R +++ b/R/aum_line_search.R @@ -1,5 +1,8 @@ aum_line_search <- structure(function -### Exact line search. +### Exact line search using a C++ STL map (red-black tree) to +### implement a queue of line intersection events. If number of rows +### of error.diff.df is B, and number of iterations is I, then space +### complexity is O(B) and time complexity is O( (I+B)log B ). (error.diff.df, ### aum_diffs data frame with B rows, one for each breakpoint in ### example-specific error functions. @@ -11,7 +14,9 @@ aum_line_search <- structure(function ### N-vector of numeric predicted values. If NULL, feature.mat and ### weight.vec will be used to compute predicted values. maxIterations=nrow(error.diff.df) -### positive int: max number of line search iterations. +### max number of line search iterations, either a positive integer or +### "max.auc" or "min.aum" indicating to keep going until AUC +### decreases or AUM increases. ){ . <- fp.diff <- fn.diff <- intercept <- slope <- step.size <- NULL ## Above to suppress CRAN NOTE. @@ -38,11 +43,18 @@ aum_line_search <- structure(function fp.diff=sum(fp.diff), fn.diff=sum(fn.diff) ), keyby=.(intercept, slope)] + if(identical(maxIterations, "max.auc"))maxIterations <- -1L + if(identical(maxIterations, "min.aum"))maxIterations <- 0L line.search.all <- aumLineSearch(L$line_search_input, maxIterations) L$line_search_result <- data.table(line.search.all)[0 <= step.size] class(L) <- c("aum_line_search", class(L)) L -### List of class aum_line_search. +### List of class aum_line_search. Element named "line_search_result" +### is a data table with number of rows equal to maxIterations (if it +### is positive integer, info for all steps, q.size column is number +### of items in queue at each iteration), otherwise 1 (info for the +### best step, q.size column is the total number of items popped off +### the queue). }, ex=function(){ ## Example 1: two binary data. @@ -150,18 +162,18 @@ plot.aum_line_search <- function data=hline.df)+ ggplot2::geom_line(ggplot2::aes( step.size, aum), - size=1, + linewidth=1, data=aum.df)+ ggplot2::geom_segment(ggplot2::aes( min.step.size, auc, xend=max.step.size, yend=auc), - size=1, + linewidth=1, data=auc.segs)+ ggplot2::geom_segment(ggplot2::aes( step.size, aum, xend=step.after, yend=aum.after), linetype="dotted", - size=1, + linewidth=1, data=last.seg)+ ggplot2::geom_point(ggplot2::aes( step.size, value), diff --git a/R/aum_linear_model.R b/R/aum_linear_model.R index 3f72cf3..2c30ad4 100644 --- a/R/aum_linear_model.R +++ b/R/aum_linear_model.R @@ -8,7 +8,7 @@ aum_linear_model_cv <- structure(function ### data table of differences in error functions, from ### aum_diffs_penalty or aum_diffs_binary. There should be an example ### column with values from 0 to N-1. - maxIterations=nrow(feature.mat), + maxIterations="min.aum", ### max iterations of the exact line search, default is number of examples. improvement.thresh=NULL, ### before doing cross-validation to learn the number of gradient @@ -92,6 +92,7 @@ aum_linear_model_cv <- structure(function initial.weight.fun=initial.weight.fun, max.steps=best.row$step.number, maxIterations=maxIterations) + final.model$min.valid.aum <- best.row final.model$fold.loss <- fold.loss final.model$set.loss <- set.loss final.model$keep <- keep @@ -111,15 +112,46 @@ aum_linear_model_cv <- structure(function }, ex=function(){ ## simulated binary classification problem. - N.rows <- 50 + N.rows <- 60 N.cols <- 2 set.seed(1) feature.mat <- matrix(rnorm(N.rows*N.cols), N.rows, N.cols) unknown.score <- feature.mat[,1]*2.1 + rnorm(N.rows) label.vec <- ifelse(unknown.score > 0, 1, 0) diffs.dt <- aum::aum_diffs_binary(label.vec) - model <- aum::aum_linear_model_cv(feature.mat, diffs.dt) - plot(model) + + ## Default line search keeps doing iterations until increase in AUM. + (default.time <- system.time({ + default.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt) + })) + plot(default.model) + print(default.valid <- default.model[["set.loss"]][set=="validation"]) + print(default.model[["search"]][, .(step.size, aum, iterations=q.size)]) + + ## Can specify max number of iterations of line search. + (small.step.time <- system.time({ + small.step.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt, maxIterations = N.rows) + })) + plot(small.step.model) + print(small.step.valid <- small.step.model[["set.loss"]][set=="validation"]) + small.step.model[["search"]][, .(step.size, aum, iterations=q.size)] + + ## Compare number of steps, iterations and time. On my machine small + ## step model takes more time/steps, but less iterations in the C++ + ## line search code. + cbind( + iterations=c( + default=default.model[["search"]][, sum(q.size)], + small.step=small.step.model[["search"]][, sum(q.size)]), + seconds=c( + default.time[["elapsed"]], + small.step.time[["elapsed"]]), + steps=c( + default.model[["min.valid.aum"]][["step.number"]], + small.step.model[["min.valid.aum"]][["step.number"]]), + min.valid.aum=c( + default.model[["min.valid.aum"]][["aum_mean"]], + small.step.model[["min.valid.aum"]][["aum_mean"]])) }) @@ -153,7 +185,7 @@ aum_linear_model <- function ### non-negative real number: keep doing gradient descent while the ### improvement in AUM is greater than this number (specify either ### this or max.steps, not both). - maxIterations=nrow(feature.list$subtrain), + maxIterations="min.aum", ### max number of iterations of exact line search, default is number ### of subtrain examples. initial.weight.fun=NULL @@ -170,6 +202,7 @@ aum_linear_model <- function improvement <- old.aum <- Inf step.number <- 0 loss.dt.list <- list() + search.dt.list <- list() while({ search.result <- aum::aum_line_search( diff.list$subtrain, @@ -191,6 +224,7 @@ aum_linear_model <- function } exact.dt <- data.table(search.result$line_search_result) best.row <- exact.dt[which.min(aum)] + search.dt.list[[paste(step.number)]] <- best.row improvement <- old.aum-best.row$aum old.aum <- best.row$aum if(!is.null(improvement.thresh)){ @@ -217,15 +251,17 @@ aum_linear_model <- function }else{ mean(thresh[c(best-1,best)]) } - }]) + }], + search=rbindlist(search.dt.list)) structure(out.list, class="aum_linear_model") ### Linear model represented as a list of class aum_linear_model with ### named elements: loss is a data table of values for subtrain and ### optionally validation at each step, weight.vec is the final vector -### of weights learned via gradient descent, and intercept is the -### value which results in minimal total error (FP+FN), learned via a -### linear scan over all possible values given the final weight -### vector. +### of weights learned via gradient descent, intercept is the value +### which results in minimal total error (FP+FN), learned via a linear +### scan over all possible values given the final weight vector, and +### search is a data table with one row for each step (best step size +### and number of iterations of line search). } plot.aum_linear_model <- function(x, ...){ diff --git a/man/aum_diffs_penalty.Rd b/man/aum_diffs_penalty.Rd index 4afb150..9853276 100644 --- a/man/aum_diffs_penalty.Rd +++ b/man/aum_diffs_penalty.Rd @@ -69,7 +69,7 @@ if(require("ggplot2")){ geom_segment(aes( -log(min.lambda), value, xend=-log(max.lambda), yend=value, - color=variable, size=variable), + color=variable, linewidth=variable), data=fn.not.zero.tall)+ geom_point(aes( -log(min.lambda), value, diff --git a/man/aum_line_search.Rd b/man/aum_line_search.Rd index 7778f65..a05385b 100644 --- a/man/aum_line_search.Rd +++ b/man/aum_line_search.Rd @@ -1,7 +1,10 @@ \name{aum_line_search} \alias{aum_line_search} \title{aum line search} -\description{Exact line search.} +\description{Exact line search using a C++ STL map (red-black tree) to +implement a queue of line intersection events. If number of rows +of \code{error.diff.df} is B, and number of iterations is I, then space +complexity is O(B) and time complexity is O( (I+B)log B ).} \usage{aum_line_search(error.diff.df, feature.mat, weight.vec, pred.vec = NULL, @@ -13,10 +16,17 @@ example-specific error functions.} \item{weight.vec}{p-vector of numeric linear model coefficients.} \item{pred.vec}{N-vector of numeric predicted values. If NULL, \code{feature.mat} and \code{weight.vec} will be used to compute predicted values.} - \item{maxIterations}{positive int: max number of line search iterations.} + \item{maxIterations}{max number of line search iterations, either a positive integer or +"max.auc" or "min.aum" indicating to keep going until AUC +decreases or AUM increases.} } -\value{List of class aum_line_search.} +\value{List of class aum_line_search. Element named "line_search_result" +is a data table with number of rows equal to \code{maxIterations} (if it +is positive integer, info for all steps, q.size column is number +of items in queue at each iteration), otherwise 1 (info for the +best step, q.size column is the total number of items popped off +the queue).} \author{Toby Dylan Hocking [aut, cre], Jadon Fowler [aut] (Contributed exact line search C++ code)} diff --git a/man/aum_linear_model.Rd b/man/aum_linear_model.Rd index bb9a84b..508d37a 100644 --- a/man/aum_linear_model.Rd +++ b/man/aum_linear_model.Rd @@ -7,7 +7,7 @@ descent with exact line search.} \usage{aum_linear_model(feature.list, diff.list, max.steps = NULL, improvement.thresh = NULL, - maxIterations = nrow(feature.list$subtrain), + maxIterations = "min.aum", initial.weight.fun = NULL)} \arguments{ \item{feature.list}{List with named elements subtrain and optionally validation, each @@ -29,10 +29,11 @@ random standard normal vector.} \value{Linear model represented as a list of class aum_linear_model with named elements: loss is a data table of values for subtrain and optionally validation at each step, weight.vec is the final vector -of weights learned via gradient descent, and intercept is the -value which results in minimal total error (FP+FN), learned via a -linear scan over all possible values given the final weight -vector.} +of weights learned via gradient descent, intercept is the value +which results in minimal total error (FP+FN), learned via a linear +scan over all possible values given the final weight vector, and +search is a data table with one row for each step (best step size +and number of iterations of line search).} \author{Toby Dylan Hocking [aut, cre], Jadon Fowler [aut] (Contributed exact line search C++ code)} diff --git a/man/aum_linear_model_cv.Rd b/man/aum_linear_model_cv.Rd index 7d53fd9..99a01e8 100644 --- a/man/aum_linear_model_cv.Rd +++ b/man/aum_linear_model_cv.Rd @@ -5,7 +5,7 @@ descent steps with exact line search, in linear model for minimizing AUM.} \usage{aum_linear_model_cv(feature.mat, - diff.dt, maxIterations = nrow(feature.mat), + diff.dt, maxIterations = "min.aum", improvement.thresh = NULL, n.folds = 3, initial.weight.fun = NULL)} \arguments{ @@ -43,14 +43,45 @@ selecting the best number of gradient descent steps.} \examples{ ## simulated binary classification problem. -N.rows <- 50 +N.rows <- 60 N.cols <- 2 set.seed(1) feature.mat <- matrix(rnorm(N.rows*N.cols), N.rows, N.cols) unknown.score <- feature.mat[,1]*2.1 + rnorm(N.rows) label.vec <- ifelse(unknown.score > 0, 1, 0) diffs.dt <- aum::aum_diffs_binary(label.vec) -model <- aum::aum_linear_model_cv(feature.mat, diffs.dt) -plot(model) + +## Default line search keeps doing iterations until increase in AUM. +(default.time <- system.time({ + default.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt) +})) +plot(default.model) +print(default.valid <- default.model[["set.loss"]][set=="validation"]) +print(default.model[["search"]][, .(step.size, aum, iterations=q.size)]) + +## Can specify max number of iterations of line search. +(small.step.time <- system.time({ + small.step.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt, maxIterations = N.rows) +})) +plot(small.step.model) +print(small.step.valid <- small.step.model[["set.loss"]][set=="validation"]) +small.step.model[["search"]][, .(step.size, aum, iterations=q.size)] + +## Compare number of steps, iterations and time. On my machine small +## step model takes more time/steps, but less iterations in the C++ +## line search code. +cbind( + iterations=c( + default=default.model[["search"]][, sum(q.size)], + small.step=small.step.model[["search"]][, sum(q.size)]), + seconds=c( + default.time[["elapsed"]], + small.step.time[["elapsed"]]), + steps=c( + default.model[["min.valid.aum"]][["step.number"]], + small.step.model[["min.valid.aum"]][["step.number"]]), + min.valid.aum=c( + default.model[["min.valid.aum"]][["aum_mean"]], + small.step.model[["min.valid.aum"]][["aum_mean"]])) } diff --git a/src/aum_line_search.cpp b/src/aum_line_search.cpp index 0042f20..d41f150 100644 --- a/src/aum_line_search.cpp +++ b/src/aum_line_search.cpp @@ -1,4 +1,5 @@ #include "aum_line_search.h" +#define EPSILON 1e-6 using namespace std; bool Point::isFinite() const { @@ -333,27 +334,29 @@ int lineSearch intersectionCountVec[0] = 0; intervalCountVec[0] = 0; qSizeVec[0]=queue.step_IntervalColumn_map.size(); - for//iterations/step sizes - (int iteration = 1; - iteration < maxIterations && !queue.step_IntervalColumn_map.empty(); - iteration++){ + int iteration=0; + double min_aum=aumVec[0], max_auc=aucAfterStepVec[0]; + double min_aum_first_step=0, max_auc_first_step=0; + int total_intersections=0, total_intervals=0; + while(1){ + iteration++; + if(iteration==maxIterations || queue.step_IntervalColumn_map.empty()){ + return 0; + } auto groups_it = queue.step_IntervalColumn_map.begin(); double stepSize = groups_it->first; aum += total_auc.aum_slope * (stepSize - lastStepSize); - stepSizeVec[iteration] = stepSize; - aumVec[iteration] = aum; IntervalColumn groups = groups_it->second; groups.set_intervals_ranks(); double more_auc_at_step = total_auc.handle_interval_groups(&groups, -1.0); double auc_after_remove = total_auc.value; - intersectionCountVec[iteration] = groups.thresh_intervals_map.size(); - intervalCountVec[iteration] = 0; + int interval_count = 0; for//thresholds at a given step size. (auto intervals_it = groups.thresh_intervals_map.begin(); intervals_it != groups.thresh_intervals_map.end(); intervals_it++){ double FPhi_tot=0, FPlo_tot=0, FNhi_tot=0, FNlo_tot=0; - intervalCountVec[iteration] += intervals_it->second.n_intervals; + interval_count += intervals_it->second.n_intervals; int lowest_rank = intervals_it->second.low_rank; int highest_rank = intervals_it->second.high_rank; for//intervals within a given threshold. @@ -393,9 +396,6 @@ int lineSearch } } total_auc.handle_interval_groups(&groups, 1.0); - aumSlopeAfterStepVec[iteration] = total_auc.aum_slope; - aucAtStepVec[iteration] = auc_after_remove+more_auc_at_step; - aucAfterStepVec[iteration] = total_auc.value; // queue the next actions/intersections. queue.step_IntervalColumn_map.erase(groups_it); int prev_high_rank = 0; @@ -414,7 +414,54 @@ int lineSearch } prev_high_rank = highest_rank; } - qSizeVec[iteration]=queue.step_IntervalColumn_map.size(); + // write outputs. + if(maxIterations>=1){ + stepSizeVec[iteration] = stepSize; + aumVec[iteration] = aum; + aumSlopeAfterStepVec[iteration] = total_auc.aum_slope; + aucAtStepVec[iteration] = auc_after_remove+more_auc_at_step; + aucAfterStepVec[iteration] = total_auc.value; + intersectionCountVec[iteration] = groups.thresh_intervals_map.size(); + intervalCountVec[iteration] = interval_count; + qSizeVec[iteration]=queue.step_IntervalColumn_map.size(); + } + total_intersections += groups.thresh_intervals_map.size(); + total_intervals += interval_count; + if(aum < min_aum){ + min_aum = aum; + min_aum_first_step = stepSize; + } + bool found_min = queue.step_IntervalColumn_map.empty() || + aum > min_aum || aummin_aum) ? lastStepSize : stepSize; + stepSizeVec[0]=(big_step+min_aum_first_step)/2; + aumVec[0]=min_aum; + aumSlopeAfterStepVec[0]=INFINITY; + aucAtStepVec[0]=INFINITY; + aucAfterStepVec[0]=INFINITY; + intersectionCountVec[0]=total_intersections; + intervalCountVec[0]=total_intervals; + qSizeVec[0]=iteration; + return 0; + } + if(total_auc.value > max_auc){ + max_auc = total_auc.value; + max_auc_first_step = stepSize; + } + bool found_max = queue.step_IntervalColumn_map.empty() || + EPSILON < max_auc-total_auc.value; + if(found_max && maxIterations == -1){ + stepSizeVec[0]=(stepSize+max_auc_first_step)/2; + aumVec[0]=-INFINITY; + aumSlopeAfterStepVec[0]=-INFINITY; + aucAtStepVec[0]=max_auc; + aucAfterStepVec[0]=INFINITY; + intersectionCountVec[0]=total_intersections; + intervalCountVec[0]=total_intervals; + qSizeVec[0]=iteration; + return 0; + } lastStepSize = stepSize; } return 0;//SUCCESS diff --git a/src/aum_line_search.h b/src/aum_line_search.h index f9c785e..43c98a6 100644 --- a/src/aum_line_search.h +++ b/src/aum_line_search.h @@ -33,6 +33,6 @@ int lineSearch( #define ERROR_LINE_SEARCH_INTERCEPTS_SHOULD_BE_NON_DECREASING 1 #define ERROR_LINE_SEARCH_SLOPES_SHOULD_BE_INCREASING_FOR_EQUAL_INTERCEPTS 2 #define ERROR_LINE_SEARCH_MAX_FP_SHOULD_BE_POSITIVE 3 -#define ERROR_LINE_SEARCH_MAX_FN_SHOULD_BE_POSITIVE 3 +#define ERROR_LINE_SEARCH_MAX_FN_SHOULD_BE_POSITIVE 4 #endif //AUM_LINE_SEARCH_AUMLINESEARCH_H diff --git a/src/interface.cpp b/src/interface.cpp index e517df1..64fd86f 100644 --- a/src/interface.cpp +++ b/src/interface.cpp @@ -68,24 +68,27 @@ Rcpp::List aum_sort_interface // [[Rcpp::export]] Rcpp::DataFrame aumLineSearch(const Rcpp::DataFrame df, int maxIterations) { + if(maxIterations < -1){ + Rcpp::stop("maxIterations must be either -1 (first max auc), 0 (first min aum), or positive (run for that many iterations)"); + } // extract columns from dataframe Rcpp::NumericVector fpDiff = df["fp.diff"]; Rcpp::NumericVector fnDiff = df["fn.diff"]; Rcpp::NumericVector intercept = df["intercept"]; Rcpp::NumericVector slope = df["slope"]; - int lineCount = df.nrow(); - Rcpp::NumericVector stepSizeVec(maxIterations, -1.0); - Rcpp::NumericVector aumVec(maxIterations, -1.0); - Rcpp::NumericVector aumSlopeAfterStepVec(maxIterations, -100.0); - Rcpp::NumericVector aucAtStepVec(maxIterations, -1.0); - Rcpp::NumericVector aucAfterStepVec(maxIterations, -1.0); - Rcpp::IntegerVector intersectionCountVec(maxIterations, -1); - Rcpp::IntegerVector intervalCountVec(maxIterations, -1); - Rcpp::IntegerVector qSizeVec(maxIterations, -1); + int n_out = maxIterations<1 ? 1 : maxIterations; + Rcpp::NumericVector stepSizeVec(n_out, -1.0); + Rcpp::NumericVector aumVec(n_out, -1.0); + Rcpp::NumericVector aumSlopeAfterStepVec(n_out, -100.0); + Rcpp::NumericVector aucAtStepVec(n_out, -1.0); + Rcpp::NumericVector aucAfterStepVec(n_out, -1.0); + Rcpp::IntegerVector intersectionCountVec(n_out, -1); + Rcpp::IntegerVector intervalCountVec(n_out, -1); + Rcpp::IntegerVector qSizeVec(n_out, -1); int status = lineSearch( &intercept[0], &slope[0], - lineCount, + df.nrow(), &fpDiff[0], &fnDiff[0], maxIterations, diff --git a/tests/testthat/test-CRAN-line-search.R b/tests/testthat/test-CRAN-line-search.R index 5a0d179..8169bd4 100644 --- a/tests/testthat/test-CRAN-line-search.R +++ b/tests/testthat/test-CRAN-line-search.R @@ -15,6 +15,17 @@ test_that("error when slope same", { }, "slopes should be increasing for equal intercepts") }) +test_that("error for negative max iterations", { + three.intersect <- data.frame( + intercept=c(-1,0,1), + slope=c(1, 0, -1), + fp.diff=c(0.5,0,0.5), + fn.diff=c(0,-0.5,-0.5)) + expect_error({ + aum:::aumLineSearch(three.intersect, maxIterations = -2) + }, "maxIterations must be either -1 (first max auc), 0 (first min aum), or positive (run for that many iterations)", fixed=TRUE) +}) + test_that("contrived three way tie computed ok", { three.intersect <- data.frame( intercept=c(-1,0,1), @@ -188,3 +199,99 @@ test_that("complex real data example", { step.dt[, expect_equal(computed, expected)] }) +test_that("dynamic line search works", { + data(neuroblastomaProcessed, package="penaltyLearning", envir=environment()) + nb.err <- with(neuroblastomaProcessed$errors, data.frame( + example=paste0(profile.id, ".", chromosome), + min.lambda, + max.lambda, + fp, fn)) + X.sc <- scale(neuroblastomaProcessed$feature.mat) + keep <- apply(is.finite(X.sc), 2, all) + X.keep <- X.sc[1:50,keep] + weight.vec <- rep(0, ncol(X.keep)) + (nb.diffs <- aum::aum_diffs_penalty(nb.err, rownames(X.keep))) + nb.weight.search <- aum::aum_line_search( + nb.diffs, + feature.mat=X.keep, + weight.vec=weight.vec, + maxIterations = 200) + nb.weight.search$line_search_result[, `:=`( + iteration = .I-1L, + cum.intersections=cumsum(intersections), + cum.intervals=cumsum(intervals))] + ## dynamic min aum. + first.min.aum <- aum::aum_line_search( + nb.diffs, + feature.mat=X.keep, + weight.vec=weight.vec, + maxIterations = "min.aum") + computed.min.aum <- first.min.aum$line_search_result[, .( + iteration=q.size, step.size, aum, intersections, intervals)] + expected.min.aum <- nb.weight.search$line_search_result[ + which.min(aum), .( + iteration=iteration+1, step.size, aum, + intersections=cum.intersections+1, intervals=cum.intervals+1)] + expect_equal(computed.min.aum, expected.min.aum) + ##dynamic max auc. + first.max.auc <- aum::aum_line_search( + nb.diffs, + feature.mat=X.keep, + weight.vec=weight.vec, + maxIterations = "max.auc") + computed.max.auc <- first.max.auc$line_search_result[, .( + iteration=q.size, step.size, auc, intersections, intervals)] + i <- nb.weight.search$line_search_result[, which(auc.after==max(auc.after))] + expected.auc.step <- nb.weight.search$line_search_result[ + , mean(step.size[c(min(i),max(i)+1)])] + expect_equal(computed.max.auc$step.size, expected.auc.step) + if(interactive()&&require(ggplot2))plot(nb.weight.search)+geom_point(aes(step.size,value),color="red",data=rbind(computed.min.aum[, .(step.size, value=aum, panel="aum")], first.max.auc$line_search_result[, .(step.size, value=auc, panel="auc")])) +}) + +test_that("dynamic simple ex first min aum line search", { + data(neuroblastomaProcessed, package="penaltyLearning", envir=environment()) + nb.err <- with(neuroblastomaProcessed$errors, data.frame( + example=paste0(profile.id, ".", chromosome), + min.lambda, + max.lambda, + fp, fn)) + (nb.diffs <- aum::aum_diffs_penalty(nb.err, c("1.1", "4.2"))) + nb.line.search <- aum::aum_line_search(nb.diffs, pred.vec=c(1,-1)) + max.auc.search <- aum::aum_line_search( + nb.diffs, pred.vec=c(1,-1), maxIterations = "max.auc") + i <- nb.line.search$line_search_result[, which.max(auc.after)] + expect_equal( + max.auc.search$line_search_result$step.size, + nb.line.search$line_search_result[, mean(step.size[c(i,i+1)])]) + min.aum.search <- aum::aum_line_search( + nb.diffs, pred.vec=c(1,-1), maxIterations = "min.aum") + expected.step <- nb.line.search$line_search_result[ + which.min(aum), .(step.size, aum)] + computed.step <- min.aum.search$line_search_result[ + , .(step.size, aum)] + expect_equal(computed.step, expected.step) + if(interactive()&&require(ggplot2))plot(nb.line.search)+geom_point(aes(step.size,value),color="red",data=rbind(computed.step[, .(step.size, value=aum, panel="aum")], max.auc.search$line_search_result[, .(step.size, value=auc, panel="auc")])) +}) + +test_that("dynamic ex flat first aum min line search", { + data(neuroblastomaProcessed, package="penaltyLearning", envir=environment()) + nb.err <- with(neuroblastomaProcessed$errors, data.frame( + example=paste0(profile.id, ".", chromosome), + min.lambda, + max.lambda, + fp, fn)) + (nb.diffs <- aum::aum_diffs_penalty(nb.err, c("513.3", "4.2", "1.1", "2.1"))) + pred.vec <- c(3,-3, 5, 10) + nb.line.search <- aum::aum_line_search(nb.diffs, pred.vec=pred.vec, maxIterations = 15) + max.auc.search <- aum::aum_line_search( + nb.diffs, pred.vec=pred.vec, maxIterations = "max.auc") + i <- nb.line.search$line_search_result[, which.max(auc.after)] + min.aum.search <- aum::aum_line_search( + nb.diffs, pred.vec=pred.vec, maxIterations = "min.aum") + expected.step <- nb.line.search$line_search_result[ + which.min(aum), .(step.size, aum)] + computed.step <- min.aum.search$line_search_result[ + , .(step.size, aum)] + expect_equal(computed.step, expected.step) + if(interactive()&&require(ggplot2))plot(nb.line.search)+geom_point(aes(step.size,value),color="red",data=rbind(computed.step[, .(step.size, value=aum, panel="aum")], max.auc.search$line_search_result[, .(step.size, value=auc, panel="auc")])) +}) diff --git a/tests/testthat/test-CRAN-linear-model.R b/tests/testthat/test-CRAN-linear-model.R index 5d05383..20bc336 100644 --- a/tests/testthat/test-CRAN-linear-model.R +++ b/tests/testthat/test-CRAN-linear-model.R @@ -16,7 +16,7 @@ test_that("error when there are not enough data", { label.vec[seq(1, n.folds)] <- 1 diffs.dt <- aum::aum_diffs_binary(label.vec) -test_that("error when there are not enough data", { +test_that("model fitting works if at least as many data as folds", { model <- aum::aum_linear_model_cv(feature.mat, diffs.dt, n.folds=n.folds) expect_is(model, "aum_linear_model_cv") }) diff --git a/tests/testthat/test-CRAN.R b/tests/testthat/test-CRAN.R index 0999025..2bbb91b 100644 --- a/tests/testthat/test-CRAN.R +++ b/tests/testthat/test-CRAN.R @@ -189,28 +189,52 @@ test_that("4fp[2,3] 1fn[-1,-1] 2fp2fn[-2,-1]", { expect_equal(L$derivative_mat[3,], c(-2,-1)) }) -models <- diffs( - data.frame(fp_diff=0, - fn_diff=1, - pred =0)) -predictions <- c(1,0,0) +bad.fn.diffs <- diffs(data.frame( + fp_diff=1, + fn_diff=1, + pred =0)) +bad.fn.pred <- c(1,0,0) test_that("error for fn<0", { expect_error({ - aum::aum(models, predictions) + aum::aum(bad.fn.diffs, bad.fn.pred) }, "fn should be non-negative") }) -models <- diffs( - data.frame(fp_diff=-1, - fn_diff=0, - pred =0)) -predictions <- c(1,0,0) +test_that("line search error for negative max FN", { + line_search_input <- data.table( + fp.diff = bad.fn.diffs$fp_diff, + fn.diff = bad.fn.diffs$fn_diff, + intercept = bad.fn.diffs$pred, + slope = bad.fn.diffs$pred, + key=c("intercept","slope")) + expect_error({ + aum:::aumLineSearch(line_search_input, 0) + }, "max FN should be positive") +}) + +bad.fp.diffs <- diffs(data.frame( + fp_diff=-1, + fn_diff=-1, + pred =0)) +bad.fp.pred <- c(1,0,0) test_that("error for fp<0", { expect_error({ - aum::aum(models, predictions) + aum::aum(bad.fp.diffs, bad.fp.pred) }, "fp should be non-negative") }) +test_that("line search error for negative max FP", { + line_search_input <- data.table( + fp.diff = bad.fp.diffs$fp_diff, + fn.diff = bad.fp.diffs$fn_diff, + intercept = bad.fp.diffs$pred, + slope = bad.fp.diffs$pred, + key=c("intercept","slope")) + expect_error({ + aum:::aumLineSearch(line_search_input, 0) + }, "max FP should be positive") +}) + models <- diffs( data.frame(fp_diff=1, fn_diff=0,