maxIt=max.auc or min.aum #6

Merged 9 commits on May 12, 2023
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: aum
Type: Package
Title: Area Under Minimum of False Positives and Negatives
Version: 2023.4.4
Version: 2023.5.12
Authors@R: c(
person("Toby Dylan", "Hocking",
email="toby.hocking@r-project.org",
8 changes: 8 additions & 0 deletions NEWS
@@ -1,3 +1,11 @@
Changes in version 2023.5.12

- aum_linear_model/cv defaults to maxIterations="min.aum".

Changes in version 2023.4.7

- aum_line_search now accepts maxIterations="min.aum" or "max.auc".

Changes in version 2023.4.4

- removed some code so that checks run faster on CRAN.
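To illustrate the new maxIterations values, here is a minimal sketch using two binary examples; the predicted values are arbitrary (chosen so both examples start out mis-ranked) and this snippet is not part of the PR itself.

library(aum)
## one error-function breakpoint per example.
bin.diffs <- aum::aum_diffs_binary(c(0, 1))
## keep doing line search iterations until AUM would increase.
aum::aum_line_search(bin.diffs, pred.vec=c(10, -10), maxIterations="min.aum")
## or keep going until AUC would decrease.
aum::aum_line_search(bin.diffs, pred.vec=c(10, -10), maxIterations="max.auc")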
4 changes: 2 additions & 2 deletions R/aum_diffs.R
@@ -40,7 +40,7 @@ plot.aum_diffs <- function
ggplot2::geom_segment(ggplot2::aes(
min.pred, value,
xend=max.pred, yend=value,
color=variable, size=variable),
color=variable, linewidth=variable),
data=err.tall)+
ggplot2::geom_vline(ggplot2::aes(
xintercept=pred),
@@ -280,7 +280,7 @@ aum_diffs_penalty <- structure(function
geom_segment(aes(
-log(min.lambda), value,
xend=-log(max.lambda), yend=value,
color=variable, size=variable),
color=variable, linewidth=variable),
data=fn.not.zero.tall)+
geom_point(aes(
-log(min.lambda), value,
24 changes: 18 additions & 6 deletions R/aum_line_search.R
@@ -1,5 +1,8 @@
aum_line_search <- structure(function
### Exact line search.
### Exact line search using a C++ STL map (red-black tree) to
### implement a queue of line intersection events. If the number of
### rows of error.diff.df is B and the number of iterations is I, then
### the space complexity is O(B) and the time complexity is O((I+B) log B).
(error.diff.df,
### aum_diffs data frame with B rows, one for each breakpoint in
### example-specific error functions.
@@ -11,7 +14,9 @@ aum_line_search <- structure(function
### N-vector of numeric predicted values. If NULL, feature.mat and
### weight.vec will be used to compute predicted values.
maxIterations=nrow(error.diff.df)
### positive int: max number of line search iterations.
### max number of line search iterations: either a positive integer,
### or "max.auc" / "min.aum" to keep iterating until AUC decreases or
### AUM increases, respectively.
){
. <- fp.diff <- fn.diff <- intercept <- slope <- step.size <- NULL
## Above to suppress CRAN NOTE.
@@ -38,11 +43,18 @@ aum_line_search <- structure(function
fp.diff=sum(fp.diff),
fn.diff=sum(fn.diff)
), keyby=.(intercept, slope)]
if(identical(maxIterations, "max.auc"))maxIterations <- -1L
if(identical(maxIterations, "min.aum"))maxIterations <- 0L
line.search.all <- aumLineSearch(L$line_search_input, maxIterations)
L$line_search_result <- data.table(line.search.all)[0 <= step.size]
class(L) <- c("aum_line_search", class(L))
L
### List of class aum_line_search.
### List of class aum_line_search. The element named
### "line_search_result" is a data table: if maxIterations is a
### positive integer, it has maxIterations rows (info for all steps,
### with the q.size column giving the number of items in the queue at
### each iteration); otherwise it has one row (info for the best step,
### with q.size giving the total number of items popped off the queue).
}, ex=function(){

## Example 1: two binary data.
@@ -150,18 +162,18 @@ plot.aum_line_search <- function
data=hline.df)+
ggplot2::geom_line(ggplot2::aes(
step.size, aum),
size=1,
linewidth=1,
data=aum.df)+
ggplot2::geom_segment(ggplot2::aes(
min.step.size, auc,
xend=max.step.size, yend=auc),
size=1,
linewidth=1,
data=auc.segs)+
ggplot2::geom_segment(ggplot2::aes(
step.size, aum,
xend=step.after, yend=aum.after),
linetype="dotted",
size=1,
linewidth=1,
data=last.seg)+
ggplot2::geom_point(ggplot2::aes(
step.size, value),
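To make the return value documented above concrete, here is a small sketch (same two-example setup and arbitrary predictions as the NEWS illustration) contrasting an integer maxIterations with "min.aum".

library(aum)
bin.diffs <- aum::aum_diffs_binary(c(0, 1))
## positive integer: one row per iteration; q.size is the queue size at that iteration.
all.steps <- aum::aum_line_search(
  bin.diffs, pred.vec=c(10, -10), maxIterations=nrow(bin.diffs))
all.steps$line_search_result
## "min.aum": a single row for the best step; q.size is the total items popped off the queue.
best.step <- aum::aum_line_search(
  bin.diffs, pred.vec=c(10, -10), maxIterations="min.aum")
best.step$line_search_result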
56 changes: 46 additions & 10 deletions R/aum_linear_model.R
@@ -8,7 +8,7 @@ aum_linear_model_cv <- structure(function
### data table of differences in error functions, from
### aum_diffs_penalty or aum_diffs_binary. There should be an example
### column with values from 0 to N-1.
maxIterations=nrow(feature.mat),
maxIterations="min.aum",
### max iterations of the exact line search; the default "min.aum"
### means keep iterating until AUM increases.
improvement.thresh=NULL,
### before doing cross-validation to learn the number of gradient
@@ -92,6 +92,7 @@ aum_linear_model_cv <- structure(function
initial.weight.fun=initial.weight.fun,
max.steps=best.row$step.number,
maxIterations=maxIterations)
final.model$min.valid.aum <- best.row
final.model$fold.loss <- fold.loss
final.model$set.loss <- set.loss
final.model$keep <- keep
@@ -111,15 +112,46 @@ aum_linear_model_cv <- structure(function
}, ex=function(){

## simulated binary classification problem.
N.rows <- 50
N.rows <- 60
N.cols <- 2
set.seed(1)
feature.mat <- matrix(rnorm(N.rows*N.cols), N.rows, N.cols)
unknown.score <- feature.mat[,1]*2.1 + rnorm(N.rows)
label.vec <- ifelse(unknown.score > 0, 1, 0)
diffs.dt <- aum::aum_diffs_binary(label.vec)
model <- aum::aum_linear_model_cv(feature.mat, diffs.dt)
plot(model)

## The default line search keeps iterating until AUM increases.
(default.time <- system.time({
default.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt)
}))
plot(default.model)
print(default.valid <- default.model[["set.loss"]][set=="validation"])
print(default.model[["search"]][, .(step.size, aum, iterations=q.size)])

## Can specify max number of iterations of line search.
(small.step.time <- system.time({
small.step.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt, maxIterations = N.rows)
}))
plot(small.step.model)
print(small.step.valid <- small.step.model[["set.loss"]][set=="validation"])
small.step.model[["search"]][, .(step.size, aum, iterations=q.size)]

## Compare number of steps, iterations, and time. On my machine the
## small step model takes more time/steps, but fewer iterations in
## the C++ line search code.
cbind(
iterations=c(
default=default.model[["search"]][, sum(q.size)],
small.step=small.step.model[["search"]][, sum(q.size)]),
seconds=c(
default.time[["elapsed"]],
small.step.time[["elapsed"]]),
steps=c(
default.model[["min.valid.aum"]][["step.number"]],
small.step.model[["min.valid.aum"]][["step.number"]]),
min.valid.aum=c(
default.model[["min.valid.aum"]][["aum_mean"]],
small.step.model[["min.valid.aum"]][["aum_mean"]]))

})

@@ -153,7 +185,7 @@ aum_linear_model <- function
### non-negative real number: keep doing gradient descent while the
### improvement in AUM is greater than this number (specify either
### this or max.steps, not both).
maxIterations=nrow(feature.list$subtrain),
maxIterations="min.aum",
### max number of iterations of exact line search; the default
### "min.aum" means keep iterating until AUM increases.
initial.weight.fun=NULL
@@ -170,6 +202,7 @@ aum_linear_model <- function
improvement <- old.aum <- Inf
step.number <- 0
loss.dt.list <- list()
search.dt.list <- list()
while({
search.result <- aum::aum_line_search(
diff.list$subtrain,
@@ -191,6 +224,7 @@ aum_linear_model <- function
}
exact.dt <- data.table(search.result$line_search_result)
best.row <- exact.dt[which.min(aum)]
search.dt.list[[paste(step.number)]] <- best.row
improvement <- old.aum-best.row$aum
old.aum <- best.row$aum
if(!is.null(improvement.thresh)){
@@ -217,15 +251,17 @@
}else{
mean(thresh[c(best-1,best)])
}
}])
}],
search=rbindlist(search.dt.list))
structure(out.list, class="aum_linear_model")
### Linear model represented as a list of class aum_linear_model with
### named elements: loss is a data table of values for subtrain and
### optionally validation at each step, weight.vec is the final vector
### of weights learned via gradient descent, and intercept is the
### value which results in minimal total error (FP+FN), learned via a
### linear scan over all possible values given the final weight
### vector.
### of weights learned via gradient descent, intercept is the value
### which results in minimal total error (FP+FN), learned via a linear
### scan over all possible values given the final weight vector, and
### search is a data table with one row for each step (best step size
### and number of iterations of line search).
}

plot.aum_linear_model <- function(x, ...){
2 changes: 1 addition & 1 deletion man/aum_diffs_penalty.Rd
@@ -69,7 +69,7 @@ if(require("ggplot2")){
geom_segment(aes(
-log(min.lambda), value,
xend=-log(max.lambda), yend=value,
color=variable, size=variable),
color=variable, linewidth=variable),
data=fn.not.zero.tall)+
geom_point(aes(
-log(min.lambda), value,
16 changes: 13 additions & 3 deletions man/aum_line_search.Rd
@@ -1,7 +1,10 @@
\name{aum_line_search}
\alias{aum_line_search}
\title{aum line search}
\description{Exact line search.}
\description{Exact line search using a C++ STL map (red-black tree) to
implement a queue of line intersection events. If the number of rows
of \code{error.diff.df} is B and the number of iterations is I, then the
space complexity is O(B) and the time complexity is O((I+B) log B).}
\usage{aum_line_search(error.diff.df,
feature.mat, weight.vec,
pred.vec = NULL,
@@ -13,10 +16,17 @@ example-specific error functions.}
\item{weight.vec}{p-vector of numeric linear model coefficients.}
\item{pred.vec}{N-vector of numeric predicted values. If NULL, \code{feature.mat} and
\code{weight.vec} will be used to compute predicted values.}
\item{maxIterations}{positive int: max number of line search iterations.}
\item{maxIterations}{max number of line search iterations: either a positive integer,
or "max.auc" / "min.aum" to keep iterating until AUC decreases or
AUM increases, respectively.}
}

\value{List of class aum_line_search.}
\value{List of class aum_line_search. The element named
"line_search_result" is a data table: if \code{maxIterations} is a
positive integer, it has \code{maxIterations} rows (info for all steps,
with the q.size column giving the number of items in the queue at
each iteration); otherwise it has one row (info for the best step,
with q.size giving the total number of items popped off the queue).}

\author{Toby Dylan Hocking <toby.hocking@r-project.org> [aut, cre], Jadon Fowler [aut] (Contributed exact line search C++ code)}

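The complexity statement in the description above can be checked empirically. The sketch below (random labels and predictions, not from the package docs) times the search for increasing numbers of breakpoints B, with maxIterations=B, so run time should grow roughly like B log B.

library(aum)
elapsed.seconds <- sapply(c(100, 1000, 10000), function(B){
  set.seed(1)
  label.vec <- rbinom(B, size=1, prob=0.5)   # B random binary labels.
  diffs <- aum::aum_diffs_binary(label.vec)  # one breakpoint per example, so B rows.
  pred.vec <- rnorm(B)                       # arbitrary predicted values.
  system.time({
    aum::aum_line_search(diffs, pred.vec=pred.vec, maxIterations=B)
  })[["elapsed"]]
})
elapsed.seconds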
11 changes: 6 additions & 5 deletions man/aum_linear_model.Rd
@@ -7,7 +7,7 @@ descent with exact line search.}
\usage{aum_linear_model(feature.list,
diff.list, max.steps = NULL,
improvement.thresh = NULL,
maxIterations = nrow(feature.list$subtrain),
maxIterations = "min.aum",
initial.weight.fun = NULL)}
\arguments{
\item{feature.list}{List with named elements subtrain and optionally validation, each
@@ -29,10 +29,11 @@ random standard normal vector.}
\value{Linear model represented as a list of class aum_linear_model with
named elements: loss is a data table of values for subtrain and
optionally validation at each step, weight.vec is the final vector
of weights learned via gradient descent, and intercept is the
value which results in minimal total error (FP+FN), learned via a
linear scan over all possible values given the final weight
vector.}
of weights learned via gradient descent, intercept is the value
which results in minimal total error (FP+FN), learned via a linear
scan over all possible values given the final weight vector, and
search is a data table with one row for each step (best step size
and number of iterations of line search).}

\author{Toby Dylan Hocking <toby.hocking@r-project.org> [aut, cre], Jadon Fowler [aut] (Contributed exact line search C++ code)}

39 changes: 35 additions & 4 deletions man/aum_linear_model_cv.Rd
@@ -5,7 +5,7 @@
descent steps with exact line search, in linear model for
minimizing AUM.}
\usage{aum_linear_model_cv(feature.mat,
diff.dt, maxIterations = nrow(feature.mat),
diff.dt, maxIterations = "min.aum",
improvement.thresh = NULL,
n.folds = 3, initial.weight.fun = NULL)}
\arguments{
@@ -43,14 +43,45 @@ selecting the best number of gradient descent steps.}
\examples{

## simulated binary classification problem.
N.rows <- 50
N.rows <- 60
N.cols <- 2
set.seed(1)
feature.mat <- matrix(rnorm(N.rows*N.cols), N.rows, N.cols)
unknown.score <- feature.mat[,1]*2.1 + rnorm(N.rows)
label.vec <- ifelse(unknown.score > 0, 1, 0)
diffs.dt <- aum::aum_diffs_binary(label.vec)
model <- aum::aum_linear_model_cv(feature.mat, diffs.dt)
plot(model)

## The default line search keeps iterating until AUM increases.
(default.time <- system.time({
default.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt)
}))
plot(default.model)
print(default.valid <- default.model[["set.loss"]][set=="validation"])
print(default.model[["search"]][, .(step.size, aum, iterations=q.size)])

## Can specify max number of iterations of line search.
(small.step.time <- system.time({
small.step.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt, maxIterations = N.rows)
}))
plot(small.step.model)
print(small.step.valid <- small.step.model[["set.loss"]][set=="validation"])
small.step.model[["search"]][, .(step.size, aum, iterations=q.size)]

## Compare number of steps, iterations, and time. On my machine the
## small step model takes more time/steps, but fewer iterations in
## the C++ line search code.
cbind(
iterations=c(
default=default.model[["search"]][, sum(q.size)],
small.step=small.step.model[["search"]][, sum(q.size)]),
seconds=c(
default.time[["elapsed"]],
small.step.time[["elapsed"]]),
steps=c(
default.model[["min.valid.aum"]][["step.number"]],
small.step.model[["min.valid.aum"]][["step.number"]]),
min.valid.aum=c(
default.model[["min.valid.aum"]][["aum_mean"]],
small.step.model[["min.valid.aum"]][["aum_mean"]]))

}