maxIt=max.auc or min.aum #6

Merged 9 commits on May 12, 2023
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: aum
Type: Package
Title: Area Under Minimum of False Positives and Negatives
Version: 2023.4.4
Version: 2023.5.12
Authors@R: c(
person("Toby Dylan", "Hocking",
email="toby.hocking@r-project.org",
8 changes: 8 additions & 0 deletions NEWS
@@ -1,3 +1,11 @@
Changes in version 2023.5.12

- aum_linear_model/cv defaults to maxIterations="min.aum".

Changes in version 2023.4.7

- aum_line_search now accepts maxIterations="min.aum" or "max.auc".

Changes in version 2023.4.4

- removed some code so that checks run faster on CRAN.
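To illustrate the new maxIterations values, here is a minimal sketch using two binary examples; the predicted values are arbitrary (chosen so both examples start out mis-ranked) and this snippet is not part of the PR itself.

library(aum)
## one error-function breakpoint per example.
bin.diffs <- aum::aum_diffs_binary(c(0, 1))
## keep doing line search iterations until AUM would increase.
aum::aum_line_search(bin.diffs, pred.vec=c(10, -10), maxIterations="min.aum")
## or keep going until AUC would decrease.
aum::aum_line_search(bin.diffs, pred.vec=c(10, -10), maxIterations="max.auc")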
4 changes: 2 additions & 2 deletions R/aum_diffs.R
@@ -40,7 +40,7 @@ plot.aum_diffs <- function
ggplot2::geom_segment(ggplot2::aes(
min.pred, value,
xend=max.pred, yend=value,
color=variable, size=variable),
color=variable, linewidth=variable),
data=err.tall)+
ggplot2::geom_vline(ggplot2::aes(
xintercept=pred),
@@ -280,7 +280,7 @@ aum_diffs_penalty <- structure(function
geom_segment(aes(
-log(min.lambda), value,
xend=-log(max.lambda), yend=value,
color=variable, size=variable),
color=variable, linewidth=variable),
data=fn.not.zero.tall)+
geom_point(aes(
-log(min.lambda), value,
24 changes: 18 additions & 6 deletions R/aum_line_search.R
@@ -1,5 +1,8 @@
aum_line_search <- structure(function
### Exact line search.
### Exact line search using a C++ STL map (red-black tree) to
### implement a queue of line intersection events. If the number of
### rows of error.diff.df is B and the number of iterations is I, then
### the space complexity is O(B) and the time complexity is O((I+B) log B).
(error.diff.df,
### aum_diffs data frame with B rows, one for each breakpoint in
### example-specific error functions.
@@ -11,7 +14,9 @@ aum_line_search <- structure(function
### N-vector of numeric predicted values. If NULL, feature.mat and
### weight.vec will be used to compute predicted values.
maxIterations=nrow(error.diff.df)
### positive int: max number of line search iterations.
### max number of line search iterations: either a positive integer,
### or "max.auc" / "min.aum" to keep iterating until AUC decreases or
### AUM increases, respectively.
){
. <- fp.diff <- fn.diff <- intercept <- slope <- step.size <- NULL
## Above to suppress CRAN NOTE.
@@ -38,11 +43,18 @@ aum_line_search <- structure(function
fp.diff=sum(fp.diff),
fn.diff=sum(fn.diff)
), keyby=.(intercept, slope)]
if(identical(maxIterations, "max.auc"))maxIterations <- -1L
if(identical(maxIterations, "min.aum"))maxIterations <- 0L
line.search.all <- aumLineSearch(L$line_search_input, maxIterations)
L$line_search_result <- data.table(line.search.all)[0 <= step.size]
class(L) <- c("aum_line_search", class(L))
L
### List of class aum_line_search.
### List of class aum_line_search. The element named
### "line_search_result" is a data table: if maxIterations is a
### positive integer, it has maxIterations rows (info for all steps,
### with the q.size column giving the number of items in the queue at
### each iteration); otherwise it has one row (info for the best step,
### with q.size giving the total number of items popped off the queue).
}, ex=function(){

## Example 1: two binary data.
@@ -150,18 +162,18 @@ plot.aum_line_search <- function
data=hline.df)+
ggplot2::geom_line(ggplot2::aes(
step.size, aum),
size=1,
linewidth=1,
data=aum.df)+
ggplot2::geom_segment(ggplot2::aes(
min.step.size, auc,
xend=max.step.size, yend=auc),
size=1,
linewidth=1,
data=auc.segs)+
ggplot2::geom_segment(ggplot2::aes(
step.size, aum,
xend=step.after, yend=aum.after),
linetype="dotted",
size=1,
linewidth=1,
data=last.seg)+
ggplot2::geom_point(ggplot2::aes(
step.size, value),
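To make the return value documented above concrete, here is a small sketch (same two-example setup and arbitrary predictions as the NEWS illustration) contrasting an integer maxIterations with "min.aum".

library(aum)
bin.diffs <- aum::aum_diffs_binary(c(0, 1))
## positive integer: one row per iteration; q.size is the queue size at that iteration.
all.steps <- aum::aum_line_search(
  bin.diffs, pred.vec=c(10, -10), maxIterations=nrow(bin.diffs))
all.steps$line_search_result
## "min.aum": a single row for the best step; q.size is the total items popped off the queue.
best.step <- aum::aum_line_search(
  bin.diffs, pred.vec=c(10, -10), maxIterations="min.aum")
best.step$line_search_result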
56 changes: 46 additions & 10 deletions R/aum_linear_model.R
@@ -8,7 +8,7 @@ aum_linear_model_cv <- structure(function
### data table of differences in error functions, from
### aum_diffs_penalty or aum_diffs_binary. There should be an example
### column with values from 0 to N-1.
maxIterations=nrow(feature.mat),
maxIterations="min.aum",
### max iterations of the exact line search; the default "min.aum"
### means keep iterating until AUM increases.
improvement.thresh=NULL,
### before doing cross-validation to learn the number of gradient
@@ -92,6 +92,7 @@ aum_linear_model_cv <- structure(function
initial.weight.fun=initial.weight.fun,
max.steps=best.row$step.number,
maxIterations=maxIterations)
final.model$min.valid.aum <- best.row
final.model$fold.loss <- fold.loss
final.model$set.loss <- set.loss
final.model$keep <- keep
@@ -111,15 +112,46 @@ aum_linear_model_cv <- structure(function
}, ex=function(){

## simulated binary classification problem.
N.rows <- 50
N.rows <- 60
N.cols <- 2
set.seed(1)
feature.mat <- matrix(rnorm(N.rows*N.cols), N.rows, N.cols)
unknown.score <- feature.mat[,1]*2.1 + rnorm(N.rows)
label.vec <- ifelse(unknown.score > 0, 1, 0)
diffs.dt <- aum::aum_diffs_binary(label.vec)
model <- aum::aum_linear_model_cv(feature.mat, diffs.dt)
plot(model)

## The default line search keeps iterating until AUM increases.
(default.time <- system.time({
default.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt)
}))
plot(default.model)
print(default.valid <- default.model[["set.loss"]][set=="validation"])
print(default.model[["search"]][, .(step.size, aum, iterations=q.size)])

## Can specify max number of iterations of line search.
(small.step.time <- system.time({
small.step.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt, maxIterations = N.rows)
}))
plot(small.step.model)
print(small.step.valid <- small.step.model[["set.loss"]][set=="validation"])
small.step.model[["search"]][, .(step.size, aum, iterations=q.size)]

## Compare number of steps, iterations, and time. On my machine the
## small step model takes more time/steps, but fewer iterations in
## the C++ line search code.
cbind(
iterations=c(
default=default.model[["search"]][, sum(q.size)],
small.step=small.step.model[["search"]][, sum(q.size)]),
seconds=c(
default.time[["elapsed"]],
small.step.time[["elapsed"]]),
steps=c(
default.model[["min.valid.aum"]][["step.number"]],
small.step.model[["min.valid.aum"]][["step.number"]]),
min.valid.aum=c(
default.model[["min.valid.aum"]][["aum_mean"]],
small.step.model[["min.valid.aum"]][["aum_mean"]]))

})

@@ -153,7 +185,7 @@ aum_linear_model <- function
### non-negative real number: keep doing gradient descent while the
### improvement in AUM is greater than this number (specify either
### this or max.steps, not both).
maxIterations=nrow(feature.list$subtrain),
maxIterations="min.aum",
### max number of iterations of exact line search; the default
### "min.aum" means keep iterating until AUM increases.
initial.weight.fun=NULL
@@ -170,6 +202,7 @@ aum_linear_model <- function
improvement <- old.aum <- Inf
step.number <- 0
loss.dt.list <- list()
search.dt.list <- list()
while({
search.result <- aum::aum_line_search(
diff.list$subtrain,
@@ -191,6 +224,7 @@ aum_linear_model <- function
}
exact.dt <- data.table(search.result$line_search_result)
best.row <- exact.dt[which.min(aum)]
search.dt.list[[paste(step.number)]] <- best.row
improvement <- old.aum-best.row$aum
old.aum <- best.row$aum
if(!is.null(improvement.thresh)){
@@ -217,15 +251,17 @@
}else{
mean(thresh[c(best-1,best)])
}
}])
}],
search=rbindlist(search.dt.list))
structure(out.list, class="aum_linear_model")
### Linear model represented as a list of class aum_linear_model with
### named elements: loss is a data table of values for subtrain and
### optionally validation at each step, weight.vec is the final vector
### of weights learned via gradient descent, and intercept is the
### value which results in minimal total error (FP+FN), learned via a
### linear scan over all possible values given the final weight
### vector.
### of weights learned via gradient descent, intercept is the value
### which results in minimal total error (FP+FN), learned via a linear
### scan over all possible values given the final weight vector, and
### search is a data table with one row for each step (best step size
### and number of iterations of line search).
}

plot.aum_linear_model <- function(x, ...){
2 changes: 1 addition & 1 deletion man/aum_diffs_penalty.Rd
@@ -69,7 +69,7 @@ if(require("ggplot2")){
geom_segment(aes(
-log(min.lambda), value,
xend=-log(max.lambda), yend=value,
color=variable, size=variable),
color=variable, linewidth=variable),
data=fn.not.zero.tall)+
geom_point(aes(
-log(min.lambda), value,
16 changes: 13 additions & 3 deletions man/aum_line_search.Rd
@@ -1,7 +1,10 @@
\name{aum_line_search}
\alias{aum_line_search}
\title{aum line search}
\description{Exact line search.}
\description{Exact line search using a C++ STL map (red-black tree) to
implement a queue of line intersection events. If the number of rows
of \code{error.diff.df} is B and the number of iterations is I, then the
space complexity is O(B) and the time complexity is O((I+B) log B).}
\usage{aum_line_search(error.diff.df,
feature.mat, weight.vec,
pred.vec = NULL,
@@ -13,10 +16,17 @@ example-specific error functions.}
\item{weight.vec}{p-vector of numeric linear model coefficients.}
\item{pred.vec}{N-vector of numeric predicted values. If NULL, \code{feature.mat} and
\code{weight.vec} will be used to compute predicted values.}
\item{maxIterations}{positive int: max number of line search iterations.}
\item{maxIterations}{max number of line search iterations: either a positive integer,
or "max.auc" / "min.aum" to keep iterating until AUC decreases or
AUM increases, respectively.}
}

\value{List of class aum_line_search.}
\value{List of class aum_line_search. The element named
"line_search_result" is a data table: if \code{maxIterations} is a
positive integer, it has \code{maxIterations} rows (info for all steps,
with the q.size column giving the number of items in the queue at
each iteration); otherwise it has one row (info for the best step,
with q.size giving the total number of items popped off the queue).}

\author{Toby Dylan Hocking <toby.hocking@r-project.org> [aut, cre], Jadon Fowler [aut] (Contributed exact line search C++ code)}

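The complexity statement in the description above can be checked empirically. The sketch below (random labels and predictions, not from the package docs) times the search for increasing numbers of breakpoints B, with maxIterations=B, so run time should grow roughly like B log B.

library(aum)
elapsed.seconds <- sapply(c(100, 1000, 10000), function(B){
  set.seed(1)
  label.vec <- rbinom(B, size=1, prob=0.5)   # B random binary labels.
  diffs <- aum::aum_diffs_binary(label.vec)  # one breakpoint per example, so B rows.
  pred.vec <- rnorm(B)                       # arbitrary predicted values.
  system.time({
    aum::aum_line_search(diffs, pred.vec=pred.vec, maxIterations=B)
  })[["elapsed"]]
})
elapsed.seconds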
11 changes: 6 additions & 5 deletions man/aum_linear_model.Rd
@@ -7,7 +7,7 @@ descent with exact line search.}
\usage{aum_linear_model(feature.list,
diff.list, max.steps = NULL,
improvement.thresh = NULL,
maxIterations = nrow(feature.list$subtrain),
maxIterations = "min.aum",
initial.weight.fun = NULL)}
\arguments{
\item{feature.list}{List with named elements subtrain and optionally validation, each
@@ -29,10 +29,11 @@ random standard normal vector.}
\value{Linear model represented as a list of class aum_linear_model with
named elements: loss is a data table of values for subtrain and
optionally validation at each step, weight.vec is the final vector
of weights learned via gradient descent, and intercept is the
value which results in minimal total error (FP+FN), learned via a
linear scan over all possible values given the final weight
vector.}
of weights learned via gradient descent, intercept is the value
which results in minimal total error (FP+FN), learned via a linear
scan over all possible values given the final weight vector, and
search is a data table with one row for each step (best step size
and number of iterations of line search).}

\author{Toby Dylan Hocking <toby.hocking@r-project.org> [aut, cre], Jadon Fowler [aut] (Contributed exact line search C++ code)}

39 changes: 35 additions & 4 deletions man/aum_linear_model_cv.Rd
@@ -5,7 +5,7 @@
descent steps with exact line search, in linear model for
minimizing AUM.}
\usage{aum_linear_model_cv(feature.mat,
diff.dt, maxIterations = nrow(feature.mat),
diff.dt, maxIterations = "min.aum",
improvement.thresh = NULL,
n.folds = 3, initial.weight.fun = NULL)}
\arguments{
@@ -43,14 +43,45 @@ selecting the best number of gradient descent steps.}
\examples{

## simulated binary classification problem.
N.rows <- 50
N.rows <- 60
N.cols <- 2
set.seed(1)
feature.mat <- matrix(rnorm(N.rows*N.cols), N.rows, N.cols)
unknown.score <- feature.mat[,1]*2.1 + rnorm(N.rows)
label.vec <- ifelse(unknown.score > 0, 1, 0)
diffs.dt <- aum::aum_diffs_binary(label.vec)
model <- aum::aum_linear_model_cv(feature.mat, diffs.dt)
plot(model)

## The default line search keeps iterating until AUM increases.
(default.time <- system.time({
default.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt)
}))
plot(default.model)
print(default.valid <- default.model[["set.loss"]][set=="validation"])
print(default.model[["search"]][, .(step.size, aum, iterations=q.size)])

## Can specify max number of iterations of line search.
(small.step.time <- system.time({
small.step.model <- aum::aum_linear_model_cv(feature.mat, diffs.dt, maxIterations = N.rows)
}))
plot(small.step.model)
print(small.step.valid <- small.step.model[["set.loss"]][set=="validation"])
small.step.model[["search"]][, .(step.size, aum, iterations=q.size)]

## Compare number of steps, iterations, and time. On my machine the
## small step model takes more time/steps, but fewer iterations in
## the C++ line search code.
cbind(
iterations=c(
default=default.model[["search"]][, sum(q.size)],
small.step=small.step.model[["search"]][, sum(q.size)]),
seconds=c(
default.time[["elapsed"]],
small.step.time[["elapsed"]]),
steps=c(
default.model[["min.valid.aum"]][["step.number"]],
small.step.model[["min.valid.aum"]][["step.number"]]),
min.valid.aum=c(
default.model[["min.valid.aum"]][["aum_mean"]],
small.step.model[["min.valid.aum"]][["aum_mean"]]))

}