\documentclass[t]{beamer}
%\usepackage[T1]{fontenc} % use 8-bit T1 fonts
%\usepackage{pdffonts}
\usepackage{tikz}
\usepackage[all]{xy}
\usepackage{amsmath,amssymb}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage[noend]{algcompatible}
\usepackage{multirow}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\Lik}{Lik}
\DeclareMathOperator*{\PoissonLoss}{PoissonLoss}
\DeclareMathOperator*{\Peaks}{Peaks}
\DeclareMathOperator*{\Segments}{Segments}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\maximize}{maximize}
\DeclareMathOperator*{\minimize}{minimize}
\newcommand{\sign}{\operatorname{sign}}
\newcommand{\RR}{\mathbb R}
\newcommand{\ZZ}{\mathbb Z}
\newcommand{\NN}{\mathbb N}
\newcommand{\z}{$z = 2, 4, 3, 5, 1$}
\newcommand{\algo}[1]{\textcolor{#1}{#1}}
\definecolor{PDPA}{HTML}{66C2A5}
\definecolor{CDPA}{HTML}{FC8D62}
\definecolor{GPDPA}{HTML}{4D4D4D}
% Set transparency of non-highlighted sections in the table of
% contents slide.
\setbeamertemplate{section in toc shaded}[default][100]
\AtBeginSection[]
{
\setbeamercolor{section in toc}{fg=red}
\setbeamercolor{section in toc shaded}{fg=black}
\begin{frame}
\tableofcontents[currentsection]
\end{frame}
}
\begin{document}
\title{Sort-based gradient descent learning algorithms for ROC curve optimization}
\author{
Toby Dylan Hocking --- toby.dylan.hocking@usherbrooke.ca\\
joint work with former master student Jadon Fowler\\
Learning Algorithms, Statistical Software, Optimization (LASSO Lab) --- \url{https://lassolab.org}\\
Département d'Informatique, Université de Sherbrooke\\
\includegraphics[height=5cm]{2023-02-02-group-meeting} \\
}
\date{}
\maketitle
\section{Problem Setting 1: ROC curves for evaluating supervised binary classification algorithms}
\begin{frame}
\frametitle{Problem: supervised binary classification}
\begin{itemize}
\item Given pairs of inputs $\mathbf x\in\mathbb R^p$ and outputs
  $y\in\{0,1\}$, can we learn a scoring function
  $f(\mathbf x)\in\mathbb R$, and predict $y=1$ when $f(\mathbf x)>0$?
\item Example: email, $\mathbf x =$ bag of words, $y=$ spam or not.
\item Example: images. Jones {\it et al.} PNAS 2009.
\parbox{2in}{\includegraphics[width=2in]{cellprofiler}}
\parbox{1.9in}{Most algorithms (SVM, logistic regression, etc.) minimize a differentiable surrogate of the zero-one loss = sum of:\\
\textbf{False positives:} $f(\mathbf x)>0$ but $y=0$ (predict
budding, but cell is not).\\
\textbf{False negatives:} $f(\mathbf x)<0$ but $y=1$ (predict
not budding, but cell is). }
\end{itemize}
\end{frame}
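\begin{frame}[fragile]
\frametitle{Counting the two error types in code}
A minimal sketch (hypothetical scores and labels, not data from the paper) of the false positive and false negative counts defined on the previous slide:
\begin{verbatim}
import torch
scores = torch.tensor([2.0, -3.5, -1.0, 1.5])
labels = torch.tensor([0, 0, 1, 1])
pred = scores > 0                  # predict y=1 when f(x)>0
FP = (pred & (labels == 0)).sum()  # predicted 1, truth 0
FN = (~pred & (labels == 1)).sum() # predicted 0, truth 1
print(FP.item(), FN.item())        # 1 1
\end{verbatim}
\end{frame}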
\begin{frame}
\frametitle{ROC curves for evaluation, especially useful with imbalance}
\includegraphics[width=0.65\textwidth]{figure-batchtools-expired-earth-roc}
\begin{itemize}
\item At default FPR=24\% (D), glmnet has fewer errors.
\item At FPR=4\%, xgboost has fewer errors.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Receiver Operating Characteristic (ROC) Curves}
\begin{itemize}
\item Classic evaluation method from the signal processing
literature (Egan and Egan, 1975).
\item ROC curve of a learned $f$ is a plot of True
  Positive Rate (TPR) vs False Positive Rate (FPR): each point on the
  ROC curve corresponds to a different constant $c\in\mathbb R$ added
  to the predicted values: $f(\mathbf x)+c$.
\item $c=\infty$ means always predict positive label (FPR=TPR=1).
\item $c=-\infty$ means always predict negative label (FPR=TPR=0).
\item Best classifier has a point near upper left (TPR=1, FPR=0), with large
Area Under the Curve (AUC).
% \item Proposed idea: a new surrogate for AUC that is differentiable,
% so can be used for gradient descent learning.
\end{itemize}
\includegraphics[width=0.9\textwidth]{figure-more-than-one-new-binary}
\end{frame}
\begin{frame}
\frametitle{Research question and new idea}
Can we learn a binary classification function $f$ which directly
optimizes the ROC curve?
\begin{itemize}
\item Most algorithms involve minimizing a differentiable surrogate
of the zero-one loss, which is not the same.
\item The Area Under the ROC Curve (AUC) is piecewise constant
  (gradient zero almost everywhere), so it cannot be used with
  gradient descent algorithms.
\item We proposed (Hocking, Hillman 2023) to encourage points to be in the upper left of ROC
space, using a loss function which is a differentiable surrogate
of the sum of min(FPR,FNR).
\end{itemize}
\includegraphics[width=\textwidth]{figure-more-than-one-new-binary-heat}
\end{frame}
\section{Proposed AUM loss, a differentiable surrogate for ROC curve optimization (JMLR'23)}
\begin{frame}
\frametitle{Gradients of sample-based loss are influenced by imbalance}
\includegraphics[width=\textwidth]{roc-gradient-arrows}
\begin{itemize}
\item Left: some imbalance, $\approx$20\% positive labels; gradient 4x
  stronger along the X axis / False Positive Rate.
\item Right: large imbalance, $\approx$0.1\% positive labels; gradient 1000x
  stronger along the X axis / False Positive Rate. (True Positive / Y
  axis gradients essentially ignored.)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Gradients using balanced sample weights, proposed loss}
\includegraphics[width=\textwidth]{roc-gradient-arrows-proposed}
\begin{itemize}
\item Left: with sample weights=1, the gradient is 4x stronger along
  the X axis. Balanced sample weights give equal influence to
  gradients along both axes, based on the current prediction threshold.
\item Right: proposed method computes gradients based on all ROC
points, not just the current prediction threshold.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Comparing proposed loss with baselines}
\begin{itemize}
\small
\item Classic baselines: hinge and logistic loss, sum over samples, $\ell[yf(x)]$.
\item Bamber (1975) proved ROC-AUC relation to Mann-Whitney
U statistic (double sum over all pairs of positive and negative
samples).
\item Recently: $\text{SVM}^{\text{struct}}$ (Joachims 2005),
X-risk (Yang 2022), %https://arxiv.org/abs/2206.00439
All Pairs Squared Hinge (Rust and Hocking 2023),
sum loss over pairs of positive and negative samples, $\ell[f(x^+)-f(x^-)]$.
\item Proposed: sort-based AUM loss (sum over points on ROC curve).
\item Figure below: loss for two samples: one positive, one negative.
\end{itemize}
\includegraphics[width=0.7\textwidth]{figure-compare-hinge-loss-contours-logistic.png}
Barr, Hocking, Morton, Thatcher, Shaw, \emph{TransAI} (2022).
%AUM loss here = linear hinge with no margin, summed over pairs.
\end{frame}
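\begin{frame}[fragile]
\frametitle{Sketch: AUC as a sum over pairs (Mann-Whitney)}
A minimal sketch of the Bamber (1975) relation above: AUC equals the fraction of positive/negative pairs ranked correctly (ties count 1/2). Tensors are illustrative, not data from the paper.
\begin{verbatim}
import torch
pred = torch.tensor([2.0, -3.5, -1.0, 1.5])
y = torch.tensor([0, 0, 1, 1])
pos, neg = pred[y == 1], pred[y == 0]
diff = pos.reshape(-1, 1) - neg.reshape(1, -1)  # all pairs
auc = ((diff > 0).float() + 0.5*(diff == 0).float()).mean()
print(auc)  # tensor(0.5000)
\end{verbatim}
The pairwise losses cited above replace the indicator $I[f(x^+)>f(x^-)]$ with a differentiable $\ell[f(x^+)-f(x^-)]$.
\end{frame}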
\begin{frame}
\frametitle{Large AUC $\approx$ small Area Under Min(FP,FN) (AUM)}
\small
\includegraphics[height=1.3in]{figure-more-than-one-new-binary-heat}
Above: in the purple heat map, the numbers near the dots give the distance
to the top or left of ROC space, which equals the black min error rate functions below.
\includegraphics[height=1.3in]{figure-more-than-one-new-binary-aum-rate}
Hocking, Hillman, \emph{Journal of Machine Learning Research} (2023).
%Proposal: track how thresholds in error plot change with step size.
\end{frame}
\begin{frame}
\frametitle{Computing Sum of Min (SM)}
\includegraphics[width=\textwidth]{figure-more-than-one-new-binary-aum-rate}
\vskip -0.2cm
\begin{itemize}
\item For $N$ samples, there are $\leq N+1$ points on the ROC curve,
\item with sorted thresholds $T_1\leq\cdots\leq T_N\in\mathbb R$,
\item and corresponding min error values $M_1,\dots,M_N$.
\item Then if $I$ is the indicator function, we can write the sum of
the min (SM), over all ROC points, as:
\end{itemize}
\begin{equation*}
\label{eq:SM-computation}
\text{SM} =
\sum_{i=2}^{N}
I[ T_{i} \neq T_{i-1} ]
M_i =
\sum_{i:T_{i} \neq T_{i-1} }
M_i.
\end{equation*}
($\neq$ required: a tie $T_i=T_{i-1}$ deletes a point from the ROC curve)
\end{frame}
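\begin{frame}[fragile]
\frametitle{Sketch: computing SM from sorted thresholds}
A minimal sketch of the SM formula above (hypothetical threshold and min error values, not from the paper):
\begin{verbatim}
import torch
T = torch.tensor([-2.0, -1.5, -1.5, 1.0]) # sorted thresholds
M = torch.tensor([ 0.5,  0.5,  0.5, 0.0]) # min error values
keep = T[1:] != T[:-1]  # a tie T_i = T_{i-1} deletes a point
SM = M[1:][keep].sum()
print(SM)  # tensor(0.5000)
\end{verbatim}
\end{frame}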
\begin{frame}
\frametitle{Computing Area Under Min (AUM)}
\includegraphics[width=\textwidth]{figure-more-than-one-new-binary-aum-rate}
The AUM can be interpreted as an L1 relaxation of SM,
\begin{eqnarray*}
\text{SM} &=&
\sum_{i=2}^{N}
I[ T_{i} \neq T_{i-1} ]
M_i =
\sum_{i:T_{i} \neq T_{i-1} }
M_i.
\\
\text{AUM} &=&
\sum_{i=2}^{N}
[ T_{i} - T_{i-1} ]
M_i.
\end{eqnarray*}
AUM is therefore a surrogate loss for ROC optimization, and it is differentiable almost everywhere $\Rightarrow$ gradient descent learning!
\end{frame}
\begin{frame}[fragile]
\frametitle{ROC curve pytorch code uses argsort}
\small
\begin{verbatim}
def ROC_curve(pred_tensor, label_tensor):
    sorted_indices = torch.argsort(-pred_tensor)
    ...
    return { # a dictionary of torch tensors
        "FPR":FPR, "FNR":FNR, "TPR":1 - FNR,
        "min(FPR,FNR)":torch.minimum(FPR, FNR),
        "min_constant":torch.cat([
            torch.tensor([-torch.inf]), uniq_thresh]),
        "max_constant":torch.cat([
            uniq_thresh, torch.tensor([torch.inf])]) }
>>> pd.DataFrame(ROC_curve(torch.tensor(
...     [2.0, -3.5, -1.0, 1.5]), torch.tensor([0,0,1,1])))
   FPR  FNR  TPR  min(FPR,FNR)  min_constant  max_constant
0  0.0  1.0  0.0           0.0          -inf          -2.0
1  0.5  1.0  0.0           0.5          -2.0          -1.5
2  0.5  0.5  0.5           0.5          -1.5           1.0
3  0.5  0.0  1.0           0.0           1.0           3.5
4  1.0  0.0  1.0           0.0           3.5           inf
\end{verbatim}
\url{https://tdhock.github.io/blog/2024/torch-roc-aum/}
\end{frame}
\begin{frame}[fragile]
\frametitle{AUC and AUM both use ROC curve}
\begin{itemize}
\item ROC AUC and proposed AUM are both implemented by first
computing the ROC curve.
\end{itemize}
\begin{verbatim}
def ROC_AUC(pred_tensor, label_tensor):
    roc = ROC_curve(pred_tensor, label_tensor)
    FPR_diff = roc["FPR"][1:] - roc["FPR"][:-1]
    TPR_sum = roc["TPR"][1:] + roc["TPR"][:-1]
    return torch.sum(FPR_diff*TPR_sum/2.0)
def Proposed_AUM(pred_tensor, label_tensor):
    roc = ROC_curve(pred_tensor, label_tensor)
    min_FPR_FNR = roc["min(FPR,FNR)"][1:-1]
    constant_diff = roc["min_constant"][1:].diff()
    return torch.sum(min_FPR_FNR * constant_diff)
\end{verbatim}
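For example, with the prediction and label tensors from the previous slide (outputs computed by hand, assuming the code above):
\begin{verbatim}
>>> ROC_AUC(torch.tensor([2.0, -3.5, -1.0, 1.5]),
...         torch.tensor([0, 0, 1, 1]))
tensor(0.5000)
>>> Proposed_AUM(torch.tensor([2.0, -3.5, -1.0, 1.5]),
...              torch.tensor([0, 0, 1, 1]))
tensor(1.5000)
\end{verbatim}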
\url{https://tdhock.github.io/blog/2024/torch-roc-aum/}
\end{frame}
\begin{frame}[fragile]
\frametitle{AUM pytorch code, auto-grad demo}
\begin{itemize}
\item Assume two samples, $(x_0,y_0=0), (x_1,y_1=1)$,
\item Plot objective and gradient with respect to predicted scores.
\end{itemize}
\includegraphics[width=\textwidth]{gg_aum_grad}
\url{https://tdhock.github.io/blog/2024/torch-roc-aum/}
\end{frame}
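\begin{frame}[fragile]
\frametitle{Sketch: AUM gradient via pytorch auto-grad}
A minimal sketch of the demo above, assuming the \texttt{Proposed\_AUM} function from the previous slides (which supports auto-grad, per the blog post); prediction values are illustrative:
\begin{verbatim}
import torch
pred = torch.tensor([0.5, -0.5], requires_grad=True)
labels = torch.tensor([0, 1]) # (x_0,y_0=0), (x_1,y_1=1)
loss = Proposed_AUM(pred, labels)
loss.backward()               # gradient w.r.t. predictions
print(loss.detach(), pred.grad)
\end{verbatim}
\end{frame}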
\begin{frame}
\frametitle{Comparing AUM with weighted logistic loss}
\includegraphics[width=\textwidth]{figure-aum-neural-networks-test-auc}
\begin{itemize}
\item Two image classification data sets.
\item LeNet5 convolutional neural network, batch size 1000.
\item Step size from $10^{-4}$ to $10^2$ (keep best).
\item AUM rate uses Area Under Min of FPR/FNR, more accurate in
  these data than AUM count (FP/FN totals).
\item \emph{logistic unweighted} is the usual binary cross-entropy loss (uniform weight=1 for each sample).
\item For \emph{logistic weighted}, we compute class frequencies, $n_1=\sum_{i=1}^N I[y_i=1]$ and $n_0$ similarly; then weights are $w_i=1/n_{y_i}$, so that the total weight of the positive class equals the total weight of the negative class (sketch on the next slide).
\end{itemize}
\end{frame}
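\begin{frame}[fragile]
\frametitle{Sketch: balanced class weights}
A minimal sketch of the weighting scheme on the previous slide, $w_i=1/n_{y_i}$ (hypothetical labels, not a data set from the paper):
\begin{verbatim}
import torch
y = torch.tensor([0, 0, 0, 1])  # N=4, n_0=3, n_1=1
n = torch.bincount(y).float()   # class frequencies
w = 1.0 / n[y]                  # w_i = 1/n_{y_i}
print(w)  # tensor([0.3333, 0.3333, 0.3333, 1.0000])
print(w[y == 0].sum(), w[y == 1].sum())  # both tensor(1.)
\end{verbatim}
These weights can be passed as the \texttt{weight} argument of \texttt{torch.nn.functional.binary\_cross\_entropy\_with\_logits}.
\end{frame}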
\begin{frame}
\frametitle{AUM gradient descent increases validation AUC,\\
four image classification data sets}
\includegraphics[width=\textwidth]{data_Classif_batchtools_best_valid_scatter}
\begin{itemize}
\item Unbalanced binary classification: 10\% negative, 90\% positive.
\item Gradient descent with constant step size, best of $10^{-4}$ to $10^5$.
\item Full gradient (batch size = number of samples).
\item Linear model, max iterations = 100,000.
\item Max Validation AUC comparable or better than baselines.
\item Number of epochs comparable to baselines.
\item Time per epoch is $O(N \log N)$ (sort), small log factor larger than standard logistic/cross-entropy loss, $O(N)$.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Discussion}
\begin{itemize}
\item Classic baselines are simple (sum over samples).
\item Proposed AUM loss similar to recent losses that sum over all
pairs of positive and negative examples (both can be implemented
by sorting predicted scores); proposed AUM loss uses a
different/novel relaxation.
\item Proposed AUM loss can be used as a drop-in replacement for logistic/binary cross-entropy loss, \url{https://tdhock.github.io/blog/2024/torch-roc-aum/}
\item Best use with stochastic gradient algorithms?
At least one positive and one negative example is required in each
batch.
\item Margin-based algorithms like SVM?
\item Works well in binary classification, how to adapt to
multi-class setting, or other problems such as ranking/information
retrieval? (next: change-point detection)
\end{itemize}
\end{frame}
% \begin{frame}
% \frametitle{TODO other classification data sets}
% \includegraphics[width=\textwidth]{figure-unbalanced-grad-desc}
% \begin{itemize}
% \item TODO from JMLR paper
% \end{itemize}
% \end{frame}
\section{Problem setting 2: ROC curves for evaluating supervised changepoint algorithms}
\begin{frame}
\frametitle{Problem: unsupervised changepoint detection}
\begin{itemize}
\item Data sequence $z_1,\dots,z_T$ at $T$ points over time/space.
\item Ex: DNA copy number data for cancer diagnosis, $z_t\in\mathbb R$.
\item The penalized changepoint problem (Maidstone \emph{et al.} 2017)
$$\argmin_{u_1,\dots,u_T\in\mathbb R} \sum_{t=1}^T (u_t - z_t)^2 + \lambda\sum_{t=2}^T I[u_{t-1} \neq u_t].$$
\end{itemize}
\parbox{0.6\textwidth}{
\includegraphics[width=0.6\textwidth]{figure-fn-not-monotonic-no-labels}
}
\parbox{0.3\textwidth}{
Larger penalty $\lambda$ results in fewer changes/segments.
\vskip 0.5in
Smaller penalty $\lambda$ results in more changes/segments.
}
\end{frame}
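\begin{frame}[fragile]
\frametitle{Sketch: solving the penalized problem}
A minimal sketch of the penalized changepoint problem above, using the \texttt{ruptures} Python package for illustration (not the software used in this work):
\begin{verbatim}
import numpy as np
import ruptures as rpt
rng = np.random.default_rng(1)
z = np.concatenate([rng.normal(0, 1, 100),
                    rng.normal(3, 1, 100)])
model = rpt.Pelt(model="l2").fit(z)  # squared error loss
print(model.predict(pen=10))  # segment ends, e.g. [100, 200]
\end{verbatim}
Larger \texttt{pen} ($\lambda$) gives fewer detected changes, as on the previous slide.
\end{frame}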
\begin{frame}
\frametitle{Problem: weakly supervised changepoint detection}
\begin{itemize}
\item First described by Hocking \emph{et al.} ICML 2013.
\item We are given a data sequence $\mathbf z$ with labeled regions
$L$.
\end{itemize}
\includegraphics[width=0.56\textwidth]{figure-fn-not-monotonic}
\only<2>{\includegraphics[width=0.42\textwidth]{figure-fn-not-monotonic-error-standAlone}}
\only<2>{We compute features $\mathbf x=\phi(\mathbf z)\in\mathbb R^p$
  and want to learn a function $f(\mathbf x)=-\log\lambda\in\mathbb R$ that minimizes label error (sum of false positives and false negatives), or maximizes AUC, Hocking, Hillman, \emph{Journal of Machine Learning Research} (2023).}
\end{frame}
\begin{frame}
\frametitle{Comparing changepoint algorithms using ROC curves}
Hocking TD, Srivastava A. Labeled Optimal
Partitioning. Computational Statistics (2022).
\includegraphics[width=0.8\textwidth]{figure-LOPART-roc}
LOPART algorithm (R package LOPART) has consistently larger
test AUC than previous algorithms.
\end{frame}
\begin{frame}
\frametitle{AUM gradient descent results in increased train AUC for
a real changepoint problem}
Hillman, Hocking, \emph{Journal of Machine Learning Research} (2023).
\includegraphics[height=3.7cm]{figure-aum-optimized-iterations.png}
\includegraphics[height=3.7cm]{figure-aum-train-both.png}
\begin{itemize}
\item Left/middle: changepoint problem initialized to prediction vector with
min label errors, gradient descent on prediction vector.
\item Right: linear model initialized by minimizing regularized convex
loss (surrogate for label error, Hocking \emph{et al.} ICML 2013),
gradient descent on weight vector.
\end{itemize}
\end{frame}
\section{Proposed complete line search algorithm for optimizing AUM and AUC (in progress)}
\begin{frame}
\frametitle{Using thresholds to compute AUM}
% \begin{itemize}
% \item $n$ is the number of observations (labeled examples or sequences),
% \item $B$ is the number of breakpoints in label error functions,
% \item each breakpoint $b\in\{1,\dots, B\}$ is represented by the tuple $(v_b, \Delta\text{FP}_b, \Delta\text{FN}_b, \mathcal I_b)$,
% \item $\mathcal I_b\in\{1,\dots,n\}$ is an example index, so there are changes $\Delta\text{FP}_b, \Delta\text{FN}_b$ at predicted value $v_b\in\mathbb R$ in the error functions
% \end{itemize}
% Let
% let $p_1,\dots,p_B$ be a permutation of $1,\dots,B$ such that
% thresholds are increasing: $T_$
\textbf{Hillman and Hocking, JMLR 2023}, showed that in a data set with $N$ samples and $B\geq N$ breakpoints in the label error functions, the AUM for the
\textbf{current predictions} can be computed using the sorted
thresholds $T_1<\cdots<T_B$ at which the min label error $M_b$ changes,
\begin{equation*}
\text{AUM} = \sum_{b=2}^B [T_{b} - T_{b-1}] M_b
\end{equation*}
\textbf{Proposed}: when learning a linear model,
$f(\mathbf x)= \mathbf w^T \mathbf x$, we can update the weights
$\mathbf w$ using AUM gradient descent. We compute an exact
representation of thresholds $T_b(s)$ and min error $M_b(s)$ \textbf{as a
function of step size} $s$, which results in a complete piecewise
linear representation of
\begin{equation*}
\text{AUM}(s) = \sum_{b=2}^B [T_{b}(s) - T_{b-1}(s)] M_b(s).
\end{equation*}
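\textbf{Why linear in $s$?} After a gradient step $\mathbf w - s\mathbf g$ (gradient $\mathbf g$), each prediction $(\mathbf w - s\mathbf g)^T\mathbf x_i$ is linear in the step size $s$, so each threshold $T_b(s)$ is a line in $s$; the candidate step sizes are the intersection points of these lines.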
\end{frame}
\begin{frame}
\frametitle{Simple example, proposed line search with 4 binary labels}
Theorem: when learning a linear model, $f(\mathbf x)= \mathbf w^T \mathbf x$,
\begin{itemize}
\item AUC is piecewise constant, and
\item AUM is piecewise linear,
\end{itemize}
as a function of step size in gradient descent.
\begin{center}
\parbox{0.6\textwidth}{
\hspace{0.0cm}
\includegraphics[width=0.5\textwidth]{figure-line-search-example-binary-roc.png}
\\
\includegraphics[width=0.6\textwidth]{figure-line-search-example-binary-error.png}\\
ROC curves and error functions
}\parbox{0.39\textwidth}{
\includegraphics[width=0.35\textwidth]{figure-line-search-example-binary.png}\\
Proposed line search
}
\end{center}
Letters show correspondence between points on the ROC curve, and intervals of constants added to predicted values.
\end{frame}
\begin{frame}
\frametitle{Change-point example, comparison with grid search}
Theorem: when learning a linear model, $f(\mathbf x)= \mathbf w^T \mathbf x$,
\begin{itemize}
\item AUC is piecewise constant, and
\item AUM is piecewise linear,
\end{itemize}
as a function of step size in gradient descent.
\includegraphics[height=5cm]{figure-line-search-example-some}
\includegraphics[height=5cm]{figure-line-search-example-grid}
Proposed line search algorithm computes updates when there are
possible changes in slope of AUM / values of AUC (orange dots).
\end{frame}
\input{figure-line-search-example}
\begin{frame}
\frametitle{Complexity analysis of proposed algorithm}
For $N$ labeled observations, the input is $N$ threshold lines
(slope/intercept values). Possible next intersection points are stored in
a C++ STL map (red-black tree, sorted by step size): $O(\log N)$
time insertion, $O(1)$ lookup of the next intersection. Worst case $O(N)$
space.
%$O([N+I]\log N)$ time for $I$ iterations.
\begin{description}
\item[grid: standard grid search.] $O(G N\log N)$ time per step,
for $G$ grid points.
\item[linear(proposed): only first $N$ intersections.]
$O(N\log N)$ time per step, relatively small step sizes chosen,
relatively large number of steps overall in gradient descent.
\item[quadratic(proposed): all $O(N^2)$ intersections.]
$O(N^2\log N)$ time per step, large step sizes, small number of
steps.
\item[first min(proposed): keep iterating until first AUM increase.]
Same as quadratic in worst case, but may be faster on average (it
was faster than both quadratic and linear for the example on the
previous slide).
\end{description}
\end{frame}
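\begin{frame}[fragile]
\frametitle{Sketch: queue of next intersection points}
A minimal Python sketch of the data structure idea on the previous slide, using a heap ordered by step size in place of the C++ STL map; the threshold lines are hypothetical:
\begin{verbatim}
import heapq
# threshold lines T_b(s) = intercept + slope * s
lines = [(0.0, 2.0), (1.0, -1.0), (3.0, -2.0)]
heap = []
for (i1, s1), (i2, s2) in zip(lines, lines[1:]):
    if s1 != s2:  # adjacent lines cross at step size t
        t = (i2 - i1) / (s1 - s2)
        if t > 0:
            heapq.heappush(heap, (t, (i1, s1), (i2, s2)))
t, lo, hi = heapq.heappop(heap)  # next intersection
print(t)  # 0.333...: lines 1 and 2 cross first
\end{verbatim}
\end{frame}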
%\section{Empirical results: increased speed and comparable accuracy using proposed complete line search}
\begin{frame}
\frametitle{Proposed search consistently faster than grid search}
We analyzed the supervised genomic change-point detection data set
H3K4me3\_TDH\_immune ($N=1073$ to $1248$)
% > subtrain.sizes[data.name=="H3K4me3_TDH_immune"]
% data.name cv.type test.fold n.subtrain.diffs
% 1: H3K4me3_TDH_immune equal_labels 1 1248
% 2: H3K4me3_TDH_immune equal_labels 2 1200
% 3: H3K4me3_TDH_immune equal_labels 3 1197
% 4: H3K4me3_TDH_immune equal_labels 4 1073
from the UCI Machine Learning Repository,
{\scriptsize \url{https://archive.ics.uci.edu/ml/datasets/chipseq}.}
Train/test splits are defined via 4-fold CV; the linear model is initialized by
minimizing a regularized convex loss (surrogate for label error,
Hocking \emph{et al.} ICML 2013), then AUM rate gradient
descent steps (with line search) continue until the subtrain loss stops decreasing.
\includegraphics[width=\textwidth]{figure-line-search-complexity-compare-H3K4me3_TDH_immune-equal_labels-rate-IntervalRegressionCV-seconds}
\begin{description}
\item[first min(proposed):] keep iterating until first AUM
increase.
\item[grid:] search over step
  size$\in\{10^{-9},10^{-8},\dots,10^0,10^1\}$.
\item[quadratic(proposed):] all line search iterations.
\item[linear(proposed):] only first $N$ line search iterations.
\end{description}
\end{frame}
\begin{frame}
\frametitle{Proposed search has similar accuracy as grid search}
We analyzed the supervised genomic change-point detection data set
H3K4me3\_TDH\_immune ($N=1073$ to $1248$)
% > subtrain.sizes[data.name=="H3K4me3_TDH_immune"]
% data.name cv.type test.fold n.subtrain.diffs
% 1: H3K4me3_TDH_immune equal_labels 1 1248
% 2: H3K4me3_TDH_immune equal_labels 2 1200
% 3: H3K4me3_TDH_immune equal_labels 3 1197
% 4: H3K4me3_TDH_immune equal_labels 4 1073
from the UCI Machine Learning Repository,
{\scriptsize \url{https://archive.ics.uci.edu/ml/datasets/chipseq}.}
Train/test splits are defined via 4-fold CV; the linear model is initialized by
minimizing a regularized convex loss (surrogate for label error,
Hocking \emph{et al.} ICML 2013), then AUM rate gradient
descent steps (with line search) continue until the subtrain loss stops decreasing.
\includegraphics[width=\textwidth]{figure-line-search-complexity-compare-H3K4me3_TDH_immune-equal_labels-rate-IntervalRegressionCV-initial}
\begin{description}
\item[first min(proposed):] keep iterating until first AUM
increase.
\item[grid:] search over step
  size$\in\{10^{-9},10^{-8},\dots,10^0,10^1\}$.
\item[quadratic(proposed):] all line search iterations.
\item[linear(proposed):] only first $N$ line search iterations.
\end{description}
\end{frame}
\begin{frame}
\frametitle{Asymptotic time complexity analysis}
\includegraphics[width=\textwidth]{data_Classif_figure_units}
\begin{description}
\item[max:] theoretical worst case, $N(N-1)/2$ iterations.
\item[quadratic(proposed):] all line search iterations.
\item[first min(proposed):] keep iterating until first AUM increase
(same number of steps/solution as quadratic, but asymptotically
faster/smaller slope).
\item[linear(proposed):] only first $N$ line search iterations.
\end{description}
\end{frame}
\section{Discussion and Conclusions}
\begin{frame}
\frametitle{Discussion and Conclusions}
\begin{itemize}
\item Area Under the ROC Curve (AUC) is used to evaluate binary
classification and changepoint detection algorithms.
% \item In changepoint detection there can be loops in ROC curves, so
% maximizing AUC greater than 1 is not be desirable.
% \item In changepoint detection, maximizing Area Under ROC curve is
% non-trivial even for the train set with unconstrained
% predictions.
\item Hocking, Hillman, \emph{Journal of Machine Learning Research}
(2023), proposed AUM=Area Under Min(FP,FN), a new
differentiable surrogate loss for AUC optimization.
\item In this talk we proposed new gradient descent algorithms with
efficient complete line search, for optimizing AUM/AUC.
\item Empirical results provide evidence that proposed complete line
search is consistently faster than grid search, and has
comparable accuracy (in terms of max validation AUC).
\item Line search implemented in R/C++ in the \texttt{aum} package:
  {\scriptsize
    \url{https://cloud.r-project.org/web/packages/aum/}
  }
\item Future work: line search for non-linear prediction functions
such as neural networks with ReLU activation?
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Thanks to co-author Jadon Fowler! (second from left)}
\includegraphics[height=3in]{2023-02-02-group-meeting}
Contact: toby.hocking@nau.edu
\end{frame}
\begin{frame}
\frametitle{AUC can be greater than one for change-point problems with non-monotonic error functions}
\centering
\includegraphics[height=0.7\textheight]{screenshot-auc-2-example}
\scriptsize
\url{https://nhintruong.github.io/figure-aum-convexity-interactive}
\end{frame}
\begin{frame}
\frametitle{Initial/optimized AUC/AUM for change-point problems}
\centering
\includegraphics[height=0.7\textheight]{figure-auc-improved-interactive-screenshot}
\url{https://tdhock.github.io/2023-11-21-auc-improved}
\end{frame}
\end{document}