\section[bdt]{Boosted Decision Trees / Ensemble Methods}
\begin{frame}
\frametitle{Idea}
\begin{center}
\textbf{Average many simple models to obtain a robust complex model}
\begin{align*}
F\left( \vec{x} \right) &= \sum_m \gamma_m f_m(\vec{x})
\end{align*}
\end{center}
\end{frame}
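\begin{frame}[fragile]
\frametitle{Sketch: Weighted Average}
A minimal numerical sketch of $F\left( \vec{x} \right) = \sum_m \gamma_m f_m(\vec{x})$
in Python; the member outputs and weights are purely illustrative.
\begin{verbatim}
import numpy as np

# Hypothetical outputs f_m(x) of three simple member models,
# evaluated on the same three events (rows: models).
member_outputs = np.array([[0.2, 0.9, 0.6],
                           [0.4, 0.8, 0.5],
                           [0.1, 1.0, 0.7]])
gamma = np.array([0.5, 0.3, 0.2])  # weights gamma_m

# F(x) = sum_m gamma_m * f_m(x), one value per event
F = gamma @ member_outputs
print(F)  # [0.24 0.89 0.59]
\end{verbatim}
\end{frame}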
\begin{frame}
\frametitle{Boosting}
\begin{center}
\begin{align*}
f_m\left( \vec{x} \right) &= f_{m-1}\left( \vec{x} \right) + \operatorname*{arg\,min}_f \sum_{i=1}^{N} L \left(y_i, f_{m-1} (\vec{x}_i) + f(\vec{x}_i) \right)
\end{align*}
\begin{columns}
\column{0.35\textwidth}
\begin{tikzpicture}
\node[rectangle,fill=blue!50] at (0,0) {\small{Training Sample}};
\node[rectangle,fill=blue!50] at (0,-1) {\small{Weighted Sample}};
\node[rectangle,fill=blue!50] at (0,-2) {\small{Weighted Sample}};
\node[rectangle,fill=blue!50] at (0,-3) {\small{Weighted Sample}};
\node[rectangle,fill=blue!50] at (0,-4.5) {\small{Weighted Sample}};
\draw[->,line width=1pt,Black] (1.5,0) -- (2,0) node[right] {$f_{0}(\vec{x})$};
\draw[->,line width=1pt,Black] (1.5,-1) -- (2,-1) node[right] {$f_{1}(\vec{x})$};
\draw[->,line width=1pt,Black] (1.5,-2) -- (2,-2) node[right] {$f_{2}(\vec{x})$};
\draw[->,line width=1pt,Black] (1.5,-3) -- (2,-3) node[right] {$f_{3}(\vec{x})$};
\draw[->,line width=1pt,Black] (1.5,-4.5) -- (2,-4.5) node[right] {$f_{m}(\vec{x})$};
\draw[->,line width=1pt,Black] (0,-0.3) -- (0,-0.7);
\draw[->,line width=1pt,Black] (0,-1.3) -- (0,-1.7);
\draw[->,line width=1pt,Black] (0,-2.3) -- (0,-2.7);
\draw[->,line width=1pt,Black,dotted] (0,-3.3) -- (0,-4.2);
\end{tikzpicture}
\column{0.65\textwidth}
\begin{itemize}
\item Reweight events w.r.t.\ the current prediction
\item Individual classifiers are kept simple (weak learners) to avoid overfitting
\item Focus on events close to the optimal separating hyperplane
\item Loss function $L$ is crucial (see the sketch on the next slide):
\begin{itemize}
\item Least square $\rightarrow$ Regression
\item Binomial deviance $\rightarrow$ GradientBoost classification
\item Exponential loss $\rightarrow$ AdaBoost classification
\end{itemize}
\end{itemize}
\end{columns}
\end{center}
\end{frame}
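\begin{frame}[fragile]
\frametitle{Sketch: Boosting Losses}
A sketch of the loss-function choices above, assuming scikit-learn is
available; all hyper-parameters are illustrative.
\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.ensemble import (AdaBoostClassifier,
                              GradientBoostingClassifier)

# Toy two-class sample standing in for signal/background
X, y = make_classification(n_samples=2000, random_state=0)

# Binomial deviance (log loss) -> gradient-boosted classification
gb = GradientBoostingClassifier(n_estimators=100, max_depth=3,
                                learning_rate=0.1).fit(X, y)

# Exponential loss -> AdaBoost classification
ada = AdaBoostClassifier(n_estimators=100).fit(X, y)
\end{verbatim}
\end{frame}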
\begin{frame}
\frametitle{Bagging}
\begin{center}
\begin{tikzpicture}
\node[rectangle,fill=blue!50] (initial) at (0,1) {\small{Training Sample}};
\node[rectangle,fill=blue!50] (first) at (-4.5,0) {\small{Sub Sample}};
\node[rectangle,fill=blue!50] (second) at (-2,0) {\small{Sub Sample}};
\node[rectangle,fill=blue!50] (third) at (0.5,0) {\small{Sub Sample}};
\node[rectangle,fill=blue!50] (fourth) at (4,0) {\small{Sub Sample}};
\draw[->,line width=1pt,Black] (first.south) -- (-4.5,-1) node[below] {$f_{0}(\vec{x})$};
\draw[->,line width=1pt,Black] (second.south) -- (-2,-1) node[below] {$f_{1}(\vec{x})$};
\draw[->,line width=1pt,Black] (third.south) -- (0.5,-1) node[below] {$f_{2}(\vec{x})$};
\draw[->,line width=1pt,Black] (fourth.south) -- (4,-1) node[below] {$f_{m}(\vec{x})$};
\draw[-,line width=1pt,Black,dotted, ultra thick] (third.east) -- (fourth.west);
\draw[->,line width=1pt,Black] (initial.west) -- (first.north);
\draw[->,line width=1pt,Black] (initial.south west) -- (second.north);
\draw[->,line width=1pt,Black] (initial.south) -- (third.north);
\draw[->,line width=1pt,Black] (initial.east) -- (fourth.north);
\end{tikzpicture}
\begin{itemize}
\item General idea: each classifier uses only a fraction of the events / features
\item Robustness against statistical fluctuations in the data
\item Embarrassingly parallel
\item Sampling method is crucial (see the sketch on the next slide):
\begin{itemize}
\item Draw random events with replacement $\rightarrow$ Bagging
\item Draw random events without replacement $\rightarrow$ Pasting
\item Draw random features $\rightarrow$ Random Subspaces
\end{itemize}
\end{itemize}
\end{center}
\end{frame}
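\begin{frame}[fragile]
\frametitle{Sketch: Sampling Strategies}
A sketch of the three sampling strategies, assuming scikit-learn's
BaggingClassifier; the sample and feature fractions are illustrative.
\begin{verbatim}
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=4)

# Draw random events with replacement -> Bagging
bagging = BaggingClassifier(tree, n_estimators=50,
                            max_samples=0.5, bootstrap=True,
                            n_jobs=-1)  # embarrassingly parallel

# Draw random events without replacement -> Pasting
pasting = BaggingClassifier(tree, n_estimators=50,
                            max_samples=0.5, bootstrap=False)

# Draw random features -> Random Subspaces
subspaces = BaggingClassifier(tree, n_estimators=50,
                              max_features=0.5, bootstrap=False)
\end{verbatim}
\end{frame}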
\begin{frame}
\frametitle{Stochastic Boosted Decision Trees}
\begin{center}
\begin{itemize}
\item Good out-of-the-box performance
\item Robust against overfitting
\item Supports classification and regression
\item Widely used in HEP
\end{itemize}
\begin{tikzpicture}
\node[anchor=south west,inner sep=0] (image) at (0,0) {\includegraphics[width=0.6\textwidth]{forest_classifier.png}};
\end{tikzpicture}
\end{center}
\end{frame}
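\begin{frame}[fragile]
\frametitle{Sketch: Stochastic Boosting}
A sketch assuming scikit-learn: a subsample fraction below one combines
boosting with bagging-like event sampling; all values are illustrative.
\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=2000, random_state=0)

# Each tree is fitted on a random half of the events
sbdt = GradientBoostingClassifier(n_estimators=200, max_depth=3,
                                  learning_rate=0.1,
                                  subsample=0.5).fit(X, y)
\end{verbatim}
\end{frame}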
\begin{frame}
\frametitle{Example Classifier Quality}
\begin{center}
\begin{tikzpicture}
\node[anchor=south west,inner sep=0] (image) at (0,0) {\includegraphics[width=\textwidth]{forest_roc.png}};
\end{tikzpicture}
\end{center}
\end{frame}
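\begin{frame}[fragile]
\frametitle{Sketch: ROC Curve}
A sketch of how such a ROC curve can be computed, assuming scikit-learn;
the data set and classifier are illustrative.
\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=5000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0)

clf = GradientBoostingClassifier(subsample=0.5)
clf.fit(X_train, y_train)
scores = clf.predict_proba(X_test)[:, 1]

# Evaluate on an independent test sample
fpr, tpr, _ = roc_curve(y_test, scores)
print("ROC AUC:", roc_auc_score(y_test, scores))
\end{verbatim}
\end{frame}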
\begin{frame}
\frametitle{Further Ensemble Methods}
\begin{center}
\textbf{Categorization}
\begin{itemize}
\item Divide the feature space into sub-spaces
\item The data behaves differently in each chosen sub-space
\item e.g. train separate classifiers for Barrel and Endcap
\end{itemize}
\begin{tikzpicture}
\node[anchor=south west,inner sep=0] (image) at (0,0) {\includegraphics[width=0.4\textwidth]{belle2_detector.png}};
\end{tikzpicture}
\textbf{Combination}
\begin{itemize}
\item Combine different classifiers
\item Different regularization methods learn different aspects of the data
\item e.g. combine a neural network, a BDT and an SVM (see the sketch on the next slide)
\end{itemize}
\end{center}
\end{frame}
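\begin{frame}[fragile]
\frametitle{Sketch: Classifier Combination}
A sketch of combining different classifiers via soft voting, assuming
scikit-learn; the choice of members is illustrative.
\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.ensemble import (GradientBoostingClassifier,
                              VotingClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

X, y = make_classification(n_samples=1000, random_state=0)

# Soft voting averages the predicted class probabilities
# of qualitatively different classifiers (NN, BDT, SVM).
combined = VotingClassifier(
    estimators=[("nn", MLPClassifier(max_iter=500)),
                ("bdt", GradientBoostingClassifier()),
                ("svm", SVC(probability=True))],
    voting="soft").fit(X, y)
\end{verbatim}
\end{frame}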