\documentclass{beamer}
\beamertemplatenavigationsymbolsempty
\usepackage{ulem}
\usetheme{default}
\title{\sout{Micro} \sout{Nano} Femto \sout{Chat} GPT}
\subtitle{A small Generative Pretrained Transformer}
\author{Vincent Semeria}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}
\frametitle{What is the transformer language model?}
\begin{itemize}
\item A random text generator, proposed by Google in 2017: ``Attention Is All You Need'', https://arxiv.org/abs/1706.03762
\bigskip
\item Computes the probability of the next word given some words, based on a training text. For example,
\begin{itemize}
\item An apple a day keeps the doctor \_\_\_\_\_\_\_\_
\item (away, 99.1\%), (happy, 0.4\%), (uninterested, 0.1\%), (unemployed, 0.1\%), ...
\end{itemize}
\bigskip
\item Temperature, creativity: truncate the distribution to the $n$ most probable words (sketch on the next slide)
\bigskip
\item Repeats to generate text
\end{itemize}
\end{frame}
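\begin{frame}[fragile]
\frametitle{Sketch: truncated sampling of the next word}
A minimal Python sketch (illustrative only, not the actual implementation) of sampling from a distribution truncated to the $n$ most probable words; the probabilities below are made up for the example.
\begin{verbatim}
import random

def sample_next(probs, top_n=3):
    # Keep the top_n most probable words, then sample among them.
    best = sorted(probs.items(), key=lambda kv: kv[1],
                  reverse=True)[:top_n]
    words = [w for w, _ in best]
    weights = [p for _, p in best]
    return random.choices(words, weights=weights)[0]

probs = {"away": 0.991, "happy": 0.004,
         "uninterested": 0.001, "unemployed": 0.001}
print(sample_next(probs))
\end{verbatim}
\end{frame}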
\begin{frame}
\frametitle{Implementation of a very simplified transformer}
\begin{itemize}
\item True GPT needs a cloud and 1 billion dollars for training
\bigskip
\item Simplify to study, as did Andrej Karpathy: https://www.youtube.com/watch?v=kCc8FmEb1nY
\bigskip
\item Much smaller training set: Shakespeare instead of the Internet
\bigskip
\item Instead of words, characters as tokens (alphabet $\mathcal{A}$): !\&',-.:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ \_ abcdefghijklmnopqrstuvwxyz
\bigskip
\item No libraries (PyTorch, Tensorflow, ...), no GPUs
\bigskip
\item Combined with classical programming for syntax rules
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Selection of a language model: maximum likelihood}
\begin{itemize}
\item There are infinitely many language models (they have continuous parameters), so we need a measure to find good ones: \textbf{the probability $P$ that the model generates the training text}
\begin{eqnarray*} P(\text{\color{red} t}\text{\color{blue}o}\text{ be or not to be}) &= & P(\text{\color{blue}o}\text{ be or not to be} | \text{\color{red} t} ) P(\text{\color{red}t})\\ &=& P(\text{ be or not to be} | \text{\color{red}t}\text{\color{blue}o} )P(\text{\color{blue}o}|\text{\color{red}t})P(\text{\color{red}t}) \\ &=& \prod P(\text{character} \mid \text{prefix}) \end{eqnarray*}
\item Probability too small for 64-bit floating-point numbers; compute its logarithm instead
$$ \text{loss} \; =\; -\sum \ln P(\text{character} \mid \text{prefix}) \; \geq 0 $$
the lower the loss, the better the model (sketch on the next slide)
\end{itemize}
\end{frame}
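\begin{frame}[fragile]
\frametitle{Sketch: computing the loss}
A minimal Python sketch of the loss above; the interface \texttt{model(prefix)}, returning a dictionary of next-character probabilities, is an assumption made for the illustration.
\begin{verbatim}
import math

def loss(model, text):
    # loss = - sum over the text of ln P(character | prefix)
    total = 0.0
    for i in range(1, len(text)):
        p = model(text[:i])[text[i]]  # P(text[i] | text[:i])
        total -= math.log(p)
    return total
\end{verbatim}
\end{frame}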
\begin{frame}
\frametitle{For the mathematical purists}
A character-based language model is a probability $P$ on
\bigskip
\begin{itemize}
\item universe $\Omega = \mathcal{A}^\mathbb{N}$, the set of infinite-length texts on alphabet $\mathcal{A}$
\item product $\sigma$-algebra $\bigotimes_\mathbb{N}\mathcal{P}(\mathcal{A})$, in this case generated by the sets $B_{k,c}$, $k\in\mathbb{N}$, $c\in \mathcal{A}$, of texts whose $k$-th character is $c$
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{$n$-gram language model}
\begin{itemize}
\item The simplest language model: the prefix is the last $n-1$ characters; look it up in the training text and count the possible next characters.
\bigskip
\item Example: $n=5$, the training text is 1 million characters extracted from Shakespeare's plays. The prefix ``A cr'' occurs 6 times in the training text, followed by
$$ (\text{o}, 3), \; (\text{a}, 1), \; (\text{e},1), \; (\text{u}, 1) $$
Generation may produce ``A cro'', ``A cra'', ``A cre'' or ``A cru'' (sketch on the next slide)
\end{itemize}
\end{frame}
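\begin{frame}[fragile]
\frametitle{Sketch: $n$-gram counting and generation}
A minimal Python sketch of the $n$-gram model described above (count the characters following each $(n-1)$-character prefix, then sample); not the repository's implementation.
\begin{verbatim}
from collections import Counter, defaultdict
import random

def train_ngram(text, n=5):
    # counts[prefix][c] = how often c follows the (n-1)-char prefix
    counts = defaultdict(Counter)
    for i in range(n - 1, len(text)):
        counts[text[i - n + 1:i]][text[i]] += 1
    return counts

def generate(counts, seed, length=200, n=5):
    # seed: at least n-1 characters taken from the training text
    out = seed
    for _ in range(length):
        nxt = counts[out[-(n - 1):]]
        out += random.choices(list(nxt.keys()),
                              weights=list(nxt.values()))[0]
    return out
\end{verbatim}
\end{frame}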
\begin{frame}
\frametitle{$5$-gram generated (fake) Shakespeare}
DUKE VINCENTIO:\\
I do wrongly.\\
For you, sir, and if the plain; and been, and complain!\\
Second Gentlement.\\
CLARENCE:\\
O, leave your draw my knacks,\\
So come to a husbands and\\
I never kinsman, the peers dead-piece of them puff'd, or not appare part:\\
Yet what royal serve me unbuckling man of thy brother spite of Angelo:\\
I am:\\
The lady:\\
would thou deny, thanks hear it once thy band?\\
Look, what much, seen thy was I,\\
Or elsewhere hast can down.\\
OXFORD:\\
The fair heaps it, meantic forth have you should not then were is that's you to graves,\\
As those thanks.\\
DUKE OF AUMERLE:\\
If!\\
Help to-nightful be gone! you done! be consul;\\
Intend yet me the may heave\\
doubt now, for they she winter that he clock on me?
\end{frame}
\begin{frame}
\frametitle{Problem of the $n$-gram language model: overfitting}
$$ \text{loss} = 1\,220\,044, \quad \text{i.e. } 1.216 \text{ per character} $$
\begin{itemize}
\item Decent generated text: the words are English except ``appare'' and ``Gentlement'', and even sequences of words make some sense
\item However, the loss is much bigger on another 100k characters of true Shakespeare, which we take as the validation text
$$ \text{validation loss} = 211\,421, \quad \text{i.e. } 1.897 \text{ per character} $$
\item The 5-gram recites the training text almost like a parrot: that is overfitting, plagiarism even. Increasing $n$ will not help; longer training prefixes appear even less often in the validation text
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Continuous embedding of characters}
\begin{itemize}
\item Use neural networks to generate better texts than $n$-gram
\bigskip
\item But there are only 64 characters in the alphabet $\mathcal{A}$, a discrete set: we need an embedding into real numbers
$$ \text{embd} : \mathcal{A} \to \mathbb{R}^{d} $$
embd(z) is $d$ numbers that represent character z. Those numbers should be small, say in $[-2,2]$, because of the network's activation functions ($\text{ReLU}(x) =\max(x,0)$). $d$ is somewhere below 300.
\bigskip
\item embd is a rectangular matrix with alphabet size rows and $d$ columns. \textbf{That matrix is not chosen a priori, it is computed together with the other transformer parameters.}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{2-layer perceptron (TLP)}
The simplest neural network
\begin{picture}(200,100)
\put(50,0){$W_1$}
\put(78,32){$\vdots$}
\put(78,70){$\vdots$}
\put(0,50){$x\in\mathbb{R}^n$}
\put(40,20){\line(1,0){35}}
\put(40,20){\line(1,1){35}}
\put(40,20){\line(1,2){35}}
\put(40,55){\line(1,0){35}}
\put(40,90){\line(1,0){35}}
\put(80,20){\circle{8}}
\put(80,55){\circle{8}}
\put(80,90){\circle{8}}
\put(105,17){ReLU}
\put(105,52){ReLU}
\put(105,87){ReLU}
\put(84,20){\line(1,0){20}}
\put(84,55){\line(1,0){20}}
\put(84,90){\line(1,0){20}}
\put(132,20){\line(1,0){35}}
\put(132,20){\line(1,1){35}}
\put(132,20){\line(1,2){35}}
\put(132,55){\line(1,0){35}}
\put(132,90){\line(1,0){35}}
\put(172,20){\circle{8}}
\put(172,55){\circle{8}}
\put(172,90){\circle{8}}
\put(150,0){$W_2$}
\put(190,50){$TLP(x)$}
\end{picture}
$$ TLP(x) = W_2\max(0, W_1 x + b_1) + b_2 $$
And in theory, a universal approximator (numerical sketch on the next slide):
https://en.wikipedia.org/wiki/Universal\_approximation\_theorem
\end{frame}
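\begin{frame}[fragile]
\frametitle{Sketch: TLP forward pass}
A minimal NumPy sketch of $TLP(x) = W_2\max(0, W_1 x + b_1) + b_2$; the dimensions below are toy values chosen for the example.
\begin{verbatim}
import numpy as np

def tlp(x, W1, b1, W2, b2):
    # TLP(x) = W2 @ ReLU(W1 @ x + b1) + b2
    return W2 @ np.maximum(0.0, W1 @ x + b1) + b2

rng = np.random.default_rng(0)
W1, b1 = rng.normal(size=(3, 4)), np.zeros(3)  # hidden layer of 3
W2, b2 = rng.normal(size=(2, 3)), np.zeros(2)  # 2 outputs
print(tlp(rng.normal(size=4), W1, b1, W2, b2))
\end{verbatim}
\end{frame}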
\begin{frame}
\frametitle{TLP fails}
\begin{itemize}
\item But in practice the hidden layer needs too many neurons (i.e. the common dimension of $W_1$ and $W_2$ becomes too big), so it is too slow. It also overfits any training set.
\bigskip
\item Example: prefix of 64 characters, embedding dimension 10, hidden layer of 100, 71000 parameters: validation loss 2.216, worse than the $n$-gram. Increasing the embedding dimension or the number of layers does not help.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Attention: weights of previous words}
Add a computation before the TLP, specific to text generation.
\bigskip
\begin{quote}
On that \underline{sunny} \underline{day}, Jules, Elizabeth's fine-looking \underline{golden} \underline{retriever}, \underline{was} \_\_\_\_\_\_\_\_
\end{quote}
\bigskip
Important words:
\begin{itemize}
\item ``was'': the last word, so next word is probably a state or an action
\item ``golden retriever'': the subject, a dog
\item ``sunny day'': when the action takes place
\end{itemize}
Less important words:
\begin{itemize}
\item ``fine looking'', ``Jules'', ``Elizabeth's'': don't affect what the dog does
\item ``On that'': just grammar, could be deduced automatically
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Transformer: attention, aggregation of the prefix}
Too many inputs for the 2-layer perceptron: prefix size $\times$ embedding dimension. Reduce the inputs to just the embedding dimension by averaging the prefix
$$ \text{average character} = \sum_{i \leq \text{prefix}} w_i c_i $$
This average is called \textbf{attention}, and it is possible thanks to the continuous embedding. The weights are
$$ q_i = \langle c_i, Q c_{\text{last}} \rangle, \quad w_i = \frac{\exp q_i}{\sum_k \exp q_k} \quad (\text{softmax}) $$
$Q$ is called the query matrix. It is computed, like the embedding matrix (sketch on the next slide).
\end{frame}
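\begin{frame}[fragile]
\frametitle{Sketch: attention as a weighted average}
A minimal NumPy sketch of the attention average above; \texttt{C} holds the embedded prefix characters as rows and \texttt{Q} is the query matrix.
\begin{verbatim}
import numpy as np

def attention_average(C, Q):
    # C: (prefix length, d) embedded characters, Q: (d, d)
    q = C @ (Q @ C[-1])      # q_i = <c_i, Q c_last>
    w = np.exp(q - q.max())  # subtract the max for stability
    w /= w.sum()             # softmax weights
    return w @ C             # average character, in R^d
\end{verbatim}
\end{frame}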
\begin{frame}
\frametitle{Gradient descent}
\begin{itemize}
\item The loss is differentiable w.r.t. the embedding matrix, the query matrix, and the two matrices and biases of the TLP
\bigskip
\item However, multiple local minima are suspected, because a peaky softmax (some $w_k \simeq 1$) has a small gradient:
$$ d w_i = w_i d q_i - w_i \sum_j w_j d q_j $$
Those local minima are probably bad: degenerate average token, forced next character.
\bigskip
\item Among several models with similar training loss, we can keep the one with the lowest validation loss, but then the validation text also becomes part of the training.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Find low local minimum: regularization}
\begin{itemize}
\item Small parameters (in $[-2,2]$ for this toy transformer) tend to avoid a peaky softmax (no $q_k$ dominates the others) and to avoid the flat zero part of ReLU$(x)=\max(x,0)$
\bigskip
\item Can minimize $\text{RegulLoss} = \text{Loss} + \lambda \Vert \text{parameters} \Vert$ instead (sketch on the next slide)
\bigskip
\item Can manually shrink large parameters and continue the gradient descent
\bigskip
\item Scaled attention, layer norms: rescale computations in the network; closer to biological neurons, which send finite spikes
\end{itemize}
\end{frame}
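\begin{frame}[fragile]
\frametitle{Sketch: one regularized gradient step}
A minimal sketch of a gradient step on RegulLoss; for simplicity it uses the squared-norm variant of the penalty, whose gradient is just a multiple of the parameter vector.
\begin{verbatim}
import numpy as np

def descent_step(params, grad_loss, eta=0.01, lam=1e-3):
    # RegulLoss = Loss + lam/2 * ||params||^2,
    # whose extra gradient term is simply lam * params
    return params - eta * (grad_loss + lam * params)

p = np.array([1.5, -2.0])
print(descent_step(p, grad_loss=np.array([0.3, -0.1])))
\end{verbatim}
\end{frame}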
%\begin{frame}
%\frametitle{Adaptive stochastic gradient descent}
%\begin{itemize}
%\item The loss is a sum: $\sum - \ln P(character | prefix)$. Approximate $\frac{\partial \text{loss}}{\partial \text{params}}$ by a random subset of this sum at each iteration. This is consistent with the hope that the language model generalizes to any Shakespeare text.
%\bigskip
%\item Also adapt the gradient step $\eta$ (aka learning rate), at each iteration:
%\begin{itemize}
%\item if $\text{loss}(p + \eta \frac{\partial \text{loss}}{\partial \text{params}}) < \text{loss}(p)$, move to $p + \eta \frac{\partial \text{loss}}{\partial \text{params}}$ and increase $\eta$
%\item otherwise add next batch to $\frac{\partial \text{loss}}{\partial \text{params}}$ and decrease $\eta$
%\end{itemize}
%\bigskip
%\item This algorithm converges to a loss's local minimum, because after $\frac{\text{text size}}{\text{batch size}}$ iterations, it either made progress, or the gradient is complete
%\end{itemize}
%\end{frame}
\begin{frame}
\frametitle{Transfer learning}
\begin{itemize}
\item Transfer learning is further training a pretrained neural network for a different task, which may be faster than training for the new task from scratch
\bigskip
\item Here we can increase the character embedding dimension by transfer learning (sketch on the next slide).
\begin{itemize}
\item The affinities $\langle c_i, Q c_{\text{last}} \rangle$ are preserved if we append zeros to the $c_i$. So we append zero columns to the embedding matrix.
\item Matrix $Q$ can be enlarged with random values in $[-1,1]$, to maximize the gradient. Likewise for the two matrices of the 2-layer perceptron.
\end{itemize}
\end{itemize}
\end{frame}
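\begin{frame}[fragile]
\frametitle{Sketch: enlarging the embedding and query matrices}
A minimal NumPy sketch of the enlargement above; the appended zero columns keep the affinities $\langle c_i, Q c_{\text{last}} \rangle$ unchanged, and the new blocks of $Q$ are random in $[-1,1]$.
\begin{verbatim}
import numpy as np

def enlarge(E, Q, extra, seed=0):
    # E: (alphabet size, d) embedding, Q: (d, d) query matrix
    rng = np.random.default_rng(seed)
    d = E.shape[1]
    E2 = np.hstack([E, np.zeros((E.shape[0], extra))])
    Q2 = rng.uniform(-1, 1, size=(d + extra, d + extra))
    Q2[:d, :d] = Q  # keep the already trained block
    return E2, Q2
\end{verbatim}
\end{frame}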
\begin{frame}
\frametitle{Results}
\begin{tabular}{ccccccc} type & prefix & char embd & heads & layers & train loss & val loss \\
\hline
random & 0 & & & & 4.159 & 4.159 \\
English & 64 & & & & 2.559 & 2.540 \\
just TLP & 64 & 10 & & & & 2.216 \\
5-gram & 4 & & & & 1.216 & 1.897 \\
transf & 64 & 16 & 1 & 1 & 1.576 & 1.634 \\
transf & 64 & 64 & 2 & 1 & 1.528 & 1.600
\end{tabular}

\bigskip
Train and validation losses are per character.
%% train length 1003109 val length 111471
\end{frame}
\begin{frame}
\frametitle{Mix with classical programming}
\begin{itemize}
\item At the end of the network, the scores for the next character can be filtered by an arbitrarily complex, classically programmed function: force the characters to form words from Shakespeare's vocabulary, enforce English syntax rules (sketch on the next slide).
\bigskip
\item This is a linear diagonal operator, with either 1's or 0's on the diagonal; easy to include in the gradient computation
\bigskip
\item AI complements classical programming: when some rules can be programmed classically, that is often more efficient than having the AI learn them. It reduces the number of parameters in the network and makes the model easier to explain.
\end{itemize}
\end{frame}
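\begin{frame}[fragile]
\frametitle{Sketch: filtering scores with a programmed rule}
A minimal sketch of the diagonal 0/1 filter above; \texttt{allowed} stands for any classically programmed syntax check and is purely hypothetical.
\begin{verbatim}
import numpy as np

def filter_scores(scores, allowed):
    # allowed(k) -> True if character k may come next;
    # equivalent to a diagonal matrix of 0's and 1's
    mask = np.array([1.0 if allowed(k) else 0.0
                     for k in range(len(scores))])
    return mask * scores
\end{verbatim}
\end{frame}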
\begin{frame}
\frametitle{Future work}
\begin{itemize}
\item Generate texts in Shakespeare's style, without plagiarism
\bigskip
\item Easier: generate grammatically correct texts. The $n$-gram training loss is 1.216 per character, so beat that with a language model that generalizes (validation loss $\simeq$ training loss)
\bigskip
\item To reach that score, deep changes are required
\begin{itemize}
\item Work on words instead of characters
\item Increase prefix (currently 64 characters)
\item more parameters, more layers
\item Improve the gradient descent: Adam, skip connections, layer norms, dropout
\item Parallelize on GPU: PyTorch (Python or C++) or directly in CUDA
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Lessons learnt}
\begin{itemize}
\item Data is expensive: clean up stage directions (didascaliae), headers, footers, and editor's corrections. Merge different versions to build the vocabulary.
\bigskip
\item Gradients are hard to compute correctly, and harder to compute quickly. Automatic differentiation (PyTorch, TensorFlow) helps. A gradient in dimension 53\,000 takes about 4 times longer to evaluate than the function itself, which is quite cheap.
\bigskip
\item AI is experimental
\begin{itemize}
\item Try a lot of architectures: before transformers there were RNNs and LSTMs, with no clear reason why transformers work better
\item Try a lot of gradient descents: when a local minimum is found, it is hard to know whether a better one exists; transfer learning seems necessary
\end{itemize}
\bigskip
\item Mix AI with classical programming rather than opposing them
\end{itemize}
\end{frame}
\end{document}