diff --git a/fastai2/_nbdev.py b/fastai2/_nbdev.py index bcc74cf9..614059c5 100644 --- a/fastai2/_nbdev.py +++ b/fastai2/_nbdev.py @@ -775,6 +775,9 @@ "CaptumInterpretation": "73_callback.captum.ipynb", "CaptumInterpretation.insights": "73_callback.captum.ipynb", "CutMix": "74_callback.cutmix.ipynb", + "BSFinder": "75_callback.bs_finder.ipynb", + "Recorder.plot_bs_find": "75_callback.bs_finder.ipynb", + "Learner.bs_find": "75_callback.bs_finder.ipynb", "synth_dbunch": "97_test_utils.ipynb", "RegModel": "97_test_utils.ipynb", "synth_learner": "97_test_utils.ipynb", @@ -831,6 +834,7 @@ "callback/neptune.py", "callback/captum.py", "callback/cutmix.py", + "callback/bs_finder.py", "test_utils.py", "_pytorch_doc.py"] diff --git a/fastai2/callback/bs_finder.py b/fastai2/callback/bs_finder.py new file mode 100644 index 00000000..77e7799d --- /dev/null +++ b/fastai2/callback/bs_finder.py @@ -0,0 +1,151 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/75_callback.bs_finder.ipynb (unless otherwise specified). 
+
+__all__ = ['BSFinder']
+
+# Cell
+from ..basics import *
+
+# Cell
+def _lin_comb(v1, v2, beta):
+    "Linear combination `beta*v1 + (1-beta)*v2`, used by the moving average"
+    return beta*v1 + (1-beta)*v2
+
+
+def _ema_with_debias(avg, beta, yi, i):
+    "Exponential moving average with debiasing; `i` is the 0-based index of this update"
+    if avg is None: avg=0
+    avg = _lin_comb(avg, yi, beta)
+    return avg, avg/(1-beta**(i+1))
+
+
+def _get_flatten_grads(model):
+    "Stack the gradients of every parameter of `model` that has one into a single column vector"
+    grads = [p.grad.flatten().view(-1,1) for p in model.parameters() if p.grad is not None]
+    return torch.cat(grads)
+
+# Cell
+class BSFinder(Callback):
+    "A `Callback` that implements \"An Empirical Model of Large-Batch Training\" (https://arxiv.org/abs/1812.06162)"
+    run_after=Recorder
+
+    def __init__(self, num_it:int=None, n_batch=5, beta=0.99, simulate_multi_gpus=True):
+        store_attr(self, 'num_it, n_batch, beta, simulate_multi_gpus')
+
+    def begin_fit(self):
+        # Save the original model so that `after_fit` can restore it
+        self.learn.save('_tmp')
+
+        if not self.num_it: self.num_it = len(self.dls.train) * (self.n_batch if self.simulate_multi_gpus else 1)
+
+        self.running_scale = None
+        self.running_noise = None
+
+        # Use a python list instead of a fastai `L`, as `torch.cat` doesn't understand the latter
+        self.stored_grads = []
+
+        # Here, we will store the results
+        self.stats = L()
+
+    def begin_validate(self): raise CancelValidException()
+
+    def after_backward(self):
+        if self.train_iter >= self.num_it: raise CancelFitException()
+
+        # Get gradients and store them
+        self.stored_grads.append(_get_flatten_grads(self.model))
+
+        # While simulating several GPUs, only collect gradients until `n_batch` of them are stored
+        if self.simulate_multi_gpus and len(self.stored_grads) < self.n_batch:
+            self.opt.zero_grad() # Zero gradients to avoid accumulating them between batches
+            raise CancelBatchException() # Skip the weight update
+
+        if len(self.stored_grads) == self.n_batch:
+            # We have enough batches to compute the Simple Noise Scale ratio.
+
+            # We concatenate the batches and empty the buffer
+            stored_grads = torch.cat(self.stored_grads, dim=1)
+            self.stored_grads.clear()
+
+            acc_grads = stored_grads.mean(dim = 1)
+
+            # The original implementation uses .mean() although the original article didn't do it. However, averaging g_big and g_small doesn't affect the noise_scale ratio
+            if self.simulate_multi_gpus: g_small = (stored_grads ** 2).sum(dim=0).mean()
+            else: g_small = (stored_grads[:,-1] ** 2).sum() # .mean()
+
+            g_big = (acc_grads ** 2).sum() # .mean()
+
+            bs = find_bs(self.yb) # or self.dls.train.bs
+            b_small, b_big = bs, bs * self.n_batch
+
+            # NOTE(review): these look like the estimators of Appendix A.1 of the paper, `noise` for the squared
+            # norm of the true gradient and `scale` for the trace of the gradient covariance -- TODO confirm
+            noise = (b_big * g_big - b_small * g_small) / (b_big - b_small)
+            scale = (g_small - g_big) / ((1 / b_small) - (1 / b_big))
+
+            # Debias with the number of EMA updates done so far (one per stored stat). `self.iter` counts
+            # batches, and when simulating GPUs there is only one update every `n_batch` batches
+            n_updates = len(self.stats)
+            self.running_scale, scale = _ema_with_debias(self.running_scale,self.beta,scale,n_updates)
+            self.running_noise, noise = _ema_with_debias(self.running_noise,self.beta,noise,n_updates)
+
+            scale = scale.item()
+            noise = noise.item()
+            # `noise` is a python float here, so a zero estimate would raise `ZeroDivisionError`
+            noise_scale = (scale / noise) if noise != 0 else float('nan')
+
+            # Save results
+            self.stats.append(dict(n_iter=(len(self.stats)) * (1 if self.simulate_multi_gpus else self.n_batch),
+                                   noise=noise, scale=scale, noise_scale=noise_scale))
+
+    def after_fit(self):
+        if self.train_iter < self.num_it: warn("Fitting was too short to complete all expected iterations. Please, increase the number of epochs")
+
+        # Restore the weights saved in `begin_fit` and delete the temporary checkpoint
+        tmp_f = self.path/self.model_dir/'_tmp.pth'
+        if tmp_f.exists():
+            self.learn.load('_tmp')
+            os.remove(tmp_f)
+
+        if hasattr(self.learn, 'recorder'):
+            # Explicit columns keep `set_index` working even when no statistics were collected
+            df = pd.DataFrame(list(self.stats), columns=['n_iter', 'noise', 'scale', 'noise_scale'])
+            df.set_index('n_iter', inplace=True)
+            self.recorder.bs_find_stats = df
+
+    _docs = {"begin_fit": "Initialize container for search results and auxiliary variables and save the model",
+             "after_fit": "Restore the original model and store the collected statistics in the `Recorder`",
+             "after_backward": "Store gradients and compute Simple Noise Scale",
+             "begin_validate": "Skip the validation part of training"}
+
+
+# Cell
+@patch
+def plot_bs_find(self:Recorder):
+    "Plot the result of a BS Finder test (won't work if you didn't do `learn.bs_find()` before)"
+    fig, ax = plt.subplots(1,1)
+    stats = self.bs_find_stats
+    ax.plot(stats.index, stats.noise_scale)
+    ax.set_ylabel("Simple Noise Scale")
+    ax.set_xlabel("# iteration")
+
+# Cell
+@delegates(BSFinder)
+@patch
+def bs_find(self:Learner, lr, num_it=None, n_batch=5, simulate_multi_gpus=True, show_plot=True, **kwargs):
+    """
+    Launch a mock training to find a good batch size to minimize training time.
+    However, it may not be a good batch size to minimize the validation loss.
+
+    A good batch size is where the Simple Noise Scale converges, ignoring the small growing trend
+    with the number of iterations if it exists. The optimal batch size is of the order of magnitude
+    of the value where the Simple Noise Scale converges. Typically, the optimal batch size in image
+    classification problems will be 2-3 times lower than that value.
+    """
+    n_train = len(self.dls.train)
+    num_it = num_it if num_it else n_train
+    num_it *= n_batch if simulate_multi_gpus else 1
+    # Round up: flooring gave 0 epochs (or a run that is too short) when `num_it` is not a whole number of epochs
+    n_epoch = -(-num_it//n_train)
+    cb=BSFinder(num_it=num_it, n_batch=n_batch, simulate_multi_gpus=simulate_multi_gpus, **kwargs)
+    with self.no_logging(): self.fit(n_epoch, lr, cbs=cb)
+    if show_plot: self.recorder.plot_bs_find()
\ No newline at end of file
diff --git a/nbs/75_callback.bs_finder.ipynb b/nbs/75_callback.bs_finder.ipynb
new file mode 100644
index 00000000..2b960a9f
--- /dev/null
+++ b/nbs/75_callback.bs_finder.ipynb
@@ -0,0 +1,1123 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#default_exp callback.bs_finder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#export\n",
+    "from fastai2.basics import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nbdev.showdoc import *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Batch size finder callback\n",
+    "> Callback to apply [An Empirical Model of Large-Batch Training](https://arxiv.org/pdf/1812.06162.pdf) to find the optimal batch size to minimize the training time. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Be aware that this batch size may not be the best one to minimize the validation loss. From the paper:\n",
+    "> Some papers have reported a “generalization gap” in which large batch sizes lead to good training loss but cause a degradation in test loss, apparently unrelated to overfitting. \n",
+    "\n",
+    "Although, also from the paper:\n",
+    "> ... 
but recent work has found no evidence of a generalization gap when hyperparameters are properly tuned.\n", + "\n", + "See section 2.4 \"Assumptions and Caveats\" from the research paper for more caveats of this method. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Helper functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#export\n", + "# Linear combination for the moving average\n", + "def _lin_comb(v1, v2, beta): return beta*v1 + (1-beta)*v2\n", + "\n", + "\n", + "def _ema_with_debias(avg, beta, yi, i):\n", + " \"Exponential moving average with debiasing\"\n", + " if avg is None: avg=0\n", + " avg = _lin_comb(avg, yi, beta)\n", + " return avg, avg/(1-beta**(i+1))\n", + "\n", + "\n", + "def _get_flatten_grads(model):\n", + " parameters = L(model.parameters())\n", + " grads = [param.grad.flatten().view(-1,1) for param in parameters if not type(param.grad) == type(None)]\n", + " grad = torch.cat(grads)\n", + " return grad " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The callback itself" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#export\n", + "class BSFinder(Callback):\n", + " \"A `Callback` that implements \\\"An Empirical Model of Large-Batch Training\\\" (https://arxiv.org/abs/1812.06162)\"\n", + " run_after=Recorder\n", + " \n", + " def __init__(self, num_it:int=None, n_batch=5, beta=0.99, simulate_multi_gpus=True): \n", + " store_attr(self, 'num_it, n_batch, beta, simulate_multi_gpus')\n", + "\n", + " def begin_fit(self):\n", + " # Save original model\n", + " self.learn.save('_tmp')\n", + " \n", + " if not self.num_it: self.num_it = len(self.dls.train) * (self.n_batch if self.simulate_multi_gpus else 1)\n", + " \n", + " self.running_scale = None\n", + " self.running_noise = None\n", + " \n", + " # Use python list instead L fastai list as torch.cat 
doesn't understand the former\n", + " self.stored_grads = []\n", + "\n", + " # Here, we will store the results\n", + " self.stats = L()\n", + " self.count=0\n", + "\n", + " def begin_validate(self): raise CancelValidException()\n", + " \n", + " def after_backward(self): \n", + " if self.train_iter >= self.num_it: raise CancelFitException()\n", + " \n", + " # Get gradients and store them\n", + " self.stored_grads.append(_get_flatten_grads(self.model))\n", + " \n", + " self.count += 1\n", + " if self.count != len(self.stored_grads):\n", + " breakpoint()\n", + "\n", + " if self.simulate_multi_gpus and len(self.stored_grads) < self.n_batch: \n", + " self.opt.zero_grad() # Zero gradients to avoid acumulate them between batches\n", + " #print('a', self.count, self.train_iter, learn.model.embeds[0].weight[0][:3].tolist())\n", + " raise CancelBatchException() #skip weight update\n", + " \n", + " if len(self.stored_grads) == self.n_batch: \n", + " self.count=0\n", + " #print('b', self.count, self.train_iter, learn.model.embeds[0].weight[0][:3].tolist())\n", + " # We have enough batches to compute Simple Noise Scale ratio.\n", + " \n", + " # We concatenate the batches and empty the buffer\n", + " stored_grads = torch.cat(self.stored_grads, dim=1)\n", + " self.stored_grads.clear()\n", + " \n", + " acc_grads = stored_grads.mean(dim = 1)\n", + " \n", + " # The original implementation uses .mean() although in the original article didn't do it. 
However, averaging g_big and g_small doesn't affect to noise_scale ratio \n", + " if self.simulate_multi_gpus: g_small = (stored_grads ** 2).sum(dim=0).mean() \n", + " else: g_small = (stored_grads[:,-1] ** 2).sum() # .mean()\n", + " \n", + " # print((stored_grads ** 2).sum(dim=0).mean().item(), (stored_grads[:,-1] ** 2).sum().item(), (stored_grads ** 2).sum(dim=0).tolist())\n", + " \n", + " g_big = (acc_grads ** 2).sum() # .mean()\n", + " \n", + " bs = find_bs(self.yb) # or self.dls.train.bs\n", + " b_small, b_big = bs, bs * self.n_batch\n", + " \n", + " noise = (b_big * g_big - b_small * g_small) / (b_big - b_small)\n", + " scale = (g_small - g_big) / ((1 / b_small) - (1 / b_big))\n", + "\n", + " self.running_scale, scale = _ema_with_debias(self.running_scale,self.beta,scale,self.iter)\n", + " self.running_noise, noise = _ema_with_debias(self.running_noise,self.beta,noise,self.iter)\n", + "\n", + " scale = scale.item()\n", + " noise = noise.item()\n", + " noise_scale = (scale / noise)\n", + " \n", + " # Save results\n", + " self.stats.append(dict(n_iter=(len(self.stats)) * (1 if self.simulate_multi_gpus else self.n_batch),\n", + " noise=noise, scale=scale, noise_scale=noise_scale))\n", + " \n", + " def after_fit(self):\n", + " if self.train_iter < self.num_it: warn(f\"Fitting was too short to complete all expectediterations. 
Please, increase the number of epochs\")\n", + " \n", + " tmp_f = self.path/self.model_dir/'_tmp.pth'\n", + " if tmp_f.exists():\n", + " self.learn.load('_tmp')\n", + " os.remove(tmp_f)\n", + " \n", + " if hasattr(self.learn, 'recorder'): \n", + " # index = pd.Index(torch.arange(1, len(self.stats)+1)*self.n_batch, name='n_iter')\n", + " df = pd.DataFrame(self.stats)#, index=index)\n", + " df.set_index('n_iter', inplace=True)\n", + " self.recorder.bs_find_stats = df\n", + " \n", + " _docs = {\"begin_fit\": \"Initialize container for search results and auxiliary variables and save the model\",\n", + " \"after_fit\": \"Record hyper-parameters of this batch and potentially stop training\",\n", + " \"after_backward\": \"Store gradients and compute Simple Noise Scale\",\n", + " \"begin_validate\": \"Skip the validation part of training\"}\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "add_docs(BSFinder, **BSFinder._docs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "

BSFinder.begin_fit[source]

\n", + "\n", + "> BSFinder.begin_fit()\n", + "\n", + "Initialize container for search results and auxiliary variables and save the model" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "show_doc(BSFinder.begin_fit)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "

BSFinder.after_fit[source]

\n", + "\n", + "> BSFinder.after_fit()\n", + "\n", + "Record hyper-parameters of this batch and potentially stop training" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "show_doc(BSFinder.after_fit)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "

BSFinder.after_backward[source]

\n", + "\n", + "> BSFinder.after_backward()\n", + "\n", + "Store gradients and compute Simple Noise Scale" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "show_doc(BSFinder.after_backward)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "

BSFinder.begin_validate[source]

\n", + "\n", + "> BSFinder.begin_validate()\n", + "\n", + "Skip the validation part of training" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "show_doc(BSFinder.begin_validate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Methods for `Recorder` and `Learner` classes related to `BSFinder` callback. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#export\n", + "@patch\n", + "def plot_bs_find(self:Recorder):\n", + " \"Plot the result of an BS Finder test (won't work if you didn't do `learn.bs_find()` before)\"\n", + " fig, ax = plt.subplots(1,1)\n", + " stats = self.bs_find_stats\n", + " ax.plot(stats.index, stats.noise_scale)\n", + " ax.set_ylabel(\"Simple Noise Scale\")\n", + " ax.set_xlabel(\"# iteration\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#export\n", + "@delegates(BSFinder)\n", + "@patch\n", + "def bs_find(self:Learner, lr, num_it=None, n_batch=5, simulate_multi_gpus=True, show_plot=True, **kwargs):\n", + " \"\"\"\n", + " Launch a mock training to find a good batch size to minimaze training time. \n", + " However, it may not be a good batch size to minimize the validation loss. \n", + " \n", + " A good batch size is where the Simple Noise Scale converge ignoring the small growing trend \n", + " with the number of iterations if exists. The optimal batch size is about an order the magnitud\n", + " where Simple Noise scale converge. 
Typically, the optimial batch size in image classification \n", + " problems will be 2-3 times lower where \n", + " \"\"\" \n", + " num_it = num_it if num_it else len(self.dls.train)\n", + " num_it *= n_batch if simulate_multi_gpus else 1\n", + " n_epoch = num_it//len(self.dls.train)\n", + " cb=BSFinder(num_it=num_it, n_batch=n_batch, simulate_multi_gpus=simulate_multi_gpus, **kwargs)\n", + " with self.no_logging(): self.fit(n_epoch, lr, cbs=cb)\n", + " if show_plot: self.recorder.plot_bs_find()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "

class BSFinder[source]

\n", + "\n", + "> BSFinder(**`num_it`**:`int`=*`None`*, **`n_batch`**=*`5`*, **`beta`**=*`0.99`*, **`simulate_multi_gpus`**=*`True`*) :: [`Callback`](/callback.core#Callback)\n", + "\n", + "A [`Callback`](/callback.core#Callback) that implements \"An Empirical Model of Large-Batch Training\" (https://arxiv.org/abs/1812.06162)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "show_doc(BSFinder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`num_it` is the number of batches you want to process. It can be set to `None` and it will automatically train during one epoch (or `n_batch` epochs if `simulate_multi_gpus` is set to `True`). `n_batch` is the number of batches you want to store before computing the Simple Noise Scale. 5 to 20 seems to work well across different tasks.\n", + "\n", + "`beta` is the beta parameter for an exponential moving average to compute the sum of variances, and the scale of the gradient. If the plot is too irregular, try increasing it to 0.999 or more if needed, or increase the `n_batch` parameter.\n", + "\n", + "`simulate_multi_gpus` controls whether a multi GPU setup is simulated with `n_batch` GPUs to match the original research paper implementation. The multi GPU setup is simulated by running `n_batch` batches without updating the model weights. Setting it to `False` uses the [DanyWind approximation](https://towardsdatascience.com/implementing-a-batch-size-finder-in-fastai-how-to-get-a-4x-speedup-with-better-generalization-813d686f6bdf). It's faster but numerically more unstable and finds a Simple Noise Scale smaller than the original Simple Noise Scale. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to select a good batch size with `BSFinder` to speed up the training?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. First, let's quickly create the `dls` where we can use a very big batch size. 
Here, it doesn't matter which batch size we use\n", + "2. Second, find a good learning rate with `Learner.lr_find`.\n", + "3. Third, find a good batch size with `Learner.bs_find`. **Remember** that this batch size is optimized for training speed but **not** necessarily to minimaze validation error. Regardless, it gives you a reference to the max batch size to use.\n", + "4. Fourth, recreate `dls` with the batch size\n", + "5. Fifth, find again a good learning rate with `Learner.lr_find` as usually the best lr depends on the batch size. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fastai2.tabular.all import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the `dls`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "path = untar_data(URLs.ADULT_SAMPLE)\n", + "df = pd.read_csv(path/'adult.csv')\n", + "\n", + "cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']\n", + "cont_names = ['age', 'fnlwgt', 'education-num']\n", + "\n", + "procs = [Categorify, FillMissing, Normalize]\n", + "y_names = 'salary'\n", + "y_block = CategoryBlock()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names,\n", + " y_names=y_names, y_block=CategoryBlock(), splits=RandomSplitter()(range_of(df)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dls = to.dataloaders()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
workclasseducationmarital-statusoccupationrelationshipraceeducation-num_naagefnlwgteducation-numsalary
0Federal-govSome-collegeMarried-civ-spouseAdm-clericalHusbandWhiteFalse61.000000136786.99969110.0>=50k
1PrivateHS-gradWidowedOther-serviceUnmarriedWhiteFalse42.000000153160.0012839.0<50k
2State-govBachelorsMarried-civ-spouseExec-managerialHusbandWhiteFalse44.000000175696.00041113.0>=50k
3PrivateBachelorsSeparatedAdm-clericalUnmarriedBlackFalse40.000000289403.00207113.0<50k
4PrivateBachelorsDivorcedProf-specialtyUnmarriedWhiteFalse54.99999980167.00411113.0<50k
5Private7th-8thMarried-civ-spouseCraft-repairHusbandWhiteFalse25.000000294400.0022684.0<50k
6PrivateHS-gradMarried-civ-spouseHandlers-cleanersHusbandBlackFalse43.000000477983.0074149.0<50k
7PrivateMastersNever-marriedProf-specialtyNot-in-familyWhiteFalse30.000000426431.00660614.0>=50k
8Local-govMastersMarried-civ-spouseProf-specialtyHusbandWhiteFalse51.000000108435.00061914.0>=50k
9PrivateSome-collegeNever-marriedSalesOwn-childWhiteFalse19.000001389142.99869910.0<50k
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dls.show_batch()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default batch size is:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "64" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dls.bs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find a good learning rate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "learn = tabular_learner(dls, [200,100], metrics=accuracy)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "learn.lr_find(show_plot=False)\n", + "learn.recorder.plot_lr_find(skip_end=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find a good batch size." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "learn.bs_find(1e-3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also access in the recorder with `Learner.recorder.plot_bs_find()` or access to the raw stats with `Learner.recorder.bs_find_stats`. Here we only show the first rows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
noisescalenoise_scale
n_iter
00.96715295.02965598.257166
10.74873095.985123128.197203
20.67157485.072830126.676711
30.56401684.955147150.625408
40.49414186.296661174.639852
\n", + "
" + ], + "text/plain": [ + " noise scale noise_scale\n", + "n_iter \n", + "0 0.967152 95.029655 98.257166\n", + "1 0.748730 95.985123 128.197203\n", + "2 0.671574 85.072830 126.676711\n", + "3 0.564016 84.955147 150.625408\n", + "4 0.494141 86.296661 174.639852" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "learn.recorder.bs_find_stats.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A good batch size is were Simple Noise Scale converge. It's expected to exist a small growing trend with the number of iterations. In case that Simple Noise Scale doesn't converge, try to increase `n_batch`. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, Simple Noise Scale has converged around 2000 or 2500. So a good batch size will be 2048 as they are powers of 2. Again, remember that this batch size is optimized for training speed but **not** to necessarily minimaze validation error. So, it gives you a reference to the max batch size to use." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we recreate the dataloaders with the good batch size." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "learn.dls = to.dataloaders(bs=2048)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lastly, find a good learning rate for this batch size." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "SuggestedLRs(lr_min=0.025118863582611083, lr_steep=0.0030199517495930195)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "learn.lr_find()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See some comparisons in the training speed. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
00.5583740.6251990.76658500:00
10.4628990.5169870.77641300:00
20.4196760.4673840.79453300:00
30.3965230.4390860.81019700:00
40.3829780.4112330.82616700:00
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn.fit_one_cycle(5, 1e-3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the optimized batch size, training is almost instantaneous." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In contrast, with default batch size:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
00.3696080.3584640.83369200:04
10.3472420.3586080.83507400:04
20.3479590.3518340.83922000:04
30.3430890.3469790.84229100:04
40.3331520.3471570.84213800:04
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn.dls = dls\n", + "learn.fit_one_cycle(5, 1e-3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have an infinite speed increase :D" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just remember, this batch size is optimized for training speed but may not minimaze the validation error. Regardless, it gives you a reference to the max batch size to use." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}