From e813af396a81fccd6d73bd2abbebf5e2eeca793a Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Fri, 3 Sep 2021 12:16:51 +0000 Subject: [PATCH 1/9] split minimize() to step() + update() --- .../paddle/fluid/dygraph/amp/loss_scaler.py | 122 +++++++++++++++++- 1 file changed, 121 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index a9fe2c9f3ed7b..193db28e22dfc 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -21,10 +21,22 @@ import warnings import numpy as np from paddle import _C_ops +from collections import defaultdict +from enum import Enum __all__ = ['AmpScaler'] +class OptimizerState(Enum): + INIT = 0 + UNSCALED = 1 + STEPPED = 2 + + +def _refresh_optimizer_state(): + return {"state": OptimizerState.INIT} + + class AmpScaler(object): """ :api_attr: imperative @@ -117,6 +129,7 @@ def __init__(self, self._scale = to_variable( np.array([self._init_loss_scaling]).astype(np.float32)) self._cache_founf_inf = None + self._optimizer_states = defaultdict(_refresh_optimizer_state) def scale(self, var): """ @@ -155,6 +168,88 @@ def scale(self, var): return var * self._scale + def step(self, optimizer, *args, **kwargs): + """ + This function is similar as `Optimizer.step()`, which performs parameters updating. + + If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. + Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + Args: + optimizer(Optimizer): The optimizer used to update parameters. + args: Arguments, which will be forward to `optimizer.minimize()`. + kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`. + Examples: + .. code-block:: python + import numpy as np + import paddle.fluid as fluid + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer, scaled) + scaler.update() + """ + if not self._enable: + return optimizer.minimize(*args, **kwargs) + + optimizer_state = self._optimizer_states[id(optimizer)] + if optimizer_state["state"] is OptimizerState.STEPPED: + raise RuntimeError( + "step() has already been called since the last update().") + + # unscale the grad + if optimizer_state["state"] is OptimizerState.INIT: + self._unscale(optimizer) + + optimize_ops, params_grads = (None, None) + + if self._found_inf: + self._cache_founf_inf = True + else: + optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) + self._cache_founf_inf = False + + optimizer_state["state"] = OptimizerState.STEPPED + + return optimize_ops, params_grads + + def update(self): + """ + This function is used to update loss scaling ratio. + Examples: + .. 
code-block:: python + import numpy as np + import paddle.fluid as fluid + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer, scaled) + scaler.update() + """ + if not self._enable: + return + if self._use_dynamic_loss_scaling: + self._update() + self._optimizer_states = defaultdict(_refresh_optimizer_state) + return + def minimize(self, optimizer, *args, **kwargs): """ This function is similar as `Optimizer.minimize()`, which performs parameters updating. @@ -192,8 +287,14 @@ def minimize(self, optimizer, *args, **kwargs): if not self._enable: return optimizer.minimize(*args, **kwargs) + optimizer_state = self._optimizer_states[id(optimizer)] + if optimizer_state["state"] is OptimizerState.STEPPED: + raise RuntimeError( + "step() has already been called since the last update().") + # unscale the grad - self._unscale(optimizer) + if optimizer_state["state"] is OptimizerState.INIT: + self._unscale(optimizer) optimize_ops, params_grads = (None, None) @@ -210,9 +311,26 @@ def minimize(self, optimizer, *args, **kwargs): return optimize_ops, params_grads def _unscale(self, optimizer): + """ + Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). + If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. + Args: + optimizer(Optimizer): The optimizer used to update parameters. + Returns: + The unscaled parameters or original parameters. + """ if not self._enable: return + optimizer_state = self._optimizer_states[id(optimizer)] + + if optimizer_state["state"] is OptimizerState.UNSCALED: + raise RuntimeError( + "unscale_() has already been called on this optimizer since the last update()." + ) + elif optimizer_state["state"] is OptimizerState.STEPPED: + raise RuntimeError("unscale_() is being called after step().") + if getattr(optimizer, '_param_groups', None) and isinstance( optimizer._param_groups[0], dict): param_grads = [] @@ -228,6 +346,8 @@ def _unscale(self, optimizer): _C_ops.check_finite_and_unscale(param_grads, self._scale, param_grads, self._found_inf) + optimizer_state["state"] = OptimizerState.UNSCALED + def _update(self): """ Updates the loss_scaling. 
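The heart of PATCH 1/9 is the per-optimizer state machine (INIT -> UNSCALED -> STEPPED) that lets `step()` and `update()` be called separately while still catching misuse. Below is a minimal, self-contained sketch of just that guard logic; the toy scaler class and helper comments are illustrative stand-ins for the real `AmpScaler`, not Paddle code.

.. code-block:: python

    # Standalone sketch of the per-optimizer state machine added in PATCH 1/9.
    # Only the bookkeeping is reproduced; unscaling and the actual parameter
    # update are reduced to comments.
    from collections import defaultdict
    from enum import Enum


    class OptimizerState(Enum):
        INIT = 0       # nothing done for this optimizer in the current iteration
        UNSCALED = 1   # gradients already divided by the loss scale
        STEPPED = 2    # parameters already updated; only update() may follow


    def _refresh_optimizer_state():
        return {"state": OptimizerState.INIT}


    class ToyScaler(object):
        def __init__(self):
            self._optimizer_states = defaultdict(_refresh_optimizer_state)

        def step(self, optimizer):
            state = self._optimizer_states[id(optimizer)]
            if state["state"] is OptimizerState.STEPPED:
                raise RuntimeError(
                    "step() has already been called since the last update().")
            if state["state"] is OptimizerState.INIT:
                pass  # the real scaler unscales the gradients here (_unscale)
            optimizer.step()  # the real scaler skips this when inf/nan was found
            state["state"] = OptimizerState.STEPPED

        def update(self):
            # the real scaler also adjusts the loss scaling ratio here
            self._optimizer_states = defaultdict(_refresh_optimizer_state)

Calling `step()` twice on the same optimizer without an intervening `update()` hits the RuntimeError above, which is exactly the behaviour the unit tests added in PATCH 8/9 assert.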
From 048bd89589742769be90b5ca9d7849a72e7d2bef Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 9 Sep 2021 13:12:49 +0000 Subject: [PATCH 2/9] add unscale and step for grad_scaler --- python/paddle/amp/grad_scaler.py | 63 +++++++++++++++---- .../paddle/fluid/dygraph/amp/loss_scaler.py | 12 ++-- 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 5c3b575f2f069..eec908f450502 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -173,21 +173,58 @@ def step(self, optimizer): scaler.step(optimizer) optimizer.clear_grad() """ - if not self._enable: - return optimizer.step() + return super(GradScaler, self).step(optimizer) - # unscale the grad - self._unscale(optimizer) - - if self._found_inf: - self._cache_founf_inf = True - else: - optimizer.step() - self._cache_founf_inf = False + def update(self): + """ + Updates the loss_scaling. + + Examples: + .. code-block:: python + import paddle + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.unscale(optimizer) # unscale the parameter + scaler.step(optimizer, scaled) + scaler.update() # update the loss scaling ratio + optimizer.clear_grad() + """ + return super(GradScaler, self).update() - if self._use_dynamic_loss_scaling: - # uopdate the scale - self._update() + def unscale(self, optimizer): + """ + Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). + If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. + Args: + optimizer(Optimizer): The optimizer used to update parameters. + Returns: + The unscaled parameters or original parameters. + + Examples: + .. code-block:: python + import paddle + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.unscale(optimizer) # unscale the parameter + scaler.step(optimizer, scaled) + scaler.update() + optimizer.clear_grad() + """ + return super(GradScaler, self)._unscale(optimizer) def is_enable(self): """ diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 193db28e22dfc..1d2421feeff19 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -168,7 +168,7 @@ def scale(self, var): return var * self._scale - def step(self, optimizer, *args, **kwargs): + def step(self, optimizer): """ This function is similar as `Optimizer.step()`, which performs parameters updating. 
@@ -194,11 +194,11 @@ def step(self, optimizer, *args, **kwargs): loss = fluid.layers.reduce_mean(conv) scaled = scaler.scale(loss) scaled.backward() - scaler.step(optimizer, scaled) + scaler.step(optimizer) scaler.update() """ if not self._enable: - return optimizer.minimize(*args, **kwargs) + return optimizer.step() optimizer_state = self._optimizer_states[id(optimizer)] if optimizer_state["state"] is OptimizerState.STEPPED: @@ -209,18 +209,14 @@ def step(self, optimizer, *args, **kwargs): if optimizer_state["state"] is OptimizerState.INIT: self._unscale(optimizer) - optimize_ops, params_grads = (None, None) - if self._found_inf: self._cache_founf_inf = True else: - optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) + optimizer.step() self._cache_founf_inf = False optimizer_state["state"] = OptimizerState.STEPPED - return optimize_ops, params_grads - def update(self): """ This function is used to update loss scaling ratio. From eaede1c6c9dd895b5f865aced468fa2061fb365a Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Fri, 17 Sep 2021 03:57:11 +0000 Subject: [PATCH 3/9] add unittest --- python/paddle/amp/grad_scaler.py | 82 ++++++++----- .../paddle/fluid/dygraph/amp/loss_scaler.py | 116 ++++++++++-------- .../test_imperative_auto_mixed_precision.py | 3 +- 3 files changed, 113 insertions(+), 88 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index eec908f450502..52705cf0ff5a8 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -21,10 +21,14 @@ class GradScaler(AmpScaler): """ GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode. It controls the scaling of loss, helps avoiding numerical overflow. - The object of this class has two methods `scale()`, `minimize()`. + The object of this class has nineteen methods `scale()`, `unscale()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters. `scale()` is used to multiply the loss by a scale ratio. - `minimize()` is similar as `optimizer.minimize()`, performs parameters updating. + `unscale()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) + `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling, it equal to `step()` + `update()`. + `step()` is similar as `optimizer.step()`, which performs parameters updating. + `update` is used to update the loss_scaling. + Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in dynamic graph mode. @@ -115,7 +119,7 @@ def minimize(self, optimizer, *args, **kwargs): This function is similar as `optimizer.minimize()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + Otherwise, if `unscale()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. Finally, the loss scaling ratio is updated. @@ -151,16 +155,18 @@ def step(self, optimizer): This function is similar as `optimizer.step()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. 
+ Otherwise, if `unscale()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. Args: optimizer(Optimizer): The optimizer used to update parameters. Examples: + .. code-block:: python # required: gpu import paddle + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) scaler = paddle.amp.GradScaler(init_loss_scaling=1024) @@ -170,7 +176,8 @@ def step(self, optimizer): loss = paddle.mean(conv) scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward - scaler.step(optimizer) + scaler.step(optimizer) # update parameters + scaler.update() # update the loss scaling ratio optimizer.clear_grad() """ return super(GradScaler, self).step(optimizer) @@ -180,21 +187,24 @@ def update(self): Updates the loss_scaling. Examples: + .. code-block:: python - import paddle - model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) - optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - data = paddle.rand([10, 3, 32, 32]) - with paddle.amp.auto_cast(): - conv = model(data) - loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss - scaled.backward() # do backward - scaler.unscale(optimizer) # unscale the parameter - scaler.step(optimizer, scaled) - scaler.update() # update the loss scaling ratio - optimizer.clear_grad() + + # required: gpu + import paddle + + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.step(optimizer) # update parameters + scaler.update() # update the loss scaling ratio + optimizer.clear_grad() """ return super(GradScaler, self).update() @@ -202,27 +212,33 @@ def unscale(self, optimizer): """ Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. + Args: optimizer(Optimizer): The optimizer used to update parameters. + Returns: The unscaled parameters or original parameters. Examples: + .. 
code-block:: python - import paddle - model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) - optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - data = paddle.rand([10, 3, 32, 32]) - with paddle.amp.auto_cast(): - conv = model(data) - loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss - scaled.backward() # do backward - scaler.unscale(optimizer) # unscale the parameter - scaler.step(optimizer, scaled) - scaler.update() - optimizer.clear_grad() + + # required: gpu + import paddle + + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.unscale(optimizer) # unscale the parameter + scaler.step(optimizer, scaled) + scaler.update() + optimizer.clear_grad() """ return super(GradScaler, self)._unscale(optimizer) diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 1d2421feeff19..cf1404eff31cc 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -43,10 +43,13 @@ class AmpScaler(object): AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative mode. It controls the scaling of loss, helps avoiding numerical overflow. - The object of this class has two methods `scale()`, `minimize()`. + The object of this class has nineteen methods `scale()`, `unscale()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters. `scale()` is used to multiply the loss by a scale ratio. - `minimize()` is similar as `Optimizer.minimize()`, performs parameters updating. + `unscale()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) + `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling, it equal to `step()` + `update()`. + `step()` is similar as `optimizer.step()`, which performs parameters updating. + `update` is used to update the loss_scaling. Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in imperative mode. @@ -173,29 +176,31 @@ def step(self, optimizer): This function is similar as `Optimizer.step()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + Otherwise, if `unscale()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. + Args: optimizer(Optimizer): The optimizer used to update parameters. - args: Arguments, which will be forward to `optimizer.minimize()`. - kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`. + Examples: + .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - with fluid.dygraph.guard(): - model = fluid.dygraph.Conv2D(3, 2, 3) - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters()) - scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) - data = fluid.dygraph.to_variable(data) - with fluid.dygraph.amp_guard(): - conv = model(data) - loss = fluid.layers.reduce_mean(conv) - scaled = scaler.scale(loss) - scaled.backward() - scaler.step(optimizer) - scaler.update() + + import numpy as np + import paddle.fluid as fluid + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.update() """ if not self._enable: return optimizer.step() @@ -220,24 +225,27 @@ def step(self, optimizer): def update(self): """ This function is used to update loss scaling ratio. + Examples: + .. code-block:: python - import numpy as np - import paddle.fluid as fluid - data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - with fluid.dygraph.guard(): - model = fluid.dygraph.Conv2D(3, 2, 3) - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters()) - scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) - data = fluid.dygraph.to_variable(data) - with fluid.dygraph.amp_guard(): - conv = model(data) - loss = fluid.layers.reduce_mean(conv) - scaled = scaler.scale(loss) - scaled.backward() - scaler.step(optimizer, scaled) - scaler.update() + + import numpy as np + import paddle.fluid as fluid + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.update() """ if not self._enable: return @@ -251,7 +259,7 @@ def minimize(self, optimizer, *args, **kwargs): This function is similar as `Optimizer.minimize()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + Otherwise, if `unscale()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. Finally, the loss scaling ratio is updated. @@ -263,22 +271,22 @@ def minimize(self, optimizer, *args, **kwargs): Examples: .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - - data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - with fluid.dygraph.guard(): - model = fluid.dygraph.Conv2D(3, 2, 3) - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters()) - scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) - data = fluid.dygraph.to_variable(data) - with fluid.dygraph.amp_guard(): - conv = model(data) - loss = fluid.layers.reduce_mean(conv) - scaled = scaler.scale(loss) - scaled.backward() - scaler.minimize(optimizer, scaled) + import numpy as np + import paddle.fluid as fluid + + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) """ if not self._enable: return optimizer.minimize(*args, **kwargs) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 330c4c5ffec3d..f4185e70b2bdc 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -481,8 +481,9 @@ def train_resnet(self, scaled_loss = scaler.scale(avg_loss) scaled_loss.backward() - + scaler.unscale(optimizer) scaler.step(optimizer) + scaler.update() dy_grad_value = {} for param in resnet.parameters(): From 4244cdfe36e0c5e4b34997546e056c433b82a4b3 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Fri, 17 Sep 2021 06:50:19 +0000 Subject: [PATCH 4/9] refine code in minimize --- python/paddle/fluid/dygraph/amp/loss_scaler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index cf1404eff31cc..889758ca3d950 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -311,6 +311,7 @@ def minimize(self, optimizer, *args, **kwargs): if self._use_dynamic_loss_scaling: # uopdate the scale self._update() + self._optimizer_states = defaultdict(_refresh_optimizer_state) return optimize_ops, params_grads From 11c39f07e9738cccb96d9b79c0a4dc41cdb709d9 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Fri, 17 Sep 2021 08:12:08 +0000 Subject: [PATCH 5/9] delete step in loss_scaler --- python/paddle/amp/grad_scaler.py | 45 +++++++-- .../paddle/fluid/dygraph/amp/loss_scaler.py | 91 +------------------ .../test_imperative_auto_mixed_precision.py | 2 +- 3 files changed, 42 insertions(+), 96 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 52705cf0ff5a8..81fd2481a2732 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -13,18 +13,24 @@ # limitations under the License. 
from paddle.fluid.dygraph.amp import AmpScaler +from paddle.fluid.dygraph.amp import OptimizerState +from collections import defaultdict __all__ = [] +def _refresh_optimizer_state(): + return {"state": OptimizerState.INIT} + + class GradScaler(AmpScaler): """ GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode. It controls the scaling of loss, helps avoiding numerical overflow. - The object of this class has nineteen methods `scale()`, `unscale()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters. + The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters. `scale()` is used to multiply the loss by a scale ratio. - `unscale()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) + `unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling, it equal to `step()` + `update()`. `step()` is similar as `optimizer.step()`, which performs parameters updating. `update` is used to update the loss_scaling. @@ -119,7 +125,7 @@ def minimize(self, optimizer, *args, **kwargs): This function is similar as `optimizer.minimize()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, if `unscale()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. + Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. Finally, the loss scaling ratio is updated. @@ -155,7 +161,7 @@ def step(self, optimizer): This function is similar as `optimizer.step()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, if `unscale()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. + Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. Args: optimizer(Optimizer): The optimizer used to update parameters. 
@@ -180,7 +186,25 @@ def step(self, optimizer): scaler.update() # update the loss scaling ratio optimizer.clear_grad() """ - return super(GradScaler, self).step(optimizer) + if not self._enable: + return optimizer.step() + + optimizer_state = self._optimizer_states[id(optimizer)] + if optimizer_state["state"] is OptimizerState.STEPPED: + raise RuntimeError( + "step() has already been called since the last update().") + + # unscale the grad + if optimizer_state["state"] is OptimizerState.INIT: + self._unscale(optimizer) + + if self._found_inf: + self._cache_founf_inf = True + else: + optimizer.step() + self._cache_founf_inf = False + + optimizer_state["state"] = OptimizerState.STEPPED def update(self): """ @@ -206,9 +230,14 @@ def update(self): scaler.update() # update the loss scaling ratio optimizer.clear_grad() """ - return super(GradScaler, self).update() + if not self._enable: + return + if self._use_dynamic_loss_scaling: + self._update() + self._optimizer_states = defaultdict(_refresh_optimizer_state) + return - def unscale(self, optimizer): + def unscale_(self, optimizer): """ Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. @@ -235,7 +264,7 @@ def unscale(self, optimizer): loss = paddle.mean(conv) scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward - scaler.unscale(optimizer) # unscale the parameter + scaler.unscale_(optimizer) # unscale the parameter scaler.step(optimizer, scaled) scaler.update() optimizer.clear_grad() diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 889758ca3d950..5a2512f655f4b 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -24,7 +24,7 @@ from collections import defaultdict from enum import Enum -__all__ = ['AmpScaler'] +__all__ = ['AmpScaler', 'OptimizerState'] class OptimizerState(Enum): @@ -43,10 +43,10 @@ class AmpScaler(object): AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative mode. It controls the scaling of loss, helps avoiding numerical overflow. - The object of this class has nineteen methods `scale()`, `unscale()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters. + The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters. `scale()` is used to multiply the loss by a scale ratio. - `unscale()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) + `unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling, it equal to `step()` + `update()`. `step()` is similar as `optimizer.step()`, which performs parameters updating. `update` is used to update the loss_scaling. @@ -171,95 +171,12 @@ def scale(self, var): return var * self._scale - def step(self, optimizer): - """ - This function is similar as `Optimizer.step()`, which performs parameters updating. - - If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, if `unscale()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. 
- - Args: - optimizer(Optimizer): The optimizer used to update parameters. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle.fluid as fluid - data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - with fluid.dygraph.guard(): - model = fluid.dygraph.Conv2D(3, 2, 3) - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters()) - scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) - data = fluid.dygraph.to_variable(data) - with fluid.dygraph.amp_guard(): - conv = model(data) - loss = fluid.layers.reduce_mean(conv) - scaled = scaler.scale(loss) - scaled.backward() - scaler.step(optimizer) - scaler.update() - """ - if not self._enable: - return optimizer.step() - - optimizer_state = self._optimizer_states[id(optimizer)] - if optimizer_state["state"] is OptimizerState.STEPPED: - raise RuntimeError( - "step() has already been called since the last update().") - - # unscale the grad - if optimizer_state["state"] is OptimizerState.INIT: - self._unscale(optimizer) - - if self._found_inf: - self._cache_founf_inf = True - else: - optimizer.step() - self._cache_founf_inf = False - - optimizer_state["state"] = OptimizerState.STEPPED - - def update(self): - """ - This function is used to update loss scaling ratio. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle.fluid as fluid - data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - with fluid.dygraph.guard(): - model = fluid.dygraph.Conv2D(3, 2, 3) - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters()) - scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) - data = fluid.dygraph.to_variable(data) - with fluid.dygraph.amp_guard(): - conv = model(data) - loss = fluid.layers.reduce_mean(conv) - scaled = scaler.scale(loss) - scaled.backward() - scaler.step(optimizer) - scaler.update() - """ - if not self._enable: - return - if self._use_dynamic_loss_scaling: - self._update() - self._optimizer_states = defaultdict(_refresh_optimizer_state) - return - def minimize(self, optimizer, *args, **kwargs): """ This function is similar as `Optimizer.minimize()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. - Otherwise, if `unscale()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. + Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. Finally, the loss scaling ratio is updated. 
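With the changes above, the public `GradScaler` offers two equivalent granularities, as its updated class docstring states: `minimize()` behaves like `step()` followed by `update()`. The sketch below shows the two call patterns side by side; the model and data are the same placeholders used in the docstring examples.

.. code-block:: python

    # Hedged usage sketch: only the scaler calls matter here.
    import paddle

    model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
    optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                     parameters=model.parameters())
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    data = paddle.rand([10, 3, 32, 32])
    with paddle.amp.auto_cast():
        loss = paddle.mean(model(data))
    scaled = scaler.scale(loss)
    scaled.backward()

    # Path A: the original one-call API, kept for compatibility.
    # scaler.minimize(optimizer, scaled)

    # Path B: the split API added by this series.
    scaler.step(optimizer)   # parameter update is skipped internally on inf/nan
    scaler.update()          # adjust the loss scaling ratio
    optimizer.clear_grad()

The split is what makes the new `unscale_()` useful: user code can act on the true-scale gradients after `unscale_()` and before `step()`, and `update()` then adjusts the loss scale once per iteration.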
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index f4185e70b2bdc..3c9dc7fc53ec9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -481,7 +481,7 @@ def train_resnet(self, scaled_loss = scaler.scale(avg_loss) scaled_loss.backward() - scaler.unscale(optimizer) + scaler.unscale_(optimizer) scaler.step(optimizer) scaler.update() From eeb4c1cac96cc5b5b3bcceb1ccbcc9f87ae15340 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Fri, 17 Sep 2021 08:32:34 +0000 Subject: [PATCH 6/9] fix example bug --- python/paddle/amp/grad_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 81fd2481a2732..d3764a6ebd84a 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -265,7 +265,7 @@ def unscale_(self, optimizer): scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward scaler.unscale_(optimizer) # unscale the parameter - scaler.step(optimizer, scaled) + scaler.step(optimizer) scaler.update() optimizer.clear_grad() """ From 2a1cbf9688f49060319be430d0e7c648c34432c6 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Fri, 17 Sep 2021 12:05:20 +0000 Subject: [PATCH 7/9] refine comment --- python/paddle/fluid/dygraph/amp/loss_scaler.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 5a2512f655f4b..cf48151d6d8d1 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -43,13 +43,11 @@ class AmpScaler(object): AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative mode. It controls the scaling of loss, helps avoiding numerical overflow. - The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters. + The object of this class has seventeen methods `scale()`, `unscale_()`, `minimize()` and `get`/`set` api of parameters. `scale()` is used to multiply the loss by a scale ratio. `unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) - `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling, it equal to `step()` + `update()`. - `step()` is similar as `optimizer.step()`, which performs parameters updating. - `update` is used to update the loss_scaling. + `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling. Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in imperative mode. 
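Because the scaler's bookkeeping is a `defaultdict` keyed by `id(optimizer)` and `update()` rebuilds the whole dict, one scaler appears intended to serve several optimizers in the same iteration, each stepped independently before a single `update()`. The following is a speculative sketch of that pattern, not something the patch series itself demonstrates; the two linear models and their losses are placeholders.

.. code-block:: python

    # Hypothetical two-optimizer iteration; only the scaler calls follow the
    # API added in this series.
    import paddle

    net_a = paddle.nn.Linear(16, 16)
    net_b = paddle.nn.Linear(16, 1)
    opt_a = paddle.optimizer.SGD(learning_rate=0.01, parameters=net_a.parameters())
    opt_b = paddle.optimizer.SGD(learning_rate=0.01, parameters=net_b.parameters())
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    data = paddle.rand([8, 16])
    with paddle.amp.auto_cast():
        loss_a = paddle.mean(net_a(data))
        loss_b = paddle.mean(net_b(data))

    scaler.scale(loss_a).backward()
    scaler.scale(loss_b).backward()

    scaler.step(opt_a)   # opt_a's entry moves INIT -> STEPPED
    scaler.step(opt_b)   # opt_b has its own entry, so this does not raise
    scaler.update()      # a single update() resets every optimizer to INIT

    opt_a.clear_grad()
    opt_b.clear_grad()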
From ff76fb46996ecfff3ab4d77f8fdadf2a926510f8 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Sat, 18 Sep 2021 08:50:06 +0000 Subject: [PATCH 8/9] refine unittest --- python/paddle/amp/grad_scaler.py | 3 ++ .../paddle/fluid/dygraph/amp/loss_scaler.py | 40 ++++++++-------- .../test_imperative_auto_mixed_precision.py | 46 +++++++++++++++++++ 3 files changed, 69 insertions(+), 20 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index d3764a6ebd84a..83f57fc74e89a 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -206,6 +206,9 @@ def step(self, optimizer): optimizer_state["state"] = OptimizerState.STEPPED + if not self._use_dynamic_loss_scaling: + self._optimizer_states = defaultdict(_refresh_optimizer_state) + def update(self): """ Updates the loss_scaling. diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index cf48151d6d8d1..445937f821950 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -143,24 +143,25 @@ def scale(self, var): The scaled variable or original variable. Examples: + .. code-block:: python - import numpy as np - import paddle.fluid as fluid - - data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - with fluid.dygraph.guard(): - model = fluid.dygraph.Conv2D(3, 2, 3) - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters()) - scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) - data = fluid.dygraph.to_variable(data) - with fluid.dygraph.amp_guard(): - conv = model(data) - loss = fluid.layers.reduce_mean(conv) - scaled = scaler.scale(loss) - scaled.backward() - scaler.minimize(optimizer, scaled) + import numpy as np + import paddle.fluid as fluid + + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) """ check_type(var, "var", core.VarBase, 'AmpScaler.scale()') @@ -184,6 +185,7 @@ def minimize(self, optimizer, *args, **kwargs): kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`. Examples: + .. 
code-block:: python import numpy as np @@ -207,9 +209,6 @@ def minimize(self, optimizer, *args, **kwargs): return optimizer.minimize(*args, **kwargs) optimizer_state = self._optimizer_states[id(optimizer)] - if optimizer_state["state"] is OptimizerState.STEPPED: - raise RuntimeError( - "step() has already been called since the last update().") # unscale the grad if optimizer_state["state"] is OptimizerState.INIT: @@ -226,7 +225,8 @@ def minimize(self, optimizer, *args, **kwargs): if self._use_dynamic_loss_scaling: # uopdate the scale self._update() - self._optimizer_states = defaultdict(_refresh_optimizer_state) + + self._optimizer_states = defaultdict(_refresh_optimizer_state) return optimize_ops, params_grads diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 3c9dc7fc53ec9..e98d8e23f2d3d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -212,6 +212,52 @@ def test_nan_inf(self): self.assertTrue( np.array_equal(param.numpy(), params_init[param.name])) + def test_step_update_exception(self): + def func1(): + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.unscale_(optimizer) + scaler.unscale_(optimizer) + + self.assertRaises(RuntimeError, func1) + + def func2(): + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.unscale_(optimizer) + + self.assertRaises(RuntimeError, func2) + + def func3(): + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.step(optimizer) + + self.assertRaises(RuntimeError, func3) + def test_get_and_set(self): with fluid.dygraph.guard(): scaler = paddle.amp.GradScaler( From c394837a41bbbcba80453579f4855beb48c4807d Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Mon, 20 Sep 2021 11:43:39 +0000 Subject: [PATCH 9/9] add unittest --- .../test_imperative_auto_mixed_precision.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 137161baa4f43..5f1f4a4641168 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -222,6 +222,47 @@ def run_simple_conv(inp_np, use_scaler=True): np.allclose(outs_with_scaler[1][i][0].numpy(), outs_no_scaler[1][i][0].numpy()), True) + def test_step(self): + inp_np = np.random.random(size=[1, 3, 128, 
128]).astype(np.float32) + + def run_simple_conv(inp_np, use_scaler=True): + paddle.seed(10) + paddle.framework.random._manual_program_seed(10) + with fluid.dygraph.guard(): + model = SimpleConv( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(inp_np) + + out = model(data) + loss = fluid.layers.mean(out) + if use_scaler: + print('use scaler') + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + scaler.step(optimizer) + scaler.update() + else: + print('use no scaler') + loss.backward() + optimizer.step() + return optimizer._parameter_list + + outs_with_scaler = run_simple_conv(inp_np, use_scaler=True) + outs_no_scaler = run_simple_conv(inp_np, use_scaler=False) + + for i in range(len(outs_with_scaler)): + # check each parameter + self.assertEqual( + np.allclose(outs_with_scaler[i].numpy(), + outs_no_scaler[i].numpy()), True) + def test_nan_inf(self): inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) inp_np[0][1][2][3] = np.nan