refine amp scaler found_inf (#49864)

* refine _found_inf
wanghuancoder authored Jan 30, 2023
1 parent 320958e commit 382e9a0
Showing 10 changed files with 142 additions and 97 deletions.
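The heart of the change is easiest to see in GradScaler.minimize() and step() below: instead of branching on self._found_inf in Python, which forces a device-to-host copy of the flag tensor every iteration, the scaler now hands the flag to the optimizer through its auxiliary-variable slot, always calls the optimizer, and reads the decision back afterwards. A minimal, self-contained sketch of that handshake (plain-Python stand-ins, not Paddle classes; attribute names copy the diff, including the _cache_founf_inf spelling):

class ToyOptimizer:
    # Stand-in for a Paddle optimizer that exposes the auxiliary-var slot.
    def __init__(self):
        self._aux = {}

    def _set_auxiliary_var(self, key, val):
        self._aux[key] = val

    def _get_auxiliary_var(self, key):
        return self._aux.get(key)

    def step(self):
        # The optimizer, not the scaler, now decides whether to skip the update.
        if self._get_auxiliary_var('found_inf'):
            return
        print("applying parameter update")


class ToyScaler:
    def __init__(self):
        self._found_inf = False       # in Paddle this is a bool tensor on device
        self._cache_founf_inf = None  # spelling kept as in the diff

    def step(self, optimizer):
        optimizer._set_auxiliary_var('found_inf', self._found_inf)
        optimizer.step()
        self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')


scaler, opt = ToyScaler(), ToyOptimizer()
scaler.step(opt)          # prints "applying parameter update"
scaler._found_inf = True
scaler.step(opt)          # update is skipped

In the real code the optimizer skips appending its update ops rather than printing, as the python/paddle/fluid/optimizer.py hunk further down shows.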
42 changes: 25 additions & 17 deletions python/paddle/amp/grad_scaler.py
@@ -18,7 +18,7 @@

import numpy as np

-from paddle import _legacy_C_ops
+from paddle import _C_ops, _legacy_C_ops
from paddle.fluid import core, in_dygraph_mode
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph import to_variable
@@ -228,11 +228,9 @@ def minimize(self, optimizer, *args, **kwargs):

optimize_ops, params_grads = (None, None)

-if self._found_inf:
-    self._cache_founf_inf = True
-else:
-    optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
-    self._cache_founf_inf = False
+optimizer._set_auxiliary_var('found_inf', self._found_inf)
+optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
+self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')

if self._use_dynamic_loss_scaling:
# uopdate the scale
@@ -330,6 +328,9 @@ def _unscale(self, optimizer):
param_grads_fp16,
self._temp_found_inf_fp16,
)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, self._temp_found_inf_fp16
+)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
@@ -338,6 +339,9 @@ def _unscale(self, optimizer):
param_grads_bf16,
self._temp_found_inf_bf16,
)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, self._temp_found_inf_bf16
+)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
@@ -346,6 +350,9 @@ def _unscale(self, optimizer):
param_grads_fp32,
self._temp_found_inf_fp32,
)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, self._temp_found_inf_fp32
+)
else:
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
@@ -354,26 +361,29 @@ def _unscale(self, optimizer):
param_grads_fp16,
self._temp_found_inf_fp16,
)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, self._temp_found_inf_fp16
+)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
self._scale,
param_grads_bf16,
self._temp_found_inf_bf16,
)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, self._temp_found_inf_bf16
+)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
param_grads_fp32,
self._temp_found_inf_fp32,
)

-self._found_inf = (
-    self._temp_found_inf_fp16
-    or self._temp_found_inf_bf16
-    or self._temp_found_inf_fp32
-)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, self._temp_found_inf_fp32
+)

optimizer_state["state"] = OptimizerState.UNSCALED
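The other refinement in _unscale() is how the per-dtype overflow flags are combined. The old code reduced self._temp_found_inf_fp16/bf16/fp32 with a Python "or", which synchronizes each flag tensor to the host; the new code ORs each flag into self._found_inf on device right after the corresponding check_finite_and_unscale call. A rough equivalent using the public API (the flag values are made up for illustration; the diff calls the _C_ops.bitwise_or binding directly):

import paddle

# Hypothetical per-dtype flags, as check_finite_and_unscale would write them.
temp_found_inf_fp16 = paddle.to_tensor([False])
temp_found_inf_bf16 = paddle.to_tensor([True])   # pretend the bf16 grads overflowed
temp_found_inf_fp32 = paddle.to_tensor([False])

# Old: a Python "or" pulls each flag to the host before combining.
# found_inf = temp_found_inf_fp16 or temp_found_inf_bf16 or temp_found_inf_fp32

# New: combine on device; nothing is copied back until the flag is actually read.
found_inf = paddle.to_tensor([False])
for flag in (temp_found_inf_fp16, temp_found_inf_bf16, temp_found_inf_fp32):
    found_inf = paddle.bitwise_or(found_inf, flag)

print(bool(found_inf))  # True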

@@ -761,11 +771,9 @@ def step(self, optimizer):
if optimizer_state["state"] is OptimizerState.INIT:
self._unscale(optimizer)

-if self._found_inf:
-    self._cache_founf_inf = True
-else:
-    optimizer.step()
-    self._cache_founf_inf = False
+optimizer._set_auxiliary_var('found_inf', self._found_inf)
+optimizer.step()
+self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')

optimizer_state["state"] = OptimizerState.STEPPED

(next changed file; file header not captured)
@@ -41,11 +41,9 @@ def minimize(self, optimizer, *args, **kwargs):

optimize_ops, params_grads = (None, None)

-if self._found_inf:
-    self._cache_founf_inf = True
-else:
-    optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
-    self._cache_founf_inf = False
+optimizer._set_auxiliary_var('found_inf', self._found_inf)
+optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
+self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')

if self._use_dynamic_loss_scaling:
self._update()
(next changed file; file header not captured)
@@ -19,10 +19,10 @@
import numpy as np

import paddle
-from paddle import _legacy_C_ops
+from paddle import _C_ops, _legacy_C_ops
from paddle.common_ops_import import dygraph_only
-from paddle.fluid import core
from paddle.fluid.dygraph import to_variable
+from paddle.framework import core
from paddle.nn import clip


@@ -231,22 +231,27 @@ def unscale_method(self, optimizer):
param_grads_fp16,
temp_found_inf_fp16,
)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, temp_found_inf_fp16
+)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
param_grads_fp32,
temp_found_inf_fp32,
)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, temp_found_inf_fp32
+)

-self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
-is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
+self._found_inf = self._found_inf.cast("int32")

paddle.distributed.all_reduce(
-    is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None
+    self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
)

-self._found_inf = is_found_inf.numpy()[0]
+self._found_inf = self._found_inf.cast("bool")

scaler._unscale = MethodType(unscale_method, scaler)
return scaler
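The cross-rank agreement on found_inf also stays on the device now, both here and in python/paddle/distributed/fleet/scaler.py below: the old code converted the flag to a Python int, wrapped it in a fresh int32 tensor, all-reduced that, and read the result back with .numpy()[0]; the new code casts the existing flag tensor to int32, all-reduces it with ReduceOp.MAX, and casts it back to bool. A sketch of the new reduction, assuming it runs inside a job started with paddle.distributed.launch:

import paddle
import paddle.distributed as dist

dist.init_parallel_env()

# This rank's overflow flag (hypothetical value for illustration).
found_inf = paddle.to_tensor([False])

# MAX across ranks: if any rank saw inf/nan, every rank skips this step.
found_inf = found_inf.cast("int32")
dist.all_reduce(found_inf, op=dist.ReduceOp.MAX, group=None)
found_inf = found_inf.cast("bool")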
15 changes: 10 additions & 5 deletions python/paddle/distributed/fleet/scaler.py
@@ -17,7 +17,7 @@
import numpy as np

import paddle
-from paddle import _legacy_C_ops
+from paddle import _C_ops, _legacy_C_ops
from paddle.distributed import fleet
from paddle.fluid.dygraph import to_variable
from paddle.framework import core
@@ -73,24 +73,29 @@ def unscale_method(self, optimizer):
param_grads_fp16,
temp_found_inf_fp16,
)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, temp_found_inf_fp16
+)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
param_grads_fp32,
temp_found_inf_fp32,
)
+self._found_inf = _C_ops.bitwise_or(
+    self._found_inf, temp_found_inf_fp32
+)

-self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
-is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
+self._found_inf = self._found_inf.cast("int32")

# TODO(shenliang03) Since dp allreduce in the optimizer is
# after the gradscaler, check_finite needs to synchronize global
# information. In the future, we should use check_group to speed.
paddle.distributed.all_reduce(
-    is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
+    self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
)
-self._found_inf = is_found_inf.numpy()[0]
+self._found_inf = self._found_inf.cast("bool")

# Only data_parallel doesn't need to modify scaler
fleet_env = fleet.fleet
17 changes: 12 additions & 5 deletions python/paddle/fluid/optimizer.py
@@ -893,11 +893,18 @@ def _create_optimization_pass(self, parameters_and_grads):
self._create_global_learning_rate()

if in_dygraph_mode():
-for param_and_grad in parameters_and_grads:
-    if param_and_grad[1] is None:
-        continue
-    if param_and_grad[0].trainable is True:
-        self._append_optimize_op(target_block, param_and_grad)
+found_inf = self._get_auxiliary_var('found_inf')
+if found_inf:
+    if isinstance(found_inf, core.eager.Tensor):
+        self._set_auxiliary_var('found_inf', True)
+else:
+    if isinstance(found_inf, core.eager.Tensor):
+        self._set_auxiliary_var('found_inf', False)
+    for param_and_grad in parameters_and_grads:
+        if param_and_grad[1] is None:
+            continue
+        if param_and_grad[0].trainable is True:
+            self._append_optimize_op(target_block, param_and_grad)
else:
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
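On the optimizer side the pattern is the mirror image: _create_optimization_pass() above (and the merged-op paths in adam.py and momentum.py below) reads the found_inf auxiliary variable, and when it arrives as an eager Tensor it is materialized into a plain Python bool exactly once; if it is true, the per-parameter update ops are simply not appended, which is presumably why the dygraph adam_/adamw_/lamb_ calls further down now pass None where they used to pass the found_inf tensor. A condensed, hypothetical helper showing the same conversion (materialize_found_inf is not a Paddle function, only an illustration of the idiom):

import paddle

def materialize_found_inf(aux, key="found_inf"):
    # Read the flag once and, if it is a Tensor, replace it with a plain
    # Python bool so later reads of the auxiliary var stay on the host.
    flag = aux.get(key)
    if isinstance(flag, paddle.Tensor):
        aux[key] = bool(flag)  # the single device-to-host sync per step
    return aux.get(key)

aux = {"found_inf": paddle.to_tensor([True])}
if materialize_found_inf(aux):
    print("overflow detected: skip the parameter update this step")
else:
    print("no overflow: append and run the parameter-update ops")
print(aux["found_inf"])  # True, now a plain bool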
41 changes: 23 additions & 18 deletions python/paddle/optimizer/adam.py
@@ -360,8 +360,6 @@ def _append_optimize_op(self, block, param_and_grad):
# create the adam optimize op

if framework.in_dygraph_mode():
-found_inf = self._get_auxiliary_var('found_inf')
-
_beta1 = (
self._beta1
if not isinstance(self._beta1, Variable)
@@ -382,7 +380,7 @@ def _append_optimize_op(self, block, param_and_grad):
beta1_pow_acc,
beta2_pow_acc,
master_weight,
-found_inf,
+None,
_beta1,
_beta2,
self._epsilon,
@@ -693,21 +691,28 @@ def _append_optimize_multi_tensor_op(
if master_weight is not None
else None
)
-_, _, _, _, _, _ = _C_ops.merged_adam_(
-    self._param_dict[key][param_group_idx],
-    grad_dict[key],
-    lr_dict[key],
-    self._moment1_dict[key][param_group_idx],
-    self._moment2_dict[key][param_group_idx],
-    self._beta1_pow_acc_dict[key][param_group_idx],
-    self._beta2_pow_acc_dict[key][param_group_idx],
-    master_weight,
-    _beta1,
-    _beta2,
-    self._epsilon,
-    find_master,
-    False,
-)
+found_inf = self._get_auxiliary_var('found_inf')
+if found_inf:
+    if isinstance(found_inf, core.eager.Tensor):
+        self._set_auxiliary_var('found_inf', True)
+else:
+    if isinstance(found_inf, core.eager.Tensor):
+        self._set_auxiliary_var('found_inf', False)
+    _, _, _, _, _, _ = _C_ops.merged_adam_(
+        self._param_dict[key][param_group_idx],
+        grad_dict[key],
+        lr_dict[key],
+        self._moment1_dict[key][param_group_idx],
+        self._moment2_dict[key][param_group_idx],
+        self._beta1_pow_acc_dict[key][param_group_idx],
+        self._beta2_pow_acc_dict[key][param_group_idx],
+        master_weight,
+        _beta1,
+        _beta2,
+        self._epsilon,
+        find_master,
+        False,
+    )
else:
inputs = {
"Param": self._param_dict[key][param_group_idx],
3 changes: 1 addition & 2 deletions python/paddle/optimizer/adamw.py
@@ -491,7 +491,6 @@ def _append_optimize_op(self, block, param_and_grad):
else self._beta2.numpy().item(0)
)

-found_inf = self._get_auxiliary_var('found_inf')
_, _, _, _, _, _ = _C_ops.adamw_(
param_and_grad[0],
param_and_grad[1],
@@ -501,7 +500,7 @@ def _append_optimize_op(self, block, param_and_grad):
beta1_pow_acc,
beta2_pow_acc,
master_weight,
-found_inf,
+None,
_beta1,
_beta2,
self._epsilon,
4 changes: 2 additions & 2 deletions python/paddle/optimizer/lamb.py
@@ -293,7 +293,6 @@ def _append_optimize_op(self, block, param_and_grad):
self._used_master_weights[p_name] = master_weight.name
else:
master_weight = None
-found_inf = self._get_auxiliary_var('found_inf')

if framework.in_dygraph_mode():
_C_ops.lamb_(
@@ -305,7 +304,7 @@ def _append_optimize_op(self, block, param_and_grad):
beta1_pow_acc,
beta2_pow_acc,
master_weight,
-found_inf,
+None,
weight_decay,
self._beta1,
self._beta2,
@@ -343,6 +342,7 @@ def _append_optimize_op(self, block, param_and_grad):
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight

+found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
inputs["SkipUpdate"] = found_inf

37 changes: 24 additions & 13 deletions python/paddle/optimizer/momentum.py
@@ -530,19 +530,30 @@ def _append_optimize_multi_tensor_op(
)

if in_dygraph_mode():
-_, _, _ = _C_ops.merged_momentum_(
-    self._param_dict[key][param_group_idx],
-    grad_dict[key],
-    self._velocity_dict[key][param_group_idx],
-    lr_dict[key],
-    master_weight,
-    self._momentum,
-    self._use_nesterov,
-    self._regularization_method_dict[key][param_group_idx],
-    self._regularization_coeff_dict[key][param_group_idx],
-    find_master,
-    self._rescale_grad,
-)
+found_inf = self._get_auxiliary_var('found_inf')
+if found_inf:
+    if isinstance(found_inf, core.eager.Tensor):
+        self._set_auxiliary_var('found_inf', True)
+else:
+    if isinstance(found_inf, core.eager.Tensor):
+        self._set_auxiliary_var('found_inf', False)
+    _, _, _ = _C_ops.merged_momentum_(
+        self._param_dict[key][param_group_idx],
+        grad_dict[key],
+        self._velocity_dict[key][param_group_idx],
+        lr_dict[key],
+        master_weight,
+        self._momentum,
+        self._use_nesterov,
+        self._regularization_method_dict[key][
+            param_group_idx
+        ],
+        self._regularization_coeff_dict[key][
+            param_group_idx
+        ],
+        find_master,
+        self._rescale_grad,
+    )
else:
inputs = {
"Param": self._param_dict[key][param_group_idx],
(diff for the remaining changed file was not loaded in this capture)