Fix DistributedFusedLAMB NaN problem (#46011)
* fix distributed_fused_lamb nan

* remove CUDA_ASSERT
sneaxiy authored Sep 14, 2022
1 parent 65dd828 commit 6833ecf
Showing 1 changed file with 59 additions and 10 deletions.
paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu (+59, -10)
@@ -1193,6 +1193,38 @@ static void PrintAllMinMaxRange(const framework::ExecutionContext &ctx,
   }
 }
 
+template <typename T>
+static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) {
+  if (numel <= 0) return false;
+  cub::TransformInputIterator<bool, IsNanInfFunctor<T>, const T *> iter(
+      x, IsNanInfFunctor<T>());
+  memory::Buffer buffer(dev_ctx.GetPlace());
+  memory::Buffer out(dev_ctx.GetPlace());
+  CubDeviceReduce(iter,
+                  out.Alloc<bool>(1),
+                  numel,
+                  OrFunctor(),
+                  false,
+                  dev_ctx.stream(),
+                  &buffer);
+  bool flag;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(&flag,
+                                            out.Get<bool>(),
+                                            sizeof(flag),
+                                            hipMemcpyDeviceToHost,
+                                            dev_ctx.stream()));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&flag,
+                                             out.Get<bool>(),
+                                             sizeof(flag),
+                                             cudaMemcpyDeviceToHost,
+                                             dev_ctx.stream()));
+#endif
+  dev_ctx.Wait();
+  return flag;
+}
+
 static void CheckHasNanInfGrad(const float *fp32_grad,
                                int fp32_numel,
                                const platform::float16 *fp16_grad,
@@ -1830,17 +1862,11 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
       } else {
         VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place);
       }
-      if (nranks > 1) {
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::ncclAllReduce(fp32_square_grad_norm,
-                                             fp32_square_grad_norm,
-                                             1,
-                                             ncclFloat32,
-                                             ncclSum,
-                                             global_comm,
-                                             stream));
-      }
       // (3) Do ReduceScatter with scale
+      VLOG(1) << "FP32 HasNanInf before all reduce: "
+              << HasNanInf(dev_ctx, fp32_grad, fp32_numel);
+      VLOG(1) << "FP16 HasNanInf before all reduce: "
+              << HasNanInf(dev_ctx, fp16_grad, fp16_numel);
       if (local_shard) {
         if (use_hierarchical_allreduce) {
           NCCLReduceScatterWithScale(
@@ -1916,6 +1942,29 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
             dev_ctx,
             fp16_scale);
       }
+      VLOG(1) << "FP32 HasNanInf after all reduce: "
+              << HasNanInf(dev_ctx, fp32_sum_grad, fp32_numel_each_device);
+      VLOG(1) << "FP16 HasNanInf after all reduce: "
+              << HasNanInf(dev_ctx, fp16_sum_grad, fp16_numel_each_device);
+      CheckHasNanInfGrad(fp32_sum_grad,
+                         fp32_numel_each_device,
+                         fp16_sum_grad,
+                         fp16_numel_each_device,
+                         fp32_square_grad_norm,
+                         stream,
+                         &cub_tmp_buffer);
+      if (num_devices > 1) {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::ncclAllReduce(fp32_square_grad_norm,
+                                             fp32_square_grad_norm,
+                                             1,
+                                             ncclFloat32,
+                                             ncclSum,
+                                             local_comm,
+                                             stream));
+        VLOG(1) << "Grad square norm after all reduce: "
+                << FlattenToString(fp32_square_grad_norm, 1, place);
+      }
       // (4) mark max_global_grad_norm as 0, meaning that clip has been
       // already performed
       max_global_grad_norm = 0;
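
For context on the first hunk: HasNanInf ORs an "is this value NaN or Inf?" predicate over the whole array in one device-side reduction and copies the single-byte result back to the host. The sketch below reproduces that pattern with plain CUB and CUDA runtime calls in place of Paddle's CubDeviceReduce and memory::Buffer wrappers; the names IsNanInfOp, OrOp, and has_nan_inf are invented for this illustration and error checking is omitted, so treat it as a sketch rather than the committed code.

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Predicate applied element-wise through the transform iterator.
struct IsNanInfOp {
  __host__ __device__ bool operator()(float x) const {
    return isnan(x) || isinf(x);
  }
};

// Logical OR used as the reduction operator.
struct OrOp {
  __host__ __device__ bool operator()(bool a, bool b) const { return a || b; }
};

// Returns true if any element of the device array d_x is NaN or Inf.
bool has_nan_inf(const float *d_x, int n, cudaStream_t stream) {
  if (n <= 0) return false;
  cub::TransformInputIterator<bool, IsNanInfOp, const float *> it(d_x, IsNanInfOp());

  bool *d_result = nullptr;
  cudaMalloc(&d_result, sizeof(bool));

  // CUB's usual two-call convention: the first call only reports the
  // temporary-storage size, the second performs the OR reduction.
  void *d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, it, d_result, n, OrOp(), false, stream);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, it, d_result, n, OrOp(), false, stream);

  bool flag = false;
  cudaMemcpyAsync(&flag, d_result, sizeof(bool), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // the committed code waits via dev_ctx.Wait()

  cudaFree(d_temp);
  cudaFree(d_result);
  return flag;
}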
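
The last hunk recomputes fp32_square_grad_norm from the reduce-scattered gradients via CheckHasNanInfGrad and then sums it across the devices of local_comm, in place of the removed pre-scatter all-reduce over global_comm. Below is a minimal sketch of that final step, assuming a one-element device buffer holding each device's squared norm of its gradient shard; the wrapper name allreduce_squared_grad_norm is only for this sketch.

#include <cuda_runtime.h>
#include <nccl.h>

// Sum the per-device squared gradient norms in place over the local
// communicator so that every rank sees the same global value.
void allreduce_squared_grad_norm(float *d_square_grad_norm,  // device buffer, 1 element
                                 int num_devices,
                                 ncclComm_t local_comm,
                                 cudaStream_t stream) {
  if (num_devices <= 1) return;  // single device: the local value is already global
  ncclAllReduce(d_square_grad_norm, d_square_grad_norm, /*count=*/1, ncclFloat32,
                ncclSum, local_comm, stream);  // NCCL return-code check omitted
}

Because squared norms are non-negative, the sum becomes non-finite as soon as any shard contributes a NaN or Inf, so every rank ends up observing the same invalid value.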