[NPU] avoid tensor copy in check_finite_and_scale (#33244)
zhiqiu authored Jun 4, 2021
1 parent 53d3f5e commit d523dff
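The change is easier to review with the op's contract in mind. Below is a minimal CPU-side sketch of the reference semantics spelled out in the kernel's NOTE (out = in when found_inf is true, out = in/scale otherwise), where found_inf means some input contains inf/nan. The function name and signature are illustrative, not Paddle's API; in the real kernel, scale arrives as a device tensor, its inverse is computed on the device, and overflow is detected via the NPU float-status register rather than by scanning elements.

#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

// Reference semantics of check_finite_and_unscale (illustrative only).
void CheckFiniteAndUnscaleRef(const std::vector<std::vector<float>>& xs,
                              float scale, bool* found_inf,
                              std::vector<std::vector<float>>* outs) {
  // found_inf: does any input tensor contain inf/nan?
  *found_inf = false;
  for (const auto& x : xs) {
    for (float v : x) {
      if (!std::isfinite(v)) *found_inf = true;
    }
  }
  const float inverse_scale = 1.0f / scale;
  outs->clear();
  for (const auto& x : xs) {
    std::vector<float> out;
    out.reserve(x.size());
    for (float v : x) {
      // out = in        if found_inf
      // out = in/scale  otherwise
      out.push_back(*found_inf ? v : v * inverse_scale);
    }
    outs->push_back(std::move(out));
  }
}

int main() {
  std::vector<std::vector<float>> xs = {{2.0f, 4.0f}}, outs;
  bool found_inf = false;
  CheckFiniteAndUnscaleRef(xs, /*scale=*/2.0f, &found_inf, &outs);
  std::printf("found_inf=%d out0=%.1f\n", found_inf, outs[0][0]);  // 0 1.0
  return 0;
}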
46 changes: 15 additions & 31 deletions paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
@@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
 
     found_inf->mutable_data<bool>(ctx.GetPlace());
 
-    bool found_inf_data = false;
-
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    // step1: inverse scale(RealDiv)
+    // step1: inverse scale
     Tensor const_tensor;
     const_tensor.mutable_data<T>({1}, ctx.GetPlace());
     FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));
@@ -66,7 +64,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
     // NOTE(zhiqiu):
     Tensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
-
     // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place.
     // tmp is only placeholder.
     const auto& runner_float_status =
@@ -81,39 +78,26 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
                     {{"axes", std::vector<int>{0}}, {"keep_dims", true}});
     runner_reduce_sum.Run(stream);
 
-    std::vector<float> sum_vec;
-    TensorToVector(
-        sum, ctx.template device_context<paddle::platform::NPUDeviceContext>(),
-        &sum_vec);
-    found_inf_data = (sum_vec[0] > 1);
-
-    VLOG(4) << "found_inf_data:" << found_inf_data;
-
+    const auto& runner_greater =
+        NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {});
+    runner_greater.Run(stream);
+
+    // NOTE(zhiqiu): The normal logic is :
+    // out = in, if found_inf = true
+    // out = in/scale, if found_inf = false
+    // However, on NPU, in order to avoid stream sync, we do not copy the
+    // found_inf data to cpu to check whether to unscale or not.
+    // Instead, we do the Mul no matter found_inf or not.
+    // And, a fact is, only few steps contains nan/inf during training.
     for (size_t i = 0; i < xs.size(); ++i) {
       const auto* x = xs[i];
       auto* out = outs[i];
       out->mutable_data<T>(ctx.GetPlace());
-      if (!found_inf_data) {
-        // MatMul
-        const auto& runner_matmul =
-            NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
-        runner_matmul.Run(stream);
-      }
+      const auto& runner_mul =
+          NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
+      runner_mul.Run(stream);
     }
 
-    // set found_inf to true
-    VLOG(4) << "found overflow:" << found_inf_data;
-    Tensor found_inf_tensor;
-    found_inf_tensor.Resize({1});
-    bool* is_found_inf =
-        found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
-    *is_found_inf = found_inf_data;
-
-    framework::TensorCopy(
-        found_inf_tensor, ctx.GetPlace(),
-        ctx.template device_context<platform::DeviceContext>(), found_inf);
-    ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-
     const auto& runner_clear_status =
         NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
     runner_clear_status.Run(stream);
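The tensor copy named in the commit title is the TensorToVector/TensorCopy pair deleted above: the status sum was brought to the host just to decide whether to run Mul, and the resulting bool was built on the CPU, copied back into found_inf, and followed by an explicit Wait(). The sketch below is a CPU analogy of the before/after control flow; plain vectors stand in for device tensors, and both helper names are hypothetical, not Paddle APIs.

#include <cstdio>
#include <vector>

// Before: the scalar status sum is copied to the host (TensorToVector
// implies a stream sync) just to decide whether to run Mul, and the
// CPU-side bool is then copied back to the device.
void UnscaleWithHostSync(const std::vector<float>& x, float sum,
                         float inverse_scale, bool* found_inf,
                         std::vector<float>* out) {
  float sum_on_host = sum;          // device->host copy; host blocks here
  bool found_inf_data = sum_on_host > 1.0f;
  *found_inf = found_inf_data;      // CPU tensor copied back to the device
  *out = x;
  if (!found_inf_data) {
    for (float& v : *out) v *= inverse_scale;  // Mul only on clean steps
  }
}

// After: found_inf = GreaterEqual(sum, 1.0) stays on the device and Mul
// runs unconditionally, so nothing round-trips through the host.
void UnscaleDeviceOnly(const std::vector<float>& x, float sum,
                       float inverse_scale, bool* found_inf,
                       std::vector<float>* out) {
  *found_inf = sum >= 1.0f;         // computed on the device
  *out = x;
  for (float& v : *out) v *= inverse_scale;    // Mul no matter found_inf
}

int main() {
  std::vector<float> x = {2.0f, 4.0f}, out;
  bool found_inf = false;
  UnscaleDeviceOnly(x, /*sum=*/0.0f, /*inverse_scale=*/0.5f, &found_inf, &out);
  std::printf("found_inf=%d out0=%.1f\n", found_inf, out[0]);  // 0 1.0
  return 0;
}

The trade-off is the one the new NOTE states: Mul now also runs on overflow steps, where the reference behavior would pass inputs through, and since few steps contain nan/inf during training, an occasional extra Mul is cheaper than a stream sync on every iteration.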
