fix FusedResidualDropoutBias nan in v100 (#42344)
wangxicoding authored Apr 28, 2022
1 parent 8ad3870 commit 687219f
Showing 2 changed files with 30 additions and 3 deletions.
14 changes: 11 additions & 3 deletions paddle/fluid/operators/fused/fused_dropout_common.h
@@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids(
     const platform::CUDADeviceContext &ctx, const uint32_t rows,
     const uint32_t cols, const int vec_size) {
   const uint32_t tmp_cols = cols / vec_size;
-  int threads = std::max(
-      static_cast<uint32_t>(32),
-      std::min(tmp_cols, static_cast<uint32_t>(ctx.GetMaxThreadsPerBlock())));
+  // NOTE(wangxi): We cap max_block_size at 512 because
+  // `FusedResidualDropoutBias` needs too many register resources. If the
+  // data type is float16, CUDA error 701 occurs when block_size is 1024.
+  // That error, 'cudaErrorLaunchOutOfResources', indicates that the launch
+  // did not occur because it did not have appropriate resources.
+  // Of course, this kernel can be optimized later to reduce its use
+  // of registers.
+  int threads =
+      std::max(static_cast<uint32_t>(32),
+               std::min(tmp_cols, static_cast<uint32_t>(std::min(
+                                      ctx.GetMaxThreadsPerBlock(), 512))));
   const auto blocks_x =
       std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
   const auto blocks_y = std::max(static_cast<uint32_t>(1), rows);
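Background for the cap: whether a given block size can launch at all depends on the kernel's per-thread register usage, which the CUDA runtime reports per kernel. Below is a minimal, self-contained sketch of how one could query that limit at runtime; `HeavyKernel` is a hypothetical stand-in for the fused kernel, and the clamp to 512 mirrors the patch above rather than any Paddle API.

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for the register-heavy FusedResidualDropoutBias kernel.
__global__ void HeavyKernel(float *out) { out[threadIdx.x] = 0.f; }

int main() {
  cudaFuncAttributes attr;
  // cudaFuncGetAttributes reports per-kernel limits; maxThreadsPerBlock is
  // already lowered when the kernel uses too many registers per thread.
  cudaFuncGetAttributes(&attr, HeavyKernel);
  std::printf("regs/thread: %d, max threads/block: %d\n", attr.numRegs,
              attr.maxThreadsPerBlock);
  // Clamping the block size (the patch uses a fixed 512) avoids
  // cudaErrorLaunchOutOfResources (701) at launch time.
  int threads = std::min(attr.maxThreadsPerBlock, 512);
  std::printf("chosen block size: %d\n", threads);
  return 0;
}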
19 changes: 19 additions & 0 deletions paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
@@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias {
           dropout_prob, is_upscale_in_train, is_test);
     }
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
     // add residual
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < cols; j++) {
@@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias {
         src.data<T>(), residual.data<T>(), bias_ptr, mask.data<uint8_t>(),
         out.data<T>(), *ctx);
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
   }
 
   void FusedBackward() {
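Note on the two `PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError())` checks added above: a kernel launch that fails enqueues no work, so `ctx->Wait()` alone can typically still report success; the launch error is only visible through CUDA's last-error state. A standalone sketch of the same idiom in plain CUDA runtime calls (not Paddle's wrappers); the oversize block here fails with cudaErrorInvalidConfiguration rather than error 701, but the detection pattern is identical.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void Noop() {}

int main() {
  Noop<<<1, 2048>>>();      // 2048 > 1024 per-block limit: launch never happens
  cudaDeviceSynchronize();  // typically still cudaSuccess: nothing was enqueued
  // The launch failure is only visible through the last-error state.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::printf("launch failed: %s (%d)\n", cudaGetErrorString(err), (int)err);
  }
  return 0;
}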
@@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) {
   test.CheckOut(static_cast<float>(1e-5));
   test.CheckGrad(static_cast<float>(1e-3));
 }
+
+TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) {
+  // Tests that `cudaErrorLaunchOutOfResources` does not occur
+  int rows = 1;
+  int cols = 12288;
+  if (std::getenv("_rows") != nullptr) {
+    rows = atoi(std::getenv("_rows"));
+  }
+  if (std::getenv("_cols") != nullptr) {
+    cols = atoi(std::getenv("_cols"));
+  }
+  TestFusedResidualDropoutBias<platform::float16> test(rows, cols, 0, 0.0, true,
+                                                       true);
+  test.Run();
+  test.CheckOut(static_cast<platform::float16>(1e-1));
+  test.CheckGrad(static_cast<platform::float16>(1e-1));
+}
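Why the default shape exercises the bug: assuming the common vectorization width of 4 (an assumption here; the real `vec_size` comes from the caller), `Get1DBlocksAnd2DGrids` sees tmp_cols = 12288 / 4 = 3072, so before the fix it chose the device maximum of 1024 threads (typical for V100), which the float16 kernel cannot launch. The `_rows`/`_cols` environment variables let the shape be overridden when running the test binary. A worked sketch of the capped computation, under those assumptions:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Shape from the new test: rows = 1, cols = 12288; vec_size = 4 assumed.
  const uint32_t cols = 12288;
  const int vec_size = 4;
  const int max_threads_per_block = 1024;  // typical device limit (e.g. V100)
  const uint32_t tmp_cols = cols / vec_size;  // 3072
  // Pre-fix: min(3072, 1024) selects a 1024-thread block, which the float16
  // kernel cannot launch (error 701). Post-fix: capped to 512.
  int threads = std::max(
      static_cast<uint32_t>(32),
      std::min(tmp_cols,
               static_cast<uint32_t>(std::min(max_threads_per_block, 512))));
  std::printf("threads = %d\n", threads);  // prints: threads = 512
  return 0;
}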
