Fix FusedResidualDropoutBias NaN on V100 #42344

Merged
14 changes: 11 additions & 3 deletions paddle/fluid/operators/fused/fused_dropout_common.h
@@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids(
const platform::CUDADeviceContext &ctx, const uint32_t rows,
const uint32_t cols, const int vec_size) {
const uint32_t tmp_cols = cols / vec_size;
-  int threads = std::max(
-      static_cast<uint32_t>(32),
-      std::min(tmp_cols, static_cast<uint32_t>(ctx.GetMaxThreadsPerBlock())));
+  // NOTE(wangxi): We cap max_block_size at 512 because `FusedResidualDropoutBias`
+  // needs a lot of register resources. If the data type is float16 and the
+  // block size is 1024, the launch fails with CUDA error 701,
+  // 'cudaErrorLaunchOutOfResources', which indicates that the launch did not
+  // occur because the kernel requested more resources than were available.
+  // Of course, this kernel can be optimized later to reduce the use
+  // of registers.
+  int threads =
+      std::max(static_cast<uint32_t>(32),
+               std::min(tmp_cols, static_cast<uint32_t>(std::min(
+                                      ctx.GetMaxThreadsPerBlock(), 512))));
const auto blocks_x =
std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
const auto blocks_y = std::max(static_cast<uint32_t>(1), rows);
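
To make the capped launch configuration concrete, here is a minimal standalone sketch of the calculation above. It is not part of the diff: the real code queries `ctx.GetMaxThreadsPerBlock()` from the `CUDADeviceContext` (assumed here to be 1024, as on V100), and the vectorization width of 4 is also an assumption.

#include <algorithm>
#include <cstdint>
#include <cstdio>

struct LaunchConfigSketch {
  uint32_t threads;
  uint32_t blocks_x;
  uint32_t blocks_y;
};

// Mirrors Get1DBlocksAnd2DGrids with the new 512-thread cap.
LaunchConfigSketch CappedLaunchConfig(uint32_t rows, uint32_t cols, int vec_size,
                                      int max_threads_per_block) {
  const uint32_t tmp_cols = cols / vec_size;
  // Cap the block size at 512 so the float16 kernel does not exhaust registers.
  const int capped = std::min(max_threads_per_block, 512);
  const uint32_t threads =
      std::max(static_cast<uint32_t>(32),
               std::min(tmp_cols, static_cast<uint32_t>(capped)));
  const uint32_t blocks_x =
      std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
  const uint32_t blocks_y = std::max(static_cast<uint32_t>(1), rows);
  return {threads, blocks_x, blocks_y};
}

int main() {
  // Shape used by the new fp16 test below: rows = 1, cols = 12288.
  const LaunchConfigSketch cfg = CappedLaunchConfig(1, 12288, 4, 1024);
  // Prints threads=512 blocks_x=6 blocks_y=1; the old formula would pick 1024 threads.
  std::printf("threads=%u blocks_x=%u blocks_y=%u\n", cfg.threads, cfg.blocks_x,
              cfg.blocks_y);
  return 0;
}
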
19 changes: 19 additions & 0 deletions paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
@@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias {
dropout_prob, is_upscale_in_train, is_test);
}
ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
// add residual
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
@@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias {
src.data<T>(), residual.data<T>(), bias_ptr, mask.data<uint8_t>(),
out.data<T>(), *ctx);
ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
}

void FusedBackward() {
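
The two `PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError())` lines added after `ctx->Wait()` make a failed launch fail the test immediately instead of letting bad output surface later. As a rough sketch of what this amounts to in plain CUDA runtime calls (the Paddle macro and `platform::GpuGetLastError` wrap this behavior; `CheckLastKernelLaunch` is a hypothetical helper, not Paddle API):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

inline void CheckLastKernelLaunch(const char *what) {
  // Wait for outstanding GPU work, mirroring ctx->Wait() in the test.
  const cudaError_t sync_err = cudaDeviceSynchronize();
  // cudaGetLastError() reports launch failures such as
  // cudaErrorLaunchOutOfResources (error 701) and clears the error state.
  const cudaError_t launch_err = cudaGetLastError();
  const cudaError_t err = (sync_err != cudaSuccess) ? sync_err : launch_err;
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    std::exit(EXIT_FAILURE);
  }
}
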
@@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) {
test.CheckOut(static_cast<float>(1e-5));
test.CheckGrad(static_cast<float>(1e-3));
}

+TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) {
+  // Used to test that `cudaErrorLaunchOutOfResources` will not occur
+  int rows = 1;
+  int cols = 12288;
+  if (std::getenv("_rows") != nullptr) {
+    rows = atoi(std::getenv("_rows"));
+  }
+  if (std::getenv("_cols") != nullptr) {
+    cols = atoi(std::getenv("_cols"));
+  }
+  TestFusedResidualDropoutBias<platform::float16> test(rows, cols, 0, 0.0, true,
+                                                       true);
+  test.Run();
+  test.CheckOut(static_cast<platform::float16>(1e-1));
+  test.CheckGrad(static_cast<platform::float16>(1e-1));
+}
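
Why this shape reproduces the bug: with cols = 12288 and a typical vectorization width of 4, tmp_cols = 3072, which exceeds the device limit, so the old formula chose min(3072, GetMaxThreadsPerBlock()) = 1024 threads per block on V100 (assuming the usual 1024-thread limit). That block size overflows the register budget of the float16 kernel and fails with cudaErrorLaunchOutOfResources, whereas with the 512 cap the same shape launches 512-thread blocks and the test passes. The `_rows` and `_cols` environment variables allow the shape to be overridden when running the test binary.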