【PaddlePaddle Hackathon 4 No.49】: Add float16 data type support for Paddle bce_loss #51401

Closed
wants to merge 11 commits
14 changes: 14 additions & 0 deletions paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu
@@ -21,6 +21,8 @@
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"

namespace phi {

@@ -37,6 +39,11 @@ struct BCELossGradFunctor {
HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const {
T term1 = max((one - x) * x, eps);
return (dout * (x - label) / term1);
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
MT x_mt = static_cast<MT>(x);
MT term1 = max((static_cast<MT>(one) - x_mt) * x_mt, static_cast<MT>(eps));
return static_cast<T>(static_cast<MT>(dout) *
(x_mt - static_cast<MT>(label)) / term1);
}
};

@@ -57,3 +64,10 @@ void BCELossGradKernel(const Context& dev_ctx,

PD_REGISTER_KERNEL(
bce_loss_grad, GPU, ALL_LAYOUT, phi::BCELossGradKernel, float, double) {}
PD_REGISTER_KERNEL(bce_loss_grad,
GPU,
ALL_LAYOUT,
phi::BCELossGradKernel,
float,
double,
phi::dtype::float16) {}
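
Note on the cast pattern above: phi::dtype::MPTypeTrait maps float16 to float, so the grad functor performs the subtraction, multiplication, and division in float32 and only casts the final result back to float16. This matters because eps (a small constant held by the functor; its exact value is not shown in this excerpt) would underflow to zero in float16 and make the division unstable. A rough NumPy sketch of the intended numerics, with the eps value assumed for illustration:

import numpy as np

def bce_loss_grad_ref(x, label, dout, eps=1e-12):
    # Promote fp16 inputs to fp32 before the arithmetic, mirroring the
    # MPTypeTrait cast in the CUDA functor, then cast the result back.
    x32 = x.astype(np.float32)
    label32 = label.astype(np.float32)
    dout32 = dout.astype(np.float32)
    term1 = np.maximum((1.0 - x32) * x32, eps)
    return (dout32 * (x32 - label32) / term1).astype(x.dtype)
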
18 changes: 18 additions & 0 deletions paddle/phi/kernels/gpu/bce_loss_kernel.cu
@@ -22,6 +22,8 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/primitive/functor_primitives.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"

namespace phi {

@@ -43,6 +45,15 @@ struct BCELossFunctor {
T term1 = max(phi::kps::details::Log(x), neg_100);
T term2 = max(phi::kps::details::Log(one - x), neg_100);
return (((label - one) * term2) - (label * term1));
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
MT term1 = max(phi::kps::details::Log(static_cast<MT>(x)),
static_cast<MT>(neg_100));
MT term2 =
max(phi::kps::details::Log(static_cast<MT>(one) - static_cast<MT>(x)),
static_cast<MT>(neg_100));
return static_cast<T>(
((static_cast<MT>(label) - static_cast<MT>(one)) * term2) -
(static_cast<MT>(label) * term1));
}
};

@@ -62,3 +73,10 @@ void BCELossKernel(const Context& dev_ctx,

PD_REGISTER_KERNEL(
bce_loss, GPU, ALL_LAYOUT, phi::BCELossKernel, float, double) {}
PD_REGISTER_KERNEL(bce_loss,
GPU,
ALL_LAYOUT,
phi::BCELossKernel,
float,
double,
phi::dtype::float16) {}
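
The forward functor follows the same promote-then-cast-back pattern: both log terms are evaluated in float32 and clamped at -100, as the existing float/double path already does, and only the final loss value is cast back to float16. A hedged NumPy reference of that formula:

import numpy as np

def bce_loss_ref(x, label):
    # Compute in fp32, clamp the log terms at -100, cast back to the input dtype.
    x32 = x.astype(np.float32)
    label32 = label.astype(np.float32)
    term1 = np.maximum(np.log(x32), -100.0)
    term2 = np.maximum(np.log(1.0 - x32), -100.0)
    return (((label32 - 1.0) * term2) - (label32 * term1)).astype(x.dtype)
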
9 changes: 9 additions & 0 deletions paddle/phi/kernels/gpu/maxout_grad_kernel.cu
@@ -15,5 +15,14 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h"


PD_REGISTER_KERNEL(
maxout_grad, GPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {}

PD_REGISTER_KERNEL(maxout_grad,
GPU,
ALL_LAYOUT,
phi::MaxOutGradKernel,
float,
double,
phi::dtype::float16) {}
9 changes: 9 additions & 0 deletions paddle/phi/kernels/gpu/maxout_kernel.cu
@@ -15,4 +15,13 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/maxout_kernel_impl.h"


PD_REGISTER_KERNEL(maxout, GPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {}

PD_REGISTER_KERNEL(maxout,
GPU,
ALL_LAYOUT,
phi::MaxOutKernel,
float,
double,
phi::dtype::float16) {}
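
With the two registrations above, the GPU maxout forward and backward kernels accept float16 tensors. A minimal usage sketch, assuming a CUDA build of Paddle that includes this patch and a device with float16 support:

import paddle

paddle.set_device("gpu")
# 6 input channels with groups=2 -> 3 output channels along axis 1.
x = paddle.rand([2, 6, 5, 4]).astype("float16")
maxout = paddle.nn.Maxout(groups=2, axis=1)
y = maxout(x)  # expected shape [2, 3, 5, 4], dtype float16
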
6 changes: 6 additions & 0 deletions paddle/phi/kernels/impl/maxout_grad_kernel_impl.h
@@ -17,6 +17,7 @@
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/maxouting.h"
#include "paddle/phi/kernels/maxout_grad_kernel.h"
#include "paddle/phi/common/amp_type_traits.h"

namespace phi {

@@ -33,10 +34,15 @@ void MaxOutGradKernel(const Context& dev_ctx,
}

phi::funcs::SetConstant<Context, T> zero;
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
phi::funcs::SetConstant<Context, MPType> zero;
if (x_grad) {
dev_ctx.template Alloc<T>(x_grad);
zero(dev_ctx, x_grad, static_cast<T>(0.0));
phi::funcs::MaxOutGradFunctor<Context, T> maxout_backward;
dev_ctx.template Alloc<MPType>(x_grad);
zero(dev_ctx, x_grad, static_cast<MPType>(0.0));
phi::funcs::MaxOutGradFunctor<Context, MPType> maxout_backward;
maxout_backward(dev_ctx, x, x_grad, out, out_grad, groups, axis);
}
}
3 changes: 3 additions & 0 deletions paddle/phi/kernels/impl/maxout_kernel_impl.h
@@ -16,6 +16,7 @@

#include "paddle/phi/kernels/funcs/maxouting.h"
#include "paddle/phi/kernels/maxout_kernel.h"
#include "paddle/phi/common/amp_type_traits.h"

namespace phi {

@@ -30,6 +31,8 @@ void MaxOutKernel(const Context& dev_ctx,
}

phi::funcs::MaxOutFunctor<Context, T> maxout_forward;
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
phi::funcs::MaxOutFunctor<Context, MPType> maxout_forward;
maxout_forward(dev_ctx, x, out, groups, axis);
}

2 changes: 2 additions & 0 deletions paddle/phi/kernels/maxout_grad_kernel.h
@@ -14,6 +14,8 @@

#pragma once

#include "paddle/phi/core/device_context.h"

#include "paddle/phi/core/dense_tensor.h"

namespace phi {
2 changes: 2 additions & 0 deletions paddle/phi/kernels/maxout_kernel.h
@@ -14,6 +14,8 @@

#pragma once

#include "paddle/phi/core/device_context.h"

#include "paddle/phi/core/dense_tensor.h"

namespace phi {
43 changes: 43 additions & 0 deletions python/paddle/fluid/tests/unittests/test_bce_loss.py
@@ -19,6 +19,7 @@

import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core


def test_static_layer(
@@ -278,6 +279,48 @@ class TestBceLossOpCase2(OpTest):
def init_test_cast(self):
self.shape = [2, 3, 20]

@unittest.skipIf(
not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestBceLossOpFP16(OpTest):
def setUp(self):
self.init_test_case()
self.op_type = "bce_loss"
self.python_api = bce_wrapper
input_np = np.random.uniform(0.1, 0.8, self.shape).astype("float16")
label_np = np.random.randint(0, 2, self.shape).astype("float16")
output_np = bce_loss(input_np, label_np)

self.inputs = {'X': input_np, 'Label': label_np}
self.outputs = {'Out': output_np}

def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)

def test_check_grad(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.5
)

def init_test_case(self):
self.shape = [10, 10]


class TestBceLossOpFP16Case1(OpTest):
def init_test_cast(self):
self.shape = [20, 30, 40, 50]


class TestBceLossOpFP16Case2(OpTest):
def init_test_cast(self):
self.shape = [2, 3, 20]



if __name__ == "__main__":
paddle.enable_static()
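
The FP16 test cases above mirror what a user-level float16 call would exercise. A minimal usage sketch, assuming a CUDA device where float16 is supported:

import paddle
import paddle.nn.functional as F

paddle.set_device("gpu")
x = paddle.uniform([10, 10], min=0.1, max=0.8).astype("float16")
label = paddle.randint(0, 2, [10, 10]).astype("float16")
loss = F.binary_cross_entropy(x, label, reduction="none")  # float16 loss tensor
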
28 changes: 28 additions & 0 deletions python/paddle/fluid/tests/unittests/test_maxout_op.py
@@ -135,6 +135,34 @@ def test_errors(self):
x_float32 = paddle.fluid.data(name='x_float32', shape=[2, 4, 6, 8])
self.assertRaises(ValueError, F.maxout, x_float32, 2, 2)

@unittest.skipIf(
not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestMaxOutOpFP16(OpTest):
def setUp(self):
self.op_type = "maxout"
self.python_api = paddle.nn.Maxout
input_np = np.random.uniform(-1, 1, [2, 6, 5, 4]).astype(np.float16)
self.groups = 2
self.axis = 1
output_np = maxout_forward_naive(input_np, self.groups, self.axis)
self.attrs = {'groups': self.groups, 'axis': self.axis}
self.inputs = {'X': input_np}
self.outputs = {'Out': output_np}

def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)

def test_check_grad(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.5
)


if __name__ == '__main__':
unittest.main()
1 change: 1 addition & 0 deletions python/paddle/nn/layer/loss.py
@@ -69,6 +69,7 @@ class BCEWithLogitsLoss(Layer):
weight (Tensor, optional): A manual rescaling weight given to the loss of each
batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`,
The data type is float32, float64. Default is ``'None'``.
The data type is float16, float32, float64. Default is ``'None'``.
reduction (str, optional): Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned;