【PaddlePaddle Hackathon 4】No.56 : add fp16 test and bf16 for bernoulli and trunc #51657
The first changed file is the GPU trunc kernel in the phi namespace. It adds per-dtype `device_trunc` overloads (including float16 and bfloat16) and switches `TruncFunctor` over to them:

```diff
@@ -23,14 +23,39 @@ namespace phi {

 using phi::PADDLE_CUDA_NUM_THREADS;

+template <typename T>
+__device__ T device_trunc(T x);
+
+template <>
+__device__ float device_trunc<float>(float x) {
+  return truncf(x);
+}
+
+template <>
+__device__ double device_trunc<double>(double x) {
+  return trunc(x);
+}
+
+template <>
+__device__ phi::dtype::float16 device_trunc<phi::dtype::float16>(
+    phi::dtype::float16 x) {
+  return static_cast<phi::dtype::float16>(truncf(static_cast<float>(x)));
+}
+
+template <>
+__device__ phi::dtype::bfloat16 device_trunc<phi::dtype::bfloat16>(
+    phi::dtype::bfloat16 x) {
+  return static_cast<phi::dtype::bfloat16>(truncf(static_cast<float>(x)));
+}
+
 template <typename T>
 class TruncFunctor {
  public:
-  __device__ TruncFunctor(const T x) : x_(x) {}
-  __device__ T operator()() { return trunc(x_); }
+  __device__ TruncFunctor(T x) : x_(x) {}
+  __device__ T operator()() { return device_trunc(x_); }

  public:
-  const T x_;
+  T x_;
 };

 template <>
```

Review comment on the `TruncFunctor` change: "I think this could also be computed directly with MPType."
Reply: "OK 👌"
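The MPType idea raised above could look roughly like the sketch below. This is only an illustration, not the code in this PR; it assumes `phi::dtype::MPTypeTrait` from phi's amp_type_traits header, which maps float16/bfloat16 to float and leaves float/double unchanged.

```cuda
// Illustrative sketch only (not the merged code): truncate in MPType and
// cast back, instead of specializing device_trunc for every dtype.
// Assumes phi::dtype::MPTypeTrait<T>::Type is float for float16/bfloat16.
#include "paddle/phi/common/amp_type_traits.h"

template <typename T>
class TruncFunctor {
 public:
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;

  __device__ explicit TruncFunctor(T x) : x_(x) {}

  __device__ T operator()() {
    // Promote to MPType, truncate with the CUDA math overload, cast back.
    return static_cast<T>(trunc(static_cast<MPType>(x_)));
  }

 private:
  T x_;
};
```

The upside of this variant is that the per-dtype `device_trunc` specializations would no longer be needed; the cost is an extra round trip through float for half-precision inputs.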
The second hunk in the same file registers the new dtypes for the GPU trunc kernel:

```diff
@@ -78,5 +103,13 @@ void TruncKernel(const Context& dev_ctx,

 }  // namespace phi

-PD_REGISTER_KERNEL(
-    trunc, GPU, ALL_LAYOUT, phi::TruncKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(trunc,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TruncKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
```
The second changed file is the Python unit test for the bernoulli op:

```diff
@@ -15,9 +15,10 @@
 import unittest

 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16

 import paddle
 from paddle.fluid import core


 def output_hist(out):

@@ -31,10 +32,15 @@ def output_hist(out):
 class TestBernoulliOp(OpTest):
     def setUp(self):
         self.op_type = "bernoulli"
-        self.inputs = {"X": np.random.uniform(size=(1000, 784))}
+        self.inputs = {
+            "X": np.random.uniform(size=(1000, 784)).astype(self.dtype)
+        }
         self.attrs = {}
         self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}

+    def init_dtype(self):
+        self.dtype = np.float32
+
     def test_check_output(self):
         self.check_output_customized(self.verify_output)
```

Review comment on the `self.outputs` line: "For float16 input, the output shouldn't be float32, should it?"
Reply: "OK 👌"

```diff
@@ -98,5 +104,39 @@ def test_fixed_random_number(self):
         paddle.enable_static()


+class TestBernoulliFP16Op(TestBernoulliOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestBernoulliBF16Op(OpTest):
+    def setUp(self):
+        self.python_api = paddle.bernoulli
+        self.op_type = "bernoulli"
+        self.dtype = np.uint16
+        self.init_test_case()
+
+        self.inputs = {'X': convert_float_to_uint16(self.x)}
+        self.attrs = {}
+        self.outputs = {'Out': convert_float_to_uint16(self.out)}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place_customized(self.verify_output, place)
+
+    def init_test_case(self):
+        self.x = np.random.uniform(size=(1000, 784)).astype("float32")
+        self.out = np.zeros((1000, 784)).astype("float32")
+
+    def verify_output(self, outs):
+        hist, prob = output_hist(np.array(outs[0]))
+        np.testing.assert_allclose(hist, prob, rtol=0, atol=0.01)
+

 if __name__ == "__main__":
     unittest.main()
```
Review comment on the bernoulli GPU kernel: "I think you can just use MPType here and cast x_data[idx]:"

```cpp
out_data[idx] = static_cast<T>((&rand.x)[j] <= static_cast<MPType>(x_data[idx]));
```

Reply: "OK 👌"
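For readers without the kernel in front of them, that suggestion amounts to something like the sketch below. It is only an illustration: the kernel name `BernoulliCompareKernel` and the pre-generated `uniform_draws` buffer are made up here, and the real Paddle kernel draws its uniforms with curand in float4 batches inside the loop rather than reading a buffer.

```cuda
// Hypothetical sketch of the MPType suggestion for the bernoulli GPU kernel
// (not the code in this PR): compare the float draw against the probability
// in MPType and only cast the 0/1 result back to T when storing.
#include <cstdint>
#include "paddle/phi/common/amp_type_traits.h"

template <typename T>
__global__ void BernoulliCompareKernel(const T* x_data,
                                       const float* uniform_draws,
                                       T* out_data,
                                       int64_t size) {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (; idx < size; idx += stride) {
    // The comparison runs in MPType (float for float16/bfloat16), so the
    // half-precision probability is promoted rather than the draw demoted.
    out_data[idx] =
        static_cast<T>(uniform_draws[idx] <= static_cast<MPType>(x_data[idx]));
  }
}
```

The point of the suggestion is that only the final 0/1 result touches the reduced-precision type T; the probability comparison itself keeps full float precision.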