From a1d05222f4baa3dffa4c8f6eec531f4745e86b9e Mon Sep 17 00:00:00 2001 From: loneranger <836253168@qq.com> Date: Tue, 14 Mar 2023 21:29:28 +0800 Subject: [PATCH 01/22] add fp16 and bf16 support for bernoulli --- paddle/phi/kernels/gpu/bernoulli_kernel.cu | 11 +++++++-- .../tests/unittests/test_bernoulli_op.py | 23 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index edcf29e2d88d3..7847a9ce371d1 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -26,6 +26,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" @@ -85,5 +86,11 @@ void BernoulliKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL( - bernoulli, GPU, ALL_LAYOUT, phi::BernoulliKernel, float, double) {} +PD_REGISTER_KERNEL(bernoulli, + GPU, + ALL_LAYOUT, + phi::BernoulliKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py index 33a450310d1ef..3e26a3e32bf95 100644 --- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py +++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py @@ -18,6 +18,7 @@ from eager_op_test import OpTest import paddle +import paddle.fluid.core as core def output_hist(out): @@ -98,5 +99,27 @@ def test_fixed_random_number(self): paddle.enable_static() +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the float16", +) +class TestBernoulliFP16OP(OpTest): + def setUp(self): + self.op_type = "bernoulli" + self.python_api = paddle.bernoulli + self.dtype = np.float16 + self.__class__.op_type = self.op_type + x = np.random.uniform(size=(1000, 784)).astype(np.float32) + out = np.zeros((1000, 784)).astype(np.float32) + self.inputs = {"X": x.astype(self.dtype)} + self.attrs = {} + self.outputs = {"Out": out} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3, check_eager=False) + + if __name__ == "__main__": unittest.main() From f6455e78b5678cf7958e1fcd8efb4cc5c069e48a Mon Sep 17 00:00:00 2001 From: loneranger <836253168@qq.com> Date: Tue, 14 Mar 2023 21:43:43 +0800 Subject: [PATCH 02/22] add fp16 and bf16 support for trunc --- paddle/phi/kernels/gpu/trunc_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/trunc_kernel.cu | 13 +++- paddle/phi/kernels/trunc_grad_kernel.h | 1 + paddle/phi/kernels/trunc_kernel.h | 1 + .../fluid/tests/unittests/test_trunc_op.py | 65 ++++++++++++++++++- 5 files changed, 81 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu index 8a88383e6e4f0..a3f7ae556951c 100644 --- a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu @@ -17,6 +17,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -52,4 +53,6 @@ 
PD_REGISTER_KERNEL(trunc_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu index dfc4f6589e9cf..f93ab9fbd3528 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -17,6 +17,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -78,5 +79,13 @@ void TruncKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - trunc, GPU, ALL_LAYOUT, phi::TruncKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(trunc, + GPU, + ALL_LAYOUT, + phi::TruncKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/trunc_grad_kernel.h b/paddle/phi/kernels/trunc_grad_kernel.h index f3f8032d3a23c..ad20fd59fc37f 100644 --- a/paddle/phi/kernels/trunc_grad_kernel.h +++ b/paddle/phi/kernels/trunc_grad_kernel.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/paddle/phi/kernels/trunc_kernel.h b/paddle/phi/kernels/trunc_kernel.h index d9a7ea6339348..13acab7582dba 100644 --- a/paddle/phi/kernels/trunc_kernel.h +++ b/paddle/phi/kernels/trunc_kernel.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py index db45b36b56302..181dcb68c0f78 100644 --- a/python/paddle/fluid/tests/unittests/test_trunc_op.py +++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py @@ -15,9 +15,10 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle +import paddle.fluid.core as core paddle.enable_static() @@ -90,5 +91,67 @@ def test_errors(self): self.assertRaises(TypeError, paddle.trunc, x) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the float16", +) +class TestTruncFP16OP(OpTest): + def setUp(self): + self.op_type = "trunc" + self.python_api = paddle.trunc + self.init_dtype_type() + self.__class__.op_type = self.op_type + np.random.seed(2021) + x = np.random.random((20, 20)).astype(np.float32) + out = np.trunc(x) + self.inputs = {'X': x.astype(self.dtype)} + self.outputs = {'Out': out} + + def init_dtype_type(self): + self.dtype = np.float16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3, check_eager=False) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=1e-2, check_eager=True + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestTruncBF16(OpTest): + def setUp(self): + self.op_type = "trunc" + self.python_api = paddle.trunc + self.init_dtype_type() + self.__class__.op_type = self.op_type + np.random.seed(2021) + x = np.random.random((20, 20)).astype(np.float32) + out = np.trunc(x) + self.inputs = 
{'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def init_dtype_type(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3, check_eager=False) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=1e-2, check_eager=True + ) + + if __name__ == "__main__": unittest.main() From 99f58543197a5f015438ecb519ff1610289b974f Mon Sep 17 00:00:00 2001 From: longranger2 <836253168@qq.com> Date: Wed, 22 Mar 2023 19:22:02 +0800 Subject: [PATCH 03/22] fix bug --- paddle/phi/kernels/gpu/bernoulli_kernel.cu | 1 - paddle/phi/kernels/gpu/trunc_grad_kernel.cu | 1 - paddle/phi/kernels/gpu/trunc_kernel.cu | 1 - paddle/phi/kernels/trunc_grad_kernel.h | 1 - paddle/phi/kernels/trunc_kernel.h | 1 - .../tests/unittests/test_bernoulli_op.py | 29 ++++---------- .../fluid/tests/unittests/test_trunc_op.py | 39 +++---------------- 7 files changed, 13 insertions(+), 60 deletions(-) diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index 7847a9ce371d1..3905378694e17 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -26,7 +26,6 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" diff --git a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu index a3f7ae556951c..40e1404cd900d 100644 --- a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu @@ -17,7 +17,6 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu index f93ab9fbd3528..09f9c804a1d1b 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -17,7 +17,6 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/trunc_grad_kernel.h b/paddle/phi/kernels/trunc_grad_kernel.h index ad20fd59fc37f..f3f8032d3a23c 100644 --- a/paddle/phi/kernels/trunc_grad_kernel.h +++ b/paddle/phi/kernels/trunc_grad_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/paddle/phi/kernels/trunc_kernel.h b/paddle/phi/kernels/trunc_kernel.h index 13acab7582dba..d9a7ea6339348 100644 --- a/paddle/phi/kernels/trunc_kernel.h +++ b/paddle/phi/kernels/trunc_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py index 3e26a3e32bf95..c1c734d389571 100644 --- 
a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py +++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py @@ -18,7 +18,6 @@ from eager_op_test import OpTest import paddle -import paddle.fluid.core as core def output_hist(out): @@ -32,10 +31,15 @@ def output_hist(out): class TestBernoulliOp(OpTest): def setUp(self): self.op_type = "bernoulli" - self.inputs = {"X": np.random.uniform(size=(1000, 784))} + self.inputs = { + "X": np.random.uniform(size=(1000, 784)).astype(self.dtype) + } self.attrs = {} self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} + def init_dtype(self): + self.dtype = np.float32 + def test_check_output(self): self.check_output_customized(self.verify_output) @@ -99,26 +103,9 @@ def test_fixed_random_number(self): paddle.enable_static() -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_float16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the float16", -) -class TestBernoulliFP16OP(OpTest): - def setUp(self): - self.op_type = "bernoulli" - self.python_api = paddle.bernoulli +class TestBernoulliFP16OP(TestBernoulliOp): + def init_dtype(self): self.dtype = np.float16 - self.__class__.op_type = self.op_type - x = np.random.uniform(size=(1000, 784)).astype(np.float32) - out = np.zeros((1000, 784)).astype(np.float32) - self.inputs = {"X": x.astype(self.dtype)} - self.attrs = {} - self.outputs = {"Out": out} - - def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3, check_eager=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py index 525aecdffaff3..4e882de91dfb0 100644 --- a/python/paddle/fluid/tests/unittests/test_trunc_op.py +++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py @@ -91,50 +91,23 @@ def test_errors(self): self.assertRaises(TypeError, paddle.trunc, x) -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_float16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the float16", -) -class TestTruncFP16OP(OpTest): - def setUp(self): - self.op_type = "trunc" - self.python_api = paddle.trunc - self.init_dtype_type() - self.__class__.op_type = self.op_type - np.random.seed(2021) - x = np.random.random((20, 20)).astype(np.float32) - out = np.trunc(x) - self.inputs = {'X': x.astype(self.dtype)} - self.outputs = {'Out': out} - +class TestTruncFP16OP(TestTruncOp): def init_dtype_type(self): self.dtype = np.float16 - def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3, check_eager=False) - - def test_check_grad(self): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=1e-2, check_eager=True - ) - @unittest.skipIf( not core.is_compiled_with_cuda() or not core.is_bfloat16_supported(core.CUDAPlace(0)), "core is not complied with CUDA and not support the bfloat16", ) -class TestTruncBF16(OpTest): +class TestTruncBF16OP(OpTest): def setUp(self): self.op_type = "trunc" self.python_api = paddle.trunc self.init_dtype_type() - self.__class__.op_type = self.op_type np.random.seed(2021) - x = np.random.random((20, 20)).astype(np.float32) + x = np.random.random((20, 20)).astype(np.float64) out = np.trunc(x) self.inputs = {'X': convert_float_to_uint16(x)} self.outputs = {'Out': convert_float_to_uint16(out)} @@ -144,13 +117,11 @@ def init_dtype_type(self): def test_check_output(self): place = 
core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=1e-3, check_eager=False)
+        self.check_output_with_place(place)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', max_relative_error=1e-2, check_eager=True
-        )
+        self.check_grad_with_place(place, ['X'], 'Out')
 
 
 if __name__ == "__main__":

From dce175459ca26bc81cd83b95a67bd790e33f0f4b Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Mon, 3 Apr 2023 20:28:04 +0800
Subject: [PATCH 04/22] fix bug

---
 .../tests/unittests/test_bernoulli_op.py      | 30 +++++++++++++++++--
 .../fluid/tests/unittests/test_trunc_op.py    |  2 +-
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
index c1c734d389571..fe2a4491582fa 100644
--- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -15,9 +15,10 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 
 import paddle
+from paddle.fluid import core
 
 
 def output_hist(out):
@@ -103,10 +104,35 @@ def test_fixed_random_number(self):
         paddle.enable_static()
 
 
-class TestBernoulliFP16OP(TestBernoulliOp):
+class TestBernoulliFP16Op(TestBernoulliOp):
     def init_dtype(self):
         self.dtype = np.float16
 
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the bfloat16",
+)
+class TestBernoulliBF16Op(OpTest):
+    def setUp(self):
+        self.python_api = paddle.bernoulli
+        self.op_type = "bernoulli"
+        self.dtype = np.uint16
+        self.init_test_case()
+
+        self.inputs = {'X': convert_float_to_uint16(self.x)}
+        self.attrs = {}
+        self.outputs = {'Out': convert_float_to_uint16(self.out)}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def init_test_case(self):
+        self.x = np.random.uniform(size=(1000, 784)).astype("float32")
+        self.out = np.zeros((1000, 784)).astype("float32")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py
index e10ae279ac589..2f34c5577b5e5 100644
--- a/python/paddle/fluid/tests/unittests/test_trunc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py
@@ -18,7 +18,7 @@
 from eager_op_test import OpTest, convert_float_to_uint16
 
 import paddle
-import paddle.fluid.core as core
+from paddle.fluid import core
 
 paddle.enable_static()
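
Note on PATCH 04: OpTest has no native bfloat16 dtype, so TestBernoulliBF16Op
feeds the operator uint16 arrays that hold raw bfloat16 bits, produced by
eager_op_test's convert_float_to_uint16. A minimal NumPy sketch of that
encoding (a truncating float32 -> bfloat16 conversion is assumed here; the
real helper may also round):

    import numpy as np

    def float_to_bf16_bits(x):
        # bfloat16 is the upper half of a float32: 1 sign bit, 8 exponent
        # bits, 7 mantissa bits. Dropping the low 16 bits truncates.
        x = np.asarray(x, dtype=np.float32)
        return (x.view(np.uint32) >> 16).astype(np.uint16)

    def bf16_bits_to_float(bits):
        # Re-expand the 16 stored bits into a float32 whose low mantissa
        # bits are zero.
        bits = np.asarray(bits, dtype=np.uint16)
        return (bits.astype(np.uint32) << 16).view(np.float32)
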
From b1771ebd4039f1b23fa80566f921aac527d40462 Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Sat, 22 Apr 2023 05:48:31 +0000
Subject: [PATCH 05/22] fix bug

---
 paddle/phi/kernels/gpu/bernoulli_kernel.cu | 31 +++++++++++++++++++---
 paddle/phi/kernels/gpu/trunc_kernel.cu     |  8 +++++-
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index 3905378694e17..e0a9d3750d3fc 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -32,6 +32,29 @@
 
 namespace phi {
 
+template <typename T>
+__device__ T convert_to_T(float rand_value, T x_value);
+
+template <>
+__device__ phi::dtype::float16 convert_to_T(float rand_value, phi::dtype::float16 x_value) {
+  return static_cast<phi::dtype::float16>(rand_value <= static_cast<float>(x_value));
+}
+
+template <>
+__device__ phi::dtype::bfloat16 convert_to_T(float rand_value, phi::dtype::bfloat16 x_value) {
+  return static_cast<phi::dtype::bfloat16>(rand_value <= static_cast<float>(x_value));
+}
+
+template <>
+__device__ float convert_to_T(float rand_value, float x_value) {
+  return static_cast<float>(rand_value <= x_value);
+}
+
+template <>
+__device__ double convert_to_T(float rand_value, double x_value) {
+  return static_cast<double>(rand_value <= x_value);
+}
+
 // 'curand_uniform4/hiprand_uniform4' generate 4 random number each time
 template <typename T>
 __global__ void bernoulli_cuda_kernel(
@@ -55,7 +78,7 @@ __global__ void bernoulli_cuda_kernel(
     for (size_t j = 0; j < 4; j++) {
       size_t idx = i + j;
       if (idx < size) {
-        out_data[idx] = static_cast<T>((&rand.x)[j] <= x_data[idx]);
+        out_data[idx] = convert_to_T((&rand.x)[j], x_data[idx]);
       }
     }
   }
@@ -89,7 +112,7 @@ PD_REGISTER_KERNEL(bernoulli,
                    GPU,
                    ALL_LAYOUT,
                    phi::BernoulliKernel,
-                   float,
-                   double,
                    phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
+                   phi::dtype::bfloat16,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu
index 09f9c804a1d1b..449fe9bf400c8 100644
--- a/paddle/phi/kernels/gpu/trunc_kernel.cu
+++ b/paddle/phi/kernels/gpu/trunc_kernel.cu
@@ -27,7 +27,13 @@ template <typename T>
 class TruncFunctor {
  public:
   __device__ TruncFunctor(const T x) : x_(x) {}
-  __device__ T operator()() { return trunc(x_); }
+  __device__ T operator()() {
+    if constexpr (std::is_same<T, phi::dtype::float16>::value || std::is_same<T, phi::dtype::bfloat16>::value) {
+      return static_cast<T>(trunc(static_cast<float>(x_)));
+    } else {
+      return trunc(x_);
+    }
+  }

From 528e5b88881250c8ea623e767a18ac9609520775 Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Sat, 22 Apr 2023 14:02:33 +0800
Subject: [PATCH 06/22] fix PR-CI-Codestyle-Check

---
 paddle/phi/kernels/gpu/bernoulli_kernel.cu | 12 ++++++++----
 paddle/phi/kernels/gpu/trunc_kernel.cu     |  3 ++-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index e0a9d3750d3fc..f86e128db3e2a 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -36,13 +36,17 @@ template <typename T>
 __device__ T convert_to_T(float rand_value, T x_value);
 
 template <>
-__device__ phi::dtype::float16 convert_to_T(float rand_value, phi::dtype::float16 x_value) {
-  return static_cast<phi::dtype::float16>(rand_value <= static_cast<float>(x_value));
+__device__ phi::dtype::float16 convert_to_T(
+    float rand_value, phi::dtype::float16 x_value) {
+  return static_cast<phi::dtype::float16>(rand_value <=
+                                          static_cast<float>(x_value));
 }
 
 template <>
-__device__ phi::dtype::bfloat16 convert_to_T(float rand_value, phi::dtype::bfloat16 x_value) {
-  return static_cast<phi::dtype::bfloat16>(rand_value <= static_cast<float>(x_value));
+__device__ phi::dtype::bfloat16 convert_to_T(
+    float rand_value, phi::dtype::bfloat16 x_value) {
+  return static_cast<phi::dtype::bfloat16>(rand_value <=
+                                           static_cast<float>(x_value));
 }
 
 template <>
diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu
index 449fe9bf400c8..c56344f4c7887 100644
--- a/paddle/phi/kernels/gpu/trunc_kernel.cu
+++ b/paddle/phi/kernels/gpu/trunc_kernel.cu
@@ -28,7 +28,8 @@ class TruncFunctor {
  public:
   __device__ TruncFunctor(const T x) : x_(x) {}
   __device__ T operator()() {
-    if constexpr (std::is_same<T, phi::dtype::float16>::value || std::is_same<T, phi::dtype::bfloat16>::value) {
+    if constexpr (std::is_same<T, phi::dtype::float16>::value ||
+                  std::is_same<T, phi::dtype::bfloat16>::value) {
       return static_cast<T>(trunc(static_cast<float>(x_)));
     } else {
       return trunc(x_);
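
Note on PATCHES 05-06: convert_to_T performs the comparison in float32 and
only narrows the 0/1 result, so the half-precision input is never compared
against curand's float output directly in reduced precision. A rough NumPy
model of the kernel's per-element logic (the function name and seeding are
illustrative, not Paddle API):

    import numpy as np

    def bernoulli_reference(p, out_dtype=np.float16, seed=0):
        # One uniform float32 draw per element, compared against the
        # probability in float32; only the 0/1 result is cast down.
        rng = np.random.default_rng(seed)
        rand = rng.random(p.shape, dtype=np.float32)
        return (rand <= p.astype(np.float32)).astype(out_dtype)
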
Date: Sat, 22 Apr 2023 16:56:35 +0800 Subject: [PATCH 07/22] fix bug of trunc_kernel.cu --- paddle/phi/kernels/gpu/trunc_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu index c56344f4c7887..103eecfced886 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -26,7 +26,7 @@ using phi::PADDLE_CUDA_NUM_THREADS; template class TruncFunctor { public: - __device__ TruncFunctor(const T x) : x_(x) {} + __device__ TruncFunctor(T x) : x_(x) {} __device__ T operator()() { if constexpr (std::is_same::value || std::is_same::value) { @@ -37,7 +37,7 @@ class TruncFunctor { } public: - const T x_; + T x_; }; template <> From 8b8361dd8b41792607aad2ebaa8f22fc507294b9 Mon Sep 17 00:00:00 2001 From: longranger2 <836253168@qq.com> Date: Sat, 22 Apr 2023 17:07:32 +0800 Subject: [PATCH 08/22] fix bug of trunc_kernel.cu --- paddle/phi/kernels/gpu/trunc_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu index 103eecfced886..b3713c69b8aa9 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -28,8 +28,8 @@ class TruncFunctor { public: __device__ TruncFunctor(T x) : x_(x) {} __device__ T operator()() { - if constexpr (std::is_same::value || - std::is_same::value) { + if (phi::is_same::value || + phi::is_same::value) { return static_cast(trunc(static_cast(x_))); } else { return trunc(x_); From 099d3bb3681276b0cd47256c21da8ab0c234bbf3 Mon Sep 17 00:00:00 2001 From: longranger2 <836253168@qq.com> Date: Sat, 22 Apr 2023 17:32:27 +0800 Subject: [PATCH 09/22] fix bug of trunc_kernel.cu --- paddle/phi/kernels/gpu/trunc_kernel.cu | 34 ++++++++++++++++++++------ 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu index b3713c69b8aa9..5b014f84a5270 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -23,18 +23,36 @@ namespace phi { using phi::PADDLE_CUDA_NUM_THREADS; +template +__device__ T device_trunc(T x); + +template <> +__device__ float device_trunc(float x) { + return truncf(x); +} + +template <> +__device__ double device_trunc(double x) { + return trunc(x); +} + +template <> +__device__ phi::dtype::float16 device_trunc( + phi::dtype::float16 x) { + return static_cast(truncf(static_cast(x))); +} + +template <> +__device__ phi::dtype::bfloat16 device_trunc( + phi::dtype::bfloat16 x) { + return static_cast(truncf(static_cast(x))); +} + template class TruncFunctor { public: __device__ TruncFunctor(T x) : x_(x) {} - __device__ T operator()() { - if (phi::is_same::value || - phi::is_same::value) { - return static_cast(trunc(static_cast(x_))); - } else { - return trunc(x_); - } - } + __device__ T operator()() { return device_trunc(x_); } public: T x_; From 22dbf8d1384e8fea0976fdaf9b64e8dcbbb96943 Mon Sep 17 00:00:00 2001 From: longranger2 <836253168@qq.com> Date: Wed, 3 May 2023 15:05:39 +0800 Subject: [PATCH 10/22] fix bug of trunc and bernoulli --- .../paddle/fluid/tests/unittests/test_bernoulli_op.py | 6 +++++- python/paddle/fluid/tests/unittests/test_trunc_op.py | 11 ++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py index 
From b20ac1a28ee46c238c5814cbb97749c924962bb6 Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Wed, 3 May 2023 15:05:39 +0800
Subject: [PATCH 10/22] fix bug of trunc and bernoulli

---
 .../paddle/fluid/tests/unittests/test_bernoulli_op.py |  6 +++++-
 python/paddle/fluid/tests/unittests/test_trunc_op.py  | 11 ++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
index e955f81338f6f..7ff356a74cc70 100644
--- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -127,12 +127,16 @@ def setUp(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place_customized(self.verify_output, place)
 
     def init_test_case(self):
         self.x = np.random.uniform(size=(1000, 784)).astype("float32")
         self.out = np.zeros((1000, 784)).astype("float32")
 
+    def verify_output(self, outs):
+        hist, prob = output_hist(np.array(outs[0]))
+        np.testing.assert_allclose(hist, prob, rtol=0, atol=0.01)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py
index 2f34c5577b5e5..404403fca0453 100644
--- a/python/paddle/fluid/tests/unittests/test_trunc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py
@@ -103,25 +103,22 @@ def init_dtype_type(self):
 )
 class TestTruncBF16OP(OpTest):
     def setUp(self):
-        self.op_type = "trunc"
         self.python_api = paddle.trunc
-        self.init_dtype_type()
+        self.op_type = "trunc"
+        self.dtype = np.uint16
         np.random.seed(2021)
-        x = np.random.random((20, 20)).astype(np.float64)
+        x = np.random.random((20, 20)).astype("float32")
         out = np.trunc(x)
         self.inputs = {'X': convert_float_to_uint16(x)}
         self.outputs = {'Out': convert_float_to_uint16(out)}
 
-    def init_dtype_type(self):
-        self.dtype = np.uint16
-
     def test_check_output(self):
         place = core.CUDAPlace(0)
         self.check_output_with_place(place)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out')
+        self.check_grad_with_place(place, ['X'], 'Out', numeric_grad_delta=1e-5)
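
Note on PATCH 10: an element-wise comparison against a fixed expected array is
meaningless for a random op, so the BF16 bernoulli test switches to
check_output_with_place_customized and validates sample statistics instead.
The intuition behind the histogram check, sketched with illustrative
tolerances:

    import numpy as np

    # With probabilities drawn uniformly from [0, 1), the expected fraction
    # of ones over the 1000 x 784 Bernoulli draws is E[p] = 0.5, and with
    # ~780k samples the observed mean should land well within 0.01 of it.
    p = np.random.uniform(size=(1000, 784)).astype(np.float32)
    samples = (np.random.uniform(size=p.shape) <= p).astype(np.float32)
    assert abs(samples.mean() - 0.5) < 0.01
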
From 9db702f753c6fc9a80e32490e3c0719a68e0d2e1 Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Tue, 9 May 2023 16:22:15 +0000
Subject: [PATCH 11/22] fix bug

---
 paddle/phi/kernels/gpu/bernoulli_kernel.cu | 29 +----------------
 paddle/phi/kernels/gpu/trunc_kernel.cu     | 31 ++-----------------
 .../tests/unittests/test_bernoulli_op.py   |  2 +-
 3 files changed, 5 insertions(+), 57 deletions(-)

diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index f86e128db3e2a..a2055597e7a64 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -32,33 +32,6 @@
 
 namespace phi {
 
-template <typename T>
-__device__ T convert_to_T(float rand_value, T x_value);
-
-template <>
-__device__ phi::dtype::float16 convert_to_T(
-    float rand_value, phi::dtype::float16 x_value) {
-  return static_cast<phi::dtype::float16>(rand_value <=
-                                          static_cast<float>(x_value));
-}
-
-template <>
-__device__ phi::dtype::bfloat16 convert_to_T(
-    float rand_value, phi::dtype::bfloat16 x_value) {
-  return static_cast<phi::dtype::bfloat16>(rand_value <=
-                                           static_cast<float>(x_value));
-}
-
-template <>
-__device__ float convert_to_T(float rand_value, float x_value) {
-  return static_cast<float>(rand_value <= x_value);
-}
-
-template <>
-__device__ double convert_to_T(float rand_value, double x_value) {
-  return static_cast<double>(rand_value <= x_value);
-}
-
 // 'curand_uniform4/hiprand_uniform4' generate 4 random number each time
 template <typename T>
 __global__ void bernoulli_cuda_kernel(
@@ -82,7 +55,7 @@ __global__ void bernoulli_cuda_kernel(
     for (size_t j = 0; j < 4; j++) {
       size_t idx = i + j;
      if (idx < size) {
-        out_data[idx] = convert_to_T((&rand.x)[j], x_data[idx]);
+        out_data[idx] = convert_to_T((&rand.x)[j], static_cast<MPType>(x_data[idx]));
       }
     }
   }
diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu
index 5b014f84a5270..c0004d5ba9fda 100644
--- a/paddle/phi/kernels/gpu/trunc_kernel.cu
+++ b/paddle/phi/kernels/gpu/trunc_kernel.cu
@@ -23,39 +23,14 @@ namespace phi {
 
 using phi::PADDLE_CUDA_NUM_THREADS;
 
-template <typename T>
-__device__ T device_trunc(T x);
-
-template <>
-__device__ float device_trunc(float x) {
-  return truncf(x);
-}
-
-template <>
-__device__ double device_trunc(double x) {
-  return trunc(x);
-}
-
-template <>
-__device__ phi::dtype::float16 device_trunc(
-    phi::dtype::float16 x) {
-  return static_cast<phi::dtype::float16>(truncf(static_cast<float>(x)));
-}
-
-template <>
-__device__ phi::dtype::bfloat16 device_trunc(
-    phi::dtype::bfloat16 x) {
-  return static_cast<phi::dtype::bfloat16>(truncf(static_cast<float>(x)));
-}
-
 template <typename T>
 class TruncFunctor {
  public:
-  __device__ TruncFunctor(T x) : x_(x) {}
-  __device__ T operator()() { return device_trunc(x_); }
+  __device__ TruncFunctor(const T x) : x_(x) {}
+  __device__ T operator()() { return trunc(static_cast<MPType>(x_)); }
 
  public:
-  T x_;
+  const T x_;
 };
 
 template <>
diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
index 7ff356a74cc70..8f0871585ce98 100644
--- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -36,7 +36,7 @@ def setUp(self):
             "X": np.random.uniform(size=(1000, 784)).astype(self.dtype)
         }
         self.attrs = {}
-        self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}
+        self.outputs = {"Out": np.zeros((1000, 784))}

From 38d7bc1e61b9dff87321384b7a69a11261dde9c5 Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Tue, 9 May 2023 16:40:00 +0000
Subject: [PATCH 12/22] fix bug

---
 paddle/phi/kernels/gpu/bernoulli_kernel.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index a2055597e7a64..38e2a6801dd54 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -29,6 +29,7 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/distribution_helper.h"
+#include "paddle/phi/common/amp_type_traits.h"
 
 namespace phi {
 
@@ -52,10 +53,11 @@ __global__ void bernoulli_cuda_kernel(
     funcs::uniform_distribution<float> dist;
     float4 rand = dist(&state);
 #pragma unroll
+    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
     for (size_t j = 0; j < 4; j++) {
       size_t idx = i + j;
      if (idx < size) {
-        out_data[idx] = convert_to_T((&rand.x)[j], static_cast<MPType>(x_data[idx]));
+        out_data[idx] = static_cast<T>((&rand.x)[j], static_cast<MPType>(x_data[idx]));
       }
     }
   }
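
Note on PATCH 12: static_cast<T>((&rand.x)[j] <= static_cast<MPType>(...)) is
the intent, but the line as committed uses a comma where the <= belongs, so
the cast's parenthesized expression is a C++ comma expression: the random
number is evaluated and discarded, and the kernel stores the cast probability
rather than a 0/1 sample. The comparison is only restored in PATCH 22. In
NumPy terms, the intended result versus what this revision effectively
computes:

    import numpy as np

    p = np.random.uniform(size=8).astype(np.float32)     # probabilities
    rand = np.random.uniform(size=8).astype(np.float32)  # uniform draws

    intended = (rand <= p).astype(np.float16)   # Bernoulli 0/1 sample
    effective = p.astype(np.float16)            # comma operator drops rand
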
From f4ce7735ce29f7dca1b5c26760011cb76c2c05fa Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Wed, 10 May 2023 03:12:15 +0000
Subject: [PATCH 13/22] fix bug of MPType

---
 paddle/phi/kernels/gpu/bernoulli_kernel.cu | 3 ++-
 paddle/phi/kernels/gpu/trunc_kernel.cu     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index 38e2a6801dd54..b1d0f7608aa7c 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -57,7 +57,8 @@ __global__ void bernoulli_cuda_kernel(
     for (size_t j = 0; j < 4; j++) {
       size_t idx = i + j;
       if (idx < size) {
-        out_data[idx] = static_cast<T>((&rand.x)[j], static_cast<MPType>(x_data[idx]));
+        out_data[idx] =
+            static_cast<T>((&rand.x)[j], static_cast<MPType>(x_data[idx]));
       }
     }
   }
diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu
index c0004d5ba9fda..4db1622de4efc 100644
--- a/paddle/phi/kernels/gpu/trunc_kernel.cu
+++ b/paddle/phi/kernels/gpu/trunc_kernel.cu
@@ -22,11 +22,12 @@ namespace phi {
 
 using phi::PADDLE_CUDA_NUM_THREADS;
+using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
 
 template <typename T>
 class TruncFunctor {
  public:
-  __device__ TruncFunctor(const T x) : x_(x) {}
+  __device__ TruncFunctor(const T x) : x_(x) {}
   __device__ T operator()() { return trunc(static_cast<MPType>(x_)); }
 
  public:
   const T x_;

From bd62029b96fc99ca6ae1196741ca5a204458545b Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Wed, 10 May 2023 03:18:41 +0000
Subject: [PATCH 14/22] fix check_variable_and_dtype

---
 python/paddle/tensor/random.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 9629796494055..1db1ae522b43b 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -76,7 +76,7 @@ def bernoulli(x, name=None):
     if in_dygraph_mode():
         return _C_ops.bernoulli(x)
     else:
-        check_variable_and_dtype(x, "x", ["float32", "float64"], "bernoulli")
+        check_variable_and_dtype(x, "x", ["float16", "float32", "float64"], "bernoulli")
 
         helper = LayerHelper("randint", **locals())
         out = helper.create_variable_for_type_inference(

From 3782bd1c9efabc04fef25a6f83e7b6aeb3ddc4d8 Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Wed, 10 May 2023 14:54:43 +0800
Subject: [PATCH 15/22] fix bug of MPType

---
 paddle/phi/kernels/gpu/trunc_kernel.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu
index 4db1622de4efc..a87ba55a310b3 100644
--- a/paddle/phi/kernels/gpu/trunc_kernel.cu
+++ b/paddle/phi/kernels/gpu/trunc_kernel.cu
@@ -17,6 +17,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {

From 22dbf8d1384e8fea0976fdaf9b64e8dcbbb96943 Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Wed, 10 May 2023 17:42:59 +0800
Subject: [PATCH 16/22] fix bug of undefined T

---
 paddle/phi/kernels/gpu/trunc_kernel.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu
index a87ba55a310b3..4724edf506382 100644
--- a/paddle/phi/kernels/gpu/trunc_kernel.cu
+++ b/paddle/phi/kernels/gpu/trunc_kernel.cu
@@ -23,13 +23,15 @@ namespace phi {
 
 using phi::PADDLE_CUDA_NUM_THREADS;
-using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
 
 template <typename T>
 class TruncFunctor {
  public:
   __device__ TruncFunctor(const T x) : x_(x) {}
-  __device__ T operator()() { return trunc(static_cast<MPType>(x_)); }
+  __device__ T operator()() {
+    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+    return trunc(static_cast<MPType>(x_));
+  }
 
  public:
   const T x_;
From 7def562807d3b46705553de3dd9391a027288942 Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Thu, 11 May 2023 22:03:43 +0800
Subject: [PATCH 17/22] fix bug

---
 paddle/phi/kernels/gpu/bernoulli_kernel.cu | 2 +-
 paddle/phi/kernels/gpu/trunc_kernel.cu     | 2 +-
 python/paddle/tensor/random.py             | 4 +++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index b1d0f7608aa7c..18cd7e796ef11 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -26,10 +26,10 @@
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/distribution_helper.h"
-#include "paddle/phi/common/amp_type_traits.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu
index 4724edf506382..bdbdb80a97f5c 100644
--- a/paddle/phi/kernels/gpu/trunc_kernel.cu
+++ b/paddle/phi/kernels/gpu/trunc_kernel.cu
@@ -30,7 +30,7 @@ class TruncFunctor {
   __device__ TruncFunctor(const T x) : x_(x) {}
   __device__ T operator()() {
     using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-    return trunc(static_cast<MPType>(x_));
+    return static_cast<T>(trunc(static_cast<MPType>(x_)));
   }
 
  public:
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 1db1ae522b43b..98b123b96e411 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -76,7 +76,9 @@ def bernoulli(x, name=None):
     if in_dygraph_mode():
         return _C_ops.bernoulli(x)
     else:
-        check_variable_and_dtype(x, "x", ["float16", "float32", "float64"], "bernoulli")
+        check_variable_and_dtype(
+            x, "x", ["float16", "float32", "float64"], "bernoulli"
+        )
 
         helper = LayerHelper("randint", **locals())
         out = helper.create_variable_for_type_inference(

From 3e9063aa522eee0a768050891b8a9e5ac1ffab40 Mon Sep 17 00:00:00 2001
From: LoneRanger <836253168@qq.com>
Date: Fri, 12 May 2023 17:30:56 +0800
Subject: [PATCH 18/22] Update test_bernoulli_op.py

---
 python/paddle/fluid/tests/unittests/test_bernoulli_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
index 8f0871585ce98..6d7a73ba49d1a 100644
--- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -135,7 +135,7 @@ def init_test_case(self):
 
     def verify_output(self, outs):
         hist, prob = output_hist(np.array(outs[0]))
-        np.testing.assert_allclose(hist, prob, rtol=0, atol=0.01)
+        np.testing.assert_allclose(hist, prob)
 
 
 if __name__ == "__main__":

From 13a2c74a8c3f295cb2f1a52950b0470442f70cf3 Mon Sep 17 00:00:00 2001
From: LoneRanger <836253168@qq.com>
Date: Mon, 15 May 2023 14:55:43 +0800
Subject: [PATCH 19/22] Update test_bernoulli_op.py

---
 python/paddle/fluid/tests/unittests/test_bernoulli_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
index 6d7a73ba49d1a..05d4c75824465 100644
--- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -36,7 +36,7 @@ def setUp(self):
             "X": np.random.uniform(size=(1000, 784)).astype(self.dtype)
         }
         self.attrs = {}
-        self.outputs = {"Out": np.zeros((1000, 784))}
+        self.outputs = {"Out": np.zeros((1000, 784)).astype(self.dtype}
 
     def init_dtype(self):
         self.dtype = np.float32

From 3c4e333f5dcfe470a72262223f0995bc2f40f31f Mon Sep 17 00:00:00 2001
From: LoneRanger <836253168@qq.com>
Date: Mon, 15 May 2023 22:30:57 +0800
Subject: [PATCH 20/22] Update test_bernoulli_op.py

---
 python/paddle/fluid/tests/unittests/test_bernoulli_op.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
index 05d4c75824465..7b3fabdf1484f 100644
--- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -132,7 +132,9 @@ def test_check_output(self):
     def init_test_case(self):
         self.x = np.random.uniform(size=(1000, 784)).astype("float32")
         self.out = np.zeros((1000, 784)).astype("float32")
-
+        self.x = convert_uint16_to_float(convert_float_to_uint16(self.x))
+        self.out = convert_uint16_to_float(convert_float_to_uint16(self.out))
+    
     def verify_output(self, outs):
         hist, prob = output_hist(np.array(outs[0]))
         np.testing.assert_allclose(hist, prob)
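
Note on PATCH 20: convert_float_to_uint16 quantizes the float32 inputs to the
nearest representable bfloat16, so the NumPy reference must be built from the
already-quantized values, or the check would compare against probabilities the
kernel never actually received. The round trip, sketched with the truncating
encoding from the PATCH 04 note:

    import numpy as np

    def bf16_round_trip(x):
        # float32 -> bfloat16 bits -> float32, i.e. what the test now applies
        # to self.x and self.out before computing the reference outputs.
        bits = (np.asarray(x, np.float32).view(np.uint32) >> 16).astype(np.uint16)
        return (bits.astype(np.uint32) << 16).view(np.float32)

    x = bf16_round_trip(np.random.uniform(size=(1000, 784)).astype("float32"))
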
From e7ad7f237110421a01203bd09354de97444016e4 Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Tue, 16 May 2023 10:06:41 +0800
Subject: [PATCH 21/22] fix bug of import

---
 .../paddle/fluid/tests/unittests/test_bernoulli_op.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
index 7b3fabdf1484f..b4ae430d5ead6 100644
--- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -15,7 +15,11 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest, convert_float_to_uint16
+from eager_op_test import (
+    OpTest,
+    convert_float_to_uint16,
+    convert_uint16_to_float,
+)
 
 import paddle
 from paddle.fluid import core
@@ -40,7 +44,7 @@ def setUp(self):
             "X": np.random.uniform(size=(1000, 784)).astype(self.dtype)
         }
         self.attrs = {}
-        self.outputs = {"Out": np.zeros((1000, 784)).astype(self.dtype}
+        self.outputs = {"Out": np.zeros((1000, 784)).astype(self.dtype)}
 
     def init_dtype(self):
         self.dtype = np.float32
@@ -134,7 +138,7 @@ def init_test_case(self):
         self.out = np.zeros((1000, 784)).astype("float32")
         self.x = convert_uint16_to_float(convert_float_to_uint16(self.x))
         self.out = convert_uint16_to_float(convert_float_to_uint16(self.out))
-    
+
     def verify_output(self, outs):
From f922dd818902f0f19741064806251a1b2fd482fd Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Wed, 31 May 2023 16:07:14 +0800
Subject: [PATCH 22/22] remove the trunc

---
 paddle/phi/kernels/gpu/bernoulli_kernel.cu  |  2 +-
 paddle/phi/kernels/gpu/trunc_grad_kernel.cu |  4 +---
 paddle/phi/kernels/gpu/trunc_kernel.cu      | 17 +++--------------
 3 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index 18cd7e796ef11..be41dcb524947 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -58,7 +58,7 @@ __global__ void bernoulli_cuda_kernel(
       size_t idx = i + j;
       if (idx < size) {
         out_data[idx] =
-            static_cast<T>((&rand.x)[j], static_cast<MPType>(x_data[idx]));
+            static_cast<T>((&rand.x)[j] <= static_cast<MPType>(x_data[idx]));
       }
     }
   }
diff --git a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu
index 40e1404cd900d..8a88383e6e4f0 100644
--- a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu
@@ -52,6 +52,4 @@ PD_REGISTER_KERNEL(trunc_grad,
                    float,
                    double,
                    int,
-                   int64_t,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu
index bdbdb80a97f5c..4c5876c2ba924 100644
--- a/paddle/phi/kernels/gpu/trunc_kernel.cu
+++ b/paddle/phi/kernels/gpu/trunc_kernel.cu
@@ -28,10 +28,7 @@ template <typename T>
 class TruncFunctor {
  public:
   __device__ TruncFunctor(const T x) : x_(x) {}
-  __device__ T operator()() {
-    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-    return static_cast<T>(trunc(static_cast<MPType>(x_)));
-  }
+  __device__ T operator()() { return trunc(x_); }
 
  public:
   const T x_;
@@ -82,13 +79,5 @@ void TruncKernel(const Context& dev_ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(trunc,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::TruncKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(
+    trunc, GPU, ALL_LAYOUT, phi::TruncKernel, float, double, int, int64_t) {}
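
Note on the final state (PATCH 22): the fp16/bf16 work on trunc and trunc_grad
is backed out entirely, leaving only bernoulli registered for the new dtypes,
and the kernel's comparison operator is restored. The resulting per-element
behaviour, modelled in NumPy (MPType is float32 for float16/bfloat16 and T
itself for float/double):

    import numpy as np

    x = np.random.uniform(size=(1000, 784)).astype(np.float16)  # probabilities, dtype T
    rand = np.random.uniform(size=x.shape).astype(np.float32)   # curand_uniform4 output
    out = (rand <= x.astype(np.float32)).astype(np.float16)     # static_cast<T>(rand <= MPType(x))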