
[Hackathon No.46] Implement float16 data type support for the Paddle gumbel_softmax operator #50923

Merged: 5 commits, Mar 17, 2023
1 change: 1 addition & 0 deletions paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu
@@ -21,5 +21,6 @@ PD_REGISTER_KERNEL(gumbel_softmax_grad,
                    GPU,
                    ALL_LAYOUT,
                    phi::GumbelSoftmaxGradKernel,
+                   phi::dtype::float16,
                    float,
                    double) {}
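
For reviewers who want to exercise the new registrations end to end, a minimal dygraph check might look like the sketch below. This is a sketch, assuming a CUDA build of Paddle; the float16 kernels in this PR are GPU-only.

# Minimal check of the float16 forward + backward kernels (assumes CUDA Paddle).
import paddle
import paddle.nn.functional as F

paddle.set_device("gpu")
x = paddle.randn([8, 5]).astype("float16")
x.stop_gradient = False

y = F.gumbel_softmax(x, temperature=0.5)
y.sum().backward()

print(y.dtype)       # paddle.float16 (forward kernel)
print(x.grad.dtype)  # paddle.float16 (the grad kernel registered above)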
30 changes: 21 additions & 9 deletions paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
@@ -13,7 +13,7 @@
 // limitations under the License.

 #include "paddle/phi/kernels/gumbel_softmax_kernel.h"
-
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h"
@@ -124,9 +124,11 @@ __global__ void AddGumbelNoiseCUDAKernel(const T* input_data,
                                          int64_t n) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   int step = blockDim.x * gridDim.x;
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int64_t i = index; i < n; i += step) {
-    T gumbel_noise = -log(-log(noise[i]));
-    output_data[i] = (gumbel_noise + input_data[i]) / temperature;
+    MPType gumbel_noise = -log(-log(static_cast<MPType>(noise[i])));
+    output_data[i] = static_cast<T>(
+        (gumbel_noise + static_cast<MPType>(input_data[i])) / temperature);
   }
 }
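
The MPTypeTrait change makes the Gumbel transform run in float32 when T is float16, with a single cast back to half precision at the end. A small numpy sketch (illustration only, not Paddle code) of why one final rounding beats rounding after every operation:

# numpy illustration (not Paddle code): same fp16 inputs, different arithmetic
# precision. `half` rounds to fp16 after every op; `mixed` mirrors the MPType
# kernel, doing the math in wider precision with one final cast to fp16.
import numpy as np

rng = np.random.default_rng(0)
u = rng.uniform(0.01, 0.99, size=100_000).astype(np.float16)  # fp16 noise
x = rng.standard_normal(100_000).astype(np.float16)           # fp16 logits
tau = 0.5

# reference: identical fp16 inputs, all arithmetic in float64
ref = (-np.log(-np.log(u.astype(np.float64))) + x.astype(np.float64)) / tau

half = (-np.log(-np.log(u)) + x) / np.float16(tau)            # all-fp16 math
mixed = ((-np.log(-np.log(u.astype(np.float32))) + x)
         / np.float32(tau)).astype(np.float16)

print(np.abs(half.astype(np.float64) - ref).max())   # error accumulates per op
print(np.abs(mixed.astype(np.float64) - ref).max())  # only one final rounding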

@@ -152,10 +154,15 @@ struct GumbleNoiseGenerator<GPUContext, T> {
     uint64_t offset = seed_offset.second;

     thrust::counting_iterator<int64_t> index_sequence_begin(0);
-    thrust::transform(index_sequence_begin,
-                      index_sequence_begin + size,
-                      thrust::device_ptr<T>(random_data),
-                      UniformCUDAGenerator<T>(0.00001, 1, seed, size * offset));
+    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+    thrust::transform(
+        index_sequence_begin,
+        index_sequence_begin + size,
+        thrust::device_ptr<T>(random_data),
+        UniformCUDAGenerator<T>(static_cast<phi::dtype::float16>(0.00001),
+                                static_cast<phi::dtype::float16>(1),
+                                seed,
+                                size * offset));

     // add gumbel noise to X
     const int thread_size = 512;

Contributor (on the static_cast<phi::dtype::float16> lines): Is this change necessary? Whatever type T is, the value is cast to FP16 here.

Author: done
@@ -168,5 +175,10 @@ struct GumbleNoiseGenerator<GPUContext, T> {
 } // namespace phi
 #endif

-PD_REGISTER_KERNEL(
-    gumbel_softmax, GPU, ALL_LAYOUT, phi::GumbelSoftmaxKernel, float, double) {}
+PD_REGISTER_KERNEL(gumbel_softmax,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GumbelSoftmaxKernel,
+                   phi::dtype::float16,
+                   float,
+                   double) {}
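
With the forward kernel registered, a quick sanity check of both soft and hard sampling in float16 could look like this (a sketch, again assuming a CUDA build of Paddle):

# Forward-only check of the float16 GPU kernel (assumes CUDA Paddle).
import paddle
import paddle.nn.functional as F

paddle.set_device("gpu")
x = paddle.randn([4, 6]).astype("float16")

soft = F.gumbel_softmax(x, temperature=1.0)             # rows sum to 1
hard = F.gumbel_softmax(x, temperature=1.0, hard=True)  # one-hot rows

print(soft.dtype, hard.dtype)  # both paddle.float16
print(hard.sum(axis=-1))       # exactly 1.0 per row when hard=True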
8 changes: 8 additions & 0 deletions python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py
@@ -103,6 +103,14 @@ def init_attrs(self):
         self.dtype = "float64"


+class TestGumbelSoftmaxOp6(TestGumbelSoftmaxOp):
+    def init_attrs(self):
+        self.shape = [20, 10, 5]
+        self.attrs = {"hard": True, "axis": 1}
+        self.count_expected = 100
+        self.dtype = np.float16
+
+
 class TestGumbelSoftmaxOpSampleDistribution(OpTest):
     def softmax(self, x):
         x_row_max = x.max(axis=-1)

Contributor: Make these 4 unit tests inherit from TestGumbelSoftmaxFP16OP.

Author: Hi, the earlier TestGumbelSoftmax_ZeroDim_FP16OP targets the zero-dim case and has no init_attrs() method, so the name cannot be changed to TestGumbelSoftmaxFP16OP; these tests therefore inherit directly from TestGumbelSoftmaxOp.

Author: done
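
For context, the shared-base-class structure the reviewer seems to be suggesting would look roughly like the sketch below; the class names and the super().init_attrs() pattern are assumptions for illustration, not the code that was merged.

# Hypothetical sketch of the reviewer's suggestion: a shared FP16 base class
# for the four FP16 variants. Names are illustrative, not the merged code.
import numpy as np

class TestGumbelSoftmaxFP16Op(TestGumbelSoftmaxOp):  # base test from this file
    def init_attrs(self):
        super().init_attrs()     # keep the parent's shape/attrs
        self.dtype = np.float16  # only the dtype changes


class TestGumbelSoftmaxFP16Op2(TestGumbelSoftmaxFP16Op):
    def init_attrs(self):
        super().init_attrs()
        self.shape = [20, 10, 5]
        self.attrs = {"hard": True, "axis": 1}
        self.count_expected = 100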
6 changes: 4 additions & 2 deletions python/paddle/nn/functional/activation.py
@@ -1664,7 +1664,7 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None):
     Parameters:
         x (Tensor): An N-D Tensor, the first N - 1 dimensions index into a batch
             of independent distributions and the last dimension represents
-            a vector of probabilities with datatype float32, float64.
+            a vector of probabilities with datatype float16, float32, float64.
         temperature (float, optional): non-negative scalar temperature.
             Default is 1.0.
         hard (bool, optional): if True, the returned samples will be discretized as
@@ -1705,7 +1705,9 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None):
     )

     helper = LayerHelper("gumbel_softmax", **locals())
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'gumbel_softmax')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'gumbel_softmax'
+    )
     out = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(
         type='gumbel_softmax',
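
For reference, the overall computation this operator performs, uniform noise in (1e-5, 1), the Gumbel transform, temperature scaling, then a softmax along the chosen axis, can be written out in numpy (a reference sketch, not Paddle's implementation):

# Reference numpy sketch of the gumbel_softmax forward computation; this
# mirrors the GPU kernel above but is not Paddle's implementation.
import numpy as np

def gumbel_softmax_ref(x, temperature=1.0, axis=-1, seed=0):
    rng = np.random.default_rng(seed)
    u = rng.uniform(1e-5, 1.0, size=x.shape)   # same range as the CUDA generator
    g = -np.log(-np.log(u))                    # Gumbel(0, 1) noise
    z = (x + g) / temperature                  # AddGumbelNoiseCUDAKernel step
    z = z - z.max(axis=axis, keepdims=True)    # numerically stable softmax
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

sample = gumbel_softmax_ref(np.zeros(5), temperature=0.5)
print(sample, sample.sum())                    # a random point on the simplex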