diff --git a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py
index 1911ec3ef65af..a5f59c6d1f261 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py
@@ -57,7 +57,30 @@ def test_check_grad(self):
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestDropoutBiasFuseOp1(unittest.TestCase):
+class TestSoftmaxMaskFuseOp1(OpTest):
+    def setUp(self):
+        self.op_type = "softmax_mask_fuse_upper_triangle"
+        x = np.random.random((1, 1, 32, 32))
+        self.inputs = {'X': x}
+        rst = _get_softmax_upper(x)
+        self.outputs = {'Out': rst}
+
+    def test_check_output(self):
+        try:
+            self.check_output_with_place(core.CPUPlace())
+        except NotImplementedError:
+            pass
+
+    def test_check_grad(self):
+        try:
+            self.check_grad_with_place(core.CPUPlace(), ["X"], "Out")
+        except NotImplementedError:
+            pass
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestDropoutBiasFuseOp2(unittest.TestCase):
     # test the python side API for softmax_mask_fuse op
     def setUp(self):
         np.random.seed(123)
diff --git a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py
index 5ebeadc02fe65..b81ad4ecdc82a 100644
--- a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py
+++ b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py
@@ -24,7 +24,7 @@ def softmax_mask_fuse_upper_triangle(x):
     Fuse softmax mask together without even give a mask.
     Under GPT model, the mask is always be a upper triangle
     so we can simply mask the upper triangle part of x to get the mask result
-    :param x: the input x
+    :param x: the input x (rst of QK)
     :return: the result of softmax mask fuse (upper triangle)
     """
     if in_dygraph_mode():
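
The new CPU test builds its expected output from the file's existing `_get_softmax_upper` reference helper, which lies outside this hunk. A minimal NumPy sketch of such a reference is shown below; the helper name, the mask fill value, and the final usage line are illustrative assumptions, not the test file's actual code.

```python
import numpy as np


def _softmax_upper_reference(x):
    """Hypothetical stand-in for the test file's _get_softmax_upper helper."""
    # x: attention scores of shape (batch, heads, seq_q, seq_k), i.e. the QK result.
    seq_q, seq_k = x.shape[-2], x.shape[-1]
    # Mask the strictly upper-triangular part (future positions) with a large
    # negative value so those positions get ~0 probability after softmax.
    mask = np.triu(np.ones((seq_q, seq_k), dtype=bool), k=1)
    masked = np.where(mask, -1e4, x)
    # Numerically stable softmax over the last axis.
    masked = masked - masked.max(axis=-1, keepdims=True)
    e = np.exp(masked)
    return e / e.sum(axis=-1, keepdims=True)


# Shape mirrors the new unit test: (1, 1, 32, 32).
rst = _softmax_upper_reference(np.random.random((1, 1, 32, 32)))
```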