PaddlePaddle · zhwesky2010 · Aug 26, 2025 · Aug 14, 2025 · Aug 20, 2025 · Aug 20, 2025
diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml
@@ -786,7 +786,12 @@
 
 - op : bmm
   args : (Tensor x, Tensor y)
-  output : Tensor
+  python_api :
+    name : [paddle.bmm, paddle.Tensor.bmm]
+    args_alias:
+      x : [input]
+      y : [mat2]
+  output : Tensor(out)
   infer_meta :
     func : BmmInferMeta
   kernel :

diff --git a/python/paddle/_paddle_docs.py b/python/paddle/_paddle_docs.py
@@ -741,6 +741,59 @@ def triu(
 ) -> Tensor
 """,
 )
+
+add_doc_and_signature(
+    "bmm",
+    """
+    Applies batched matrix multiplication to two tensors.
+
+    Both of the two input tensors must be three-dimensional and share the same batch size.
+
+    If x is a (b, m, k) tensor and y is a (b, k, n) tensor, the output will be a (b, m, n) tensor.
+
+    Args:
+        x (Tensor): The first input Tensor. Alias: ``input``.
+        y (Tensor): The second input Tensor. Alias: ``mat2``.
+        name (str|None, optional): A name for this layer. If set to None, the layer
+            will be named automatically. Default: None.
+        out (Tensor, optional): The output tensor. Keyword-only argument. Default: None.
+
+    Returns:
+        Tensor: The product Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+
+            >>> # In imperative mode:
+            >>> # size x: (2, 2, 3) and y: (2, 3, 2)
+            >>> x = paddle.to_tensor([[[1.0, 1.0, 1.0],
+            ...                     [2.0, 2.0, 2.0]],
+            ...                     [[3.0, 3.0, 3.0],
+            ...                     [4.0, 4.0, 4.0]]])
+            >>> y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],
+            ...                     [[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
+            >>> out = paddle.bmm(x, y)
+            >>> print(out)
+            Tensor(shape=[2, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[6. , 6. ],
+              [12., 12.]],
+             [[45., 45.],
+              [60., 60.]]])
+
+    """,
+    """
+def bmm(
+    x: Tensor,
+    y: Tensor,
+    name: str | None = None,
+    *,
+    out: Tensor | None = None,
+) -> Tensor
+""",
+)
+
 # lihaoyang
 
 # lubingxin

diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
@@ -21,6 +21,7 @@
 
 import paddle
 from paddle import _C_ops
+from paddle._C_ops import bmm  # noqa: F401
 from paddle.base.libpaddle import DataType
 from paddle.common_ops_import import VarDesc
 from paddle.tensor.math import broadcast_shape
@@ -2546,70 +2547,6 @@ def matrix_rank(
             return out
 
 
-def bmm(x: Tensor, y: Tensor, name: str | None = None) -> Tensor:
-    """
-    Applies batched matrix multiplication to two tensors.
-
-    Both of the two input tensors must be three-dimensional and share the same batch size.
-
-    If x is a (b, m, k) tensor, y is a (b, k, n) tensor, the output will be a (b, m, n) tensor.
-
-    Args:
-        x (Tensor): The input Tensor.
-        y (Tensor): The input Tensor.
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None.
-
-    Returns:
-        Tensor: The product Tensor.
-
-    Examples:
-        .. code-block:: python
-
-            >>> import paddle
-
-            >>> # In imperative mode:
-            >>> # size x: (2, 2, 3) and y: (2, 3, 2)
-            >>> x = paddle.to_tensor([[[1.0, 1.0, 1.0],
-            ...                     [2.0, 2.0, 2.0]],
-            ...                     [[3.0, 3.0, 3.0],
-            ...                     [4.0, 4.0, 4.0]]])
-            >>> y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],
-            ...                     [[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
-            >>> out = paddle.bmm(x, y)
-            >>> print(out)
-            Tensor(shape=[2, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
-            [[[6. , 6. ],
-              [12., 12.]],
-             [[45., 45.],
-              [60., 60.]]])
-
-    """
-    if in_dynamic_or_pir_mode():
-        return _C_ops.bmm(x, y)
-    else:
-        x_shape = x.shape
-        y_shape = y.shape
-        if not len(x_shape) == len(y_shape) == 3:
-            raise ValueError(
-                f"x and y should be 3-dimensional. But received x's dimension: {x_shape}, y's dimension: {y_shape}"
-            )
-        if x_shape[2] != -1 and y_shape[1] != -1 and x_shape[2] != y_shape[1]:
-            raise ValueError(
-                f"x's width must be equal with y's height. But received x's shape: {x_shape}, y's shape: {y_shape}"
-            )
-        if x_shape[0] != -1 and y_shape[0] != -1 and x_shape[0] != y_shape[0]:
-            raise ValueError(
-                f"x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {x_shape}, y's shape: {y_shape}"
-            )
-        helper = LayerHelper('bmm', **locals())
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(
-            type='bmm', inputs={'X': x, 'Y': y}, outputs={'Out': out}
-        )
-        return out
-
-
 def histogram(
     input: Tensor,
     bins: int = 100,

diff --git a/test/legacy_test/test_bmm_op.py b/test/legacy_test/test_bmm_op.py
@@ -26,8 +26,8 @@ class TestBmmOp(OpTest):
     def setUp(self):
         self.op_type = "bmm"
         self.prim_op_type = "comp"
-        self.python_api = paddle.tensor.bmm
-        self.public_python_api = paddle.tensor.bmm
+        self.python_api = paddle.Tensor.bmm
+        self.public_python_api = paddle.Tensor.bmm
         X = np.random.random((10, 3, 4)).astype("float64")
         Y = np.random.random((10, 4, 5)).astype("float64")
         self.inputs = {'X': X, 'Y': Y}
@@ -46,8 +46,8 @@ def setUp(self):
         self.op_type = "bmm"
         self.prim_op_type = "comp"
         self.dtype = np.float16
-        self.python_api = paddle.tensor.bmm
-        self.public_python_api = paddle.tensor.bmm
+        self.python_api = paddle.Tensor.bmm
+        self.public_python_api = paddle.Tensor.bmm
         X = np.random.random((10, 3, 4)).astype("float16")
         Y = np.random.random((10, 4, 5)).astype("float16")
         self.inputs = {'X': X, 'Y': Y}
@@ -71,8 +71,8 @@ def setUp(self):
         self.op_type = "bmm"
         self.prim_op_type = "comp"
         self.dtype = np.uint16
-        self.python_api = paddle.tensor.bmm
-        self.public_python_api = paddle.tensor.bmm
+        self.python_api = paddle.Tensor.bmm
+        self.public_python_api = paddle.Tensor.bmm
         X = np.random.random((10, 3, 4)).astype("float32")
         Y = np.random.random((10, 4, 5)).astype("float32")
         self.inputs = {'X': X, 'Y': Y}
@@ -173,5 +173,52 @@ def test_checkout_grad(self):
         self.check_grad(['X', 'Y'], 'Out', check_pir=True)
 
 
+class TestBmmOutAndParamDecorator(unittest.TestCase):
+    def setUp(self):
+        paddle.disable_static()
+        self.x_np = np.random.random((10, 3, 4)).astype("float64")
+        self.y_np = np.random.random((10, 4, 5)).astype("float64")
+        self.test_types = ["decorator", "out", "out_decorator"]
+
+    def do_test(self, test_type):
+        x = paddle.to_tensor(self.x_np, stop_gradient=False)
+        y = paddle.to_tensor(self.y_np, stop_gradient=False)
+        if test_type == 'raw':
+            result = paddle.bmm(x, y)
+            result.mean().backward()
+            return result, x.grad, y.grad
+        elif test_type == 'decorator':
+            result = paddle.bmm(input=x, mat2=y)
+            result.mean().backward()
+            return result, x.grad, y.grad
+        elif test_type == 'out':
+            out = paddle.empty([10, 3, 5], dtype='float64')
+            out.stop_gradient = False
+            paddle.bmm(x, y, out=out)
+            out.mean().backward()
+            return out, x.grad, y.grad
+        elif test_type == 'out_decorator':
+            out = paddle.empty([10, 3, 5], dtype='float64')
+            out.stop_gradient = False
+            paddle.bmm(input=x, mat2=y, out=out)
+            out.mean().backward()
+            return out, x.grad, y.grad
+        else:
+            raise ValueError(f"Unknown test type: {test_type}")
+
+    def test_all(self):
+        out_std, grad_x_std, grad_y_std = self.do_test('raw')
+        for test_type in self.test_types:
+            out, grad_x, grad_y = self.do_test(test_type)
+            np.testing.assert_allclose(out.numpy(), out_std.numpy(), rtol=1e-7)
+            np.testing.assert_allclose(
+                grad_x.numpy(), grad_x_std.numpy(), rtol=1e-7
+            )
+
+            np.testing.assert_allclose(
+                grad_y.numpy(), grad_y_std.numpy(), rtol=1e-7
+            )
+
+
 if __name__ == "__main__":
     unittest.main()