PaddlePaddle
diff --git a/.github/actions/check-bypass/action.yml b/.github/actions/check-bypass/action.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/check-bypass.yml b/.github/workflows/check-bypass.yml
Lines changed: 1 addition & 1 deletion
diff --git a/paddle/fluid/distributed/collective/deep_ep/kernels/utils.cuh b/paddle/fluid/distributed/collective/deep_ep/kernels/utils.cuh
Lines changed: 18 additions & 5 deletions
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc
Lines changed: 1 addition & 1 deletion
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
Lines changed: 21 additions & 3 deletions
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
Lines changed: 12 additions & 0 deletions
diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py
Lines changed: 2 additions & 0 deletions
diff --git a/python/paddle/special.py b/python/paddle/special.py
Lines changed: 19 additions & 0 deletions
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
Lines changed: 4 additions & 1 deletion
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
Lines changed: 19 additions & 6 deletions
@@ -18,7 +18,7 @@ runs:
     - id: check-bypass
       name: Check Bypass
       env:
-        CI_TEAM_MEMBERS: '["SigureMo", "risemeup1", "tianshuo78520a", "0x3878f", "swgu98", "luotao1", "XieYunshen"]'
+        CI_TEAM_MEMBERS: '["tianshuo78520a", "swgu98", "risemeup1", "XieYunshen"]'
       uses: PFCCLab/ci-bypass@v1
       with:
         github-token: ${{ inputs.github-token }}
 
@@ -20,7 +20,7 @@ jobs:
     permissions:
       contents: read
     env:
-      CI_TEAM_MEMBERS: '["SigureMo", "risemeup1", "tianshuo78520a", "0x3878f", "swgu98", "luotao1", "XieYunshen", "mmglove", "fightfat"]'
+      CI_TEAM_MEMBERS: '["tianshuo78520a", "swgu98", "risemeup1" , "XieYunshen"]'
     outputs:
       can-skip: ${{ steps.check-bypass.outputs.can-skip }}
     steps:
 
@@ -418,9 +418,6 @@ __device__ __forceinline__ float exp2f_approx(const float &x) {
   return ret;
 }
 
-// TMA PTX instructions
-#ifndef DISABLE_SM90_FEATURES
-
 __device__ __forceinline__ uint32_t elect_one_sync(int lane_id) {
   uint32_t pred = 0;
   asm volatile(
@@ -437,23 +434,30 @@ __device__ __forceinline__ uint32_t elect_one_sync(int lane_id) {
 }
 
 // Issues the PTX instruction `fence.proxy.async.shared::cta`.
 //
 // The instruction is only compiled in for device passes targeting
 // __CUDA_ARCH__ >= 900 (requires SM90+). For every other target the body is
 // empty, so the call silently becomes a no-op.
 __device__ __forceinline__ void fence_view_async_shared() {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("fence.proxy.async.shared::cta; \n" ::);
+#endif
 }
 
 // Issues the PTX instruction `fence.mbarrier_init.release.cluster`.
 // Presumably meant to be called after mbarrier_init() so the initialised
 // barrier becomes visible to other threads -- TODO confirm against callers.
 //
 // The instruction is only compiled in for device passes targeting
 // __CUDA_ARCH__ >= 900 (requires SM90+). For every other target the body is
 // empty, so the call silently becomes a no-op.
 __device__ __forceinline__ void fence_barrier_init() {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("fence.mbarrier_init.release.cluster; \n" ::);
+#endif
 }
 
 // Initialises the 64-bit mbarrier object at `mbar_ptr` by issuing the PTX
 // instruction `mbarrier.init.shared::cta.b64`.
 //
 // mbar_ptr:     generic pointer to the barrier; it is converted with
 //               __cvta_generic_to_shared and narrowed to 32 bits, so it
 //               presumably must point into shared memory -- TODO confirm.
 // arrive_count: passed to the instruction as its count operand (%0).
 //
 // The instruction is only compiled in for device passes targeting
 // __CUDA_ARCH__ >= 900 (requires SM90+); otherwise nothing is emitted.
 __device__ __forceinline__ void mbarrier_init(uint64_t *mbar_ptr,
                                               uint32_t arrive_count) {
   // 32-bit shared-space address used as the [%1] operand below.
   // NOTE(review): this is computed outside the arch guard, so on targets
   // below SM90 the local is never read -- may produce an unused-variable
   // warning; confirm.
   auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("mbarrier.init.shared::cta.b64 [%1], %0;" ::"r"(arrive_count),
                "r"(mbar_int_ptr));
+#endif
 }
 
 __device__ __forceinline__ void mbarrier_wait(uint64_t *mbar_ptr,
                                               uint32_t &phase) {
   auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile(
       "{\n\t"
       ".reg .pred       P1; \n\t"
@@ -466,19 +470,24 @@ __device__ __forceinline__ void mbarrier_wait(uint64_t *mbar_ptr,
       "r"(phase),
       "r"(0x989680));
   phase ^= 1;
+#endif
 }
 
 // Issues the PTX instruction `mbarrier.arrive.expect_tx.shared::cta.b64` on
 // the barrier at `mbar_ptr`, passing `num_bytes` as the transaction-count
 // operand (%0).
 //
 // mbar_ptr:  generic pointer to the barrier; converted with
 //            __cvta_generic_to_shared and narrowed to 32 bits, so it
 //            presumably must point into shared memory -- TODO confirm.
 // num_bytes: bound to the asm with an "r" (32-bit register) constraint.
 //
 // The instruction is only compiled in for device passes targeting
 // __CUDA_ARCH__ >= 900 (requires SM90+); otherwise nothing is emitted.
 __device__ __forceinline__ void mbarrier_arrive_and_expect_tx(
     uint64_t *mbar_ptr, int num_bytes) {
   // 32-bit shared-space address used as the [%1] operand below.
   auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   // The `_` destination is the PTX sink operand: the instruction's result
   // is not captured.
   asm volatile(
       "mbarrier.arrive.expect_tx.shared::cta.b64 _, [%1], %0; \n\t" ::"r"(
           num_bytes),
       "r"(mbar_int_ptr));
+#endif
 }
 
 // Issues the PTX instruction `fence.proxy.async.shared::cta` -- the same
 // instruction that fence_view_async_shared() emits.
 // Presumably intended to be called before a TMA store so prior shared-memory
 // writes are visible to the async proxy -- TODO confirm against callers.
 //
 // The instruction is only compiled in for device passes targeting
 // __CUDA_ARCH__ >= 900 (requires SM90+). For every other target the body is
 // empty, so the call silently becomes a no-op.
 __device__ __forceinline__ void tma_store_fence() {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("fence.proxy.async.shared::cta;");
+#endif
 }
 
 constexpr uint64_t kEvictFirst = 0x12f0000000000000;
@@ -492,6 +501,7 @@ __device__ __forceinline__ void tma_load_1d(const void *smem_ptr,
   auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
   auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
   const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile(
       "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::"
       "cache_hint [%0], [%1], %2, [%3], %4;\n" ::"r"(smem_int_ptr),
@@ -500,6 +510,7 @@ __device__ __forceinline__ void tma_load_1d(const void *smem_ptr,
       "r"(mbar_int_ptr),
       "l"(cache_hint)
       : "memory");
+#endif
 }
 
 __device__ __forceinline__ void tma_store_1d(const void *smem_ptr,
@@ -508,6 +519,7 @@ __device__ __forceinline__ void tma_store_1d(const void *smem_ptr,
                                              bool evict_first = true) {
   auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
   const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile(
       "cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%0], [%1], "
       "%2, %3;\n" ::"l"(gmem_ptr),
@@ -516,14 +528,15 @@ __device__ __forceinline__ void tma_store_1d(const void *smem_ptr,
       "l"(cache_hint)
       : "memory");
   asm volatile("cp.async.bulk.commit_group;");
+#endif
 }
 
 // Issues the PTX instruction `cp.async.bulk.wait_group.read` with the
 // template argument N (default 0) as its immediate operand ("n" constraint).
 // The asm statement declares a "memory" clobber, so the compiler will not
 // reorder memory accesses across it.
 // Presumably blocks until at most N committed bulk groups are still reading
 // their source -- see the PTX ISA; TODO confirm.
 //
 // The instruction is only compiled in for device passes targeting
 // __CUDA_ARCH__ >= 900 (requires SM90+); otherwise the body is empty.
 template <int N = 0>
 __device__ __forceinline__ void tma_store_wait() {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
   asm volatile("cp.async.bulk.wait_group.read %0;" ::"n"(N) : "memory");
-}
-
 #endif
+}
 
 template <typename dtype_t>
 __host__ __device__ constexpr dtype_t ceil_div(dtype_t a, dtype_t b) {
 
@@ -140,7 +140,7 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x,
   }
 
   // Forward API Call
-  auto api_result = paddle::experimental::multiply(x, y);
+  auto api_result = paddle::experimental::multiply(x, y, input_out);
   // Check NaN and Inf if needed
 
   if (FLAGS_check_nan_inf) {
 
@@ -707,13 +707,31 @@ static PyObject* tensor_method_copy_(TensorObject* self,
                                      PyObject* args,
                                      PyObject* kwargs) {
   EAGER_TRY
-  paddle::Tensor& src_tensor = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0);
+  PyObject* other_tensor = nullptr;
+  bool blocking = true;
+  bool non_blocking = false;
+  static char* kwlist[] = {const_cast<char*>("other"),
+                           const_cast<char*>("blocking"),
+                           const_cast<char*>("non_blocking"),
+                           nullptr};
+  bool flag = PyArg_ParseTupleAndKeywords(
+      args, kwargs, "|Obb", kwlist, &other_tensor, &blocking, &non_blocking);
+  blocking = !blocking || non_blocking ? false : true;
+  PADDLE_ENFORCE_EQ(flag,
+                    true,
+                    common::errors::PreconditionNotMet(
+                        "Could not parse args and kwargs successfully, "
+                        "please check your input first and make "
+                        "sure you are on the right way. "
+                        "The expected arguments as follow: ("
+                        "other, blocking, non_blocking)"));
+
+  paddle::Tensor& src_tensor = CastPyArg2Tensor(other_tensor, 0);
   const phi::distributed::ProcessMesh* mesh = nullptr;
   if (InputsContainDistTensor(&mesh, src_tensor, self->tensor)) {
     ConvertAllInputsToDistTensor(mesh, src_tensor, self->tensor);
   }
 
-  bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
   VLOG(6) << "Start Copy Tensor " << src_tensor.name() << " to "
           << self->tensor.name();
   if (!self->tensor.initialized()) {
@@ -742,7 +760,7 @@ static PyObject* tensor_method_copy_(TensorObject* self,
 
   VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to "
           << self->tensor.name();
-  RETURN_PY_NONE
+  return ToPyObject(self->tensor);
 
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
@@ -166,6 +166,7 @@ def new_init(self, *args, **kwargs):
     hub as hub,
     linalg as linalg,
     signal as signal,
+    special as special,
     tensor as tensor,
     utils as utils,
 )
@@ -885,6 +886,12 @@ def __dir__(self):
 e = math.e
 
 # API alias
+cat = concat
+concatenate = concat
+take_along_dim = take_along_axis
+clamp = clip
+ger = outer
+
 div = divide
 div_ = divide_
 
@@ -1001,6 +1008,7 @@ def __dir__(self):
     'less_',
     'kron',
     'clip',
+    'clamp',
     'Tensor',
     'FloatTensor',
     'DoubleTensor',
@@ -1145,6 +1153,7 @@ def __dir__(self):
     'erfinv',
     'inner',
     'outer',
+    'ger',
     'square',
     'square_',
     'divide',
@@ -1262,6 +1271,8 @@ def __dir__(self):
     'log10',
     'log10_',
     'concat',
+    'cat',
+    'concatenate',
     'check_shape',
     'trunc',
     'trunc_',
@@ -1293,6 +1304,7 @@ def __dir__(self):
     'renorm',
     'renorm_',
     'take_along_axis',
+    'take_along_dim',
     'scatter_reduce',
     'put_along_axis',
     'scatter_add',
 
@@ -33,6 +33,7 @@
     lu,
     lu_solve,
     lu_unpack,
+    matmul,
     matrix_exp,
     matrix_norm,
     matrix_power,
@@ -71,6 +72,7 @@
     'multi_dot',
     'matrix_rank',
     'matrix_transpose',
+    'matmul',
     'svd',
     'svdvals',
     'qr',
 
@@ -0,0 +1,19 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .tensor.math import logsumexp
+
+__all__ = [
+    "logsumexp",
+]
@@ -266,6 +266,8 @@ def matmul(
     transpose_x: bool = False,
     transpose_y: bool = False,
     name: str | None = None,
+    *,
+    out: Tensor | None = None,
 ) -> Tensor:
     """
     Applies matrix multiplication to two tensors. `matmul` follows
@@ -313,6 +315,7 @@ def matmul(
         transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default is False.
         transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default is False.
         name (str|None, optional): If set None, the layer will be named automatically. For more information, please refer to :ref:`api_guide_Name`. Default is None.
+        out (Tensor, optional): The output tensor. If set, the result will be stored in this tensor. Default is None.
 
     Returns:
         Tensor: The output Tensor.
@@ -360,7 +363,7 @@ def matmul(
 
     """
     if in_dynamic_or_pir_mode():
-        return _C_ops.matmul(x, y, transpose_x, transpose_y)
+        return _C_ops.matmul(x, y, transpose_x, transpose_y, out=out)
     else:
         attrs = {
             'trans_x': transpose_x,
 
@@ -1351,7 +1351,11 @@ def tolist(x: Tensor) -> NestedList[int | float | complex]:
 
 @ParamAliasDecorator({"x": ["tensors"], "axis": ["dim"]})
 def concat(
-    x: Sequence[Tensor], axis: int | Tensor = 0, name: str | None = None
+    x: Sequence[Tensor],
+    axis: int | Tensor = 0,
+    name: str | None = None,
+    *,
+    out: Tensor | None = None,
 ) -> Tensor:
     """
 
@@ -1380,6 +1384,7 @@ def concat(
             it works the same way as ``axis+R``. Default is 0.
             alias: ``dim``.
         name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+        out (Tensor|None, optional): The output Tensor. If set, the result will be stored in this Tensor. Default is None.
 
     Returns:
         Tensor, A Tensor with the same data type as ``x``.
@@ -1422,7 +1427,7 @@ def concat(
     if in_dynamic_mode():
         if isinstance(axis, Variable):
             axis = axis.item(0)
-        return _C_ops.concat(input, axis)
+        return _C_ops.concat(input, axis, out=out)
     elif in_pir_mode():
 
         def is_in_amp_mode():
@@ -4918,7 +4923,9 @@ def expand_as(x: Tensor, y: Tensor, name: str | None = None) -> Tensor:
 
 @ParamAliasDecorator({"x": ["input"], "shape": ["size"]})
 def broadcast_to(
-    x: Tensor, shape: ShapeLike, name: str | None = None
+    x: Tensor,
+    shape: ShapeLike,
+    name: str | None = None,
 ) -> Tensor:
     """
 
@@ -6944,7 +6951,12 @@ def scatter_add(
 
 @ParamAliasDecorator({"arr": ["input"], "axis": ["dim"]})
 def take_along_axis(
-    arr: Tensor, indices: Tensor, axis: int, broadcast: bool = True
+    arr: Tensor,
+    indices: Tensor,
+    axis: int,
+    broadcast: bool = True,
+    *,
+    out: Tensor | None = None,
 ) -> Tensor:
     """
     Take values from the input array by given indices matrix along the designated axis.
@@ -6962,9 +6974,10 @@ def take_along_axis(
         axis (int) : The axis to take 1d slices along.
             alias: ``dim``.
         broadcast (bool, optional): whether the indices broadcast.
+        out (Tensor, optional): The output Tensor. If set, the output will be written to this Tensor.
 
     Returns:
-        Tensor, The indexed element, same dtype with arr
+        Tensor, The indexed element, same dtype with arr.
 
     Examples:
         .. code-block:: python
@@ -7011,7 +7024,7 @@ def take_along_axis(
                 )
 
     if in_dynamic_or_pir_mode():
-        return _C_ops.take_along_axis(arr, indices, axis)
+        return _C_ops.take_along_axis(arr, indices, axis, out=out)
     else:
         check_variable_and_dtype(
             arr,
Original file line number	Diff line number	Diff line change
`@@ -140,7 +140,7 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x,`
`140`	`140`	`}`
`141`	`141`
`142`	`142`	`// Forward API Call`
`143`		`- auto api_result = paddle::experimental::multiply(x, y);`
	`143`	`+ auto api_result = paddle::experimental::multiply(x, y, input_out);`
`144`	`144`	`// Check NaN and Inf if needed`
`145`	`145`
`146`	`146`	`if (FLAGS_check_nan_inf) {`