Skip to content

Commit 98859bb

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into Tensor
2 parents b8611a2 + c3af6f2 commit 98859bb

File tree

19 files changed

+599
-52
lines changed

19 files changed

+599
-52
lines changed

.github/actions/check-bypass/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ runs:
1818
- id: check-bypass
1919
name: Check Bypass
2020
env:
21-
CI_TEAM_MEMBERS: '["SigureMo", "risemeup1", "tianshuo78520a", "0x3878f", "swgu98", "luotao1", "XieYunshen"]'
21+
CI_TEAM_MEMBERS: '["tianshuo78520a", "swgu98", "risemeup1", "XieYunshen"]'
2222
uses: PFCCLab/ci-bypass@v1
2323
with:
2424
github-token: ${{ inputs.github-token }}

.github/workflows/check-bypass.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
permissions:
2121
contents: read
2222
env:
23-
CI_TEAM_MEMBERS: '["SigureMo", "risemeup1", "tianshuo78520a", "0x3878f", "swgu98", "luotao1", "XieYunshen", "mmglove", "fightfat"]'
23+
CI_TEAM_MEMBERS: '["tianshuo78520a", "swgu98", "risemeup1" , "XieYunshen"]'
2424
outputs:
2525
can-skip: ${{ steps.check-bypass.outputs.can-skip }}
2626
steps:

paddle/fluid/distributed/collective/deep_ep/kernels/utils.cuh

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -418,9 +418,6 @@ __device__ __forceinline__ float exp2f_approx(const float &x) {
418418
return ret;
419419
}
420420

421-
// TMA PTX instructions
422-
#ifndef DISABLE_SM90_FEATURES
423-
424421
__device__ __forceinline__ uint32_t elect_one_sync(int lane_id) {
425422
uint32_t pred = 0;
426423
asm volatile(
@@ -437,23 +434,30 @@ __device__ __forceinline__ uint32_t elect_one_sync(int lane_id) {
437434
}
438435

439436
__device__ __forceinline__ void fence_view_async_shared() {
437+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
440438
asm volatile("fence.proxy.async.shared::cta; \n" ::);
439+
#endif
441440
}
442441

443442
__device__ __forceinline__ void fence_barrier_init() {
443+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
444444
asm volatile("fence.mbarrier_init.release.cluster; \n" ::);
445+
#endif
445446
}
446447

447448
__device__ __forceinline__ void mbarrier_init(uint64_t *mbar_ptr,
448449
uint32_t arrive_count) {
449450
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
451+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
450452
asm volatile("mbarrier.init.shared::cta.b64 [%1], %0;" ::"r"(arrive_count),
451453
"r"(mbar_int_ptr));
454+
#endif
452455
}
453456

454457
__device__ __forceinline__ void mbarrier_wait(uint64_t *mbar_ptr,
455458
uint32_t &phase) {
456459
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
460+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
457461
asm volatile(
458462
"{\n\t"
459463
".reg .pred P1; \n\t"
@@ -466,19 +470,24 @@ __device__ __forceinline__ void mbarrier_wait(uint64_t *mbar_ptr,
466470
"r"(phase),
467471
"r"(0x989680));
468472
phase ^= 1;
473+
#endif
469474
}
470475

471476
__device__ __forceinline__ void mbarrier_arrive_and_expect_tx(
472477
uint64_t *mbar_ptr, int num_bytes) {
473478
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
479+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
474480
asm volatile(
475481
"mbarrier.arrive.expect_tx.shared::cta.b64 _, [%1], %0; \n\t" ::"r"(
476482
num_bytes),
477483
"r"(mbar_int_ptr));
484+
#endif
478485
}
479486

480487
__device__ __forceinline__ void tma_store_fence() {
488+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
481489
asm volatile("fence.proxy.async.shared::cta;");
490+
#endif
482491
}
483492

484493
constexpr uint64_t kEvictFirst = 0x12f0000000000000;
@@ -492,6 +501,7 @@ __device__ __forceinline__ void tma_load_1d(const void *smem_ptr,
492501
auto mbar_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(mbar_ptr));
493502
auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
494503
const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
504+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
495505
asm volatile(
496506
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::"
497507
"cache_hint [%0], [%1], %2, [%3], %4;\n" ::"r"(smem_int_ptr),
@@ -500,6 +510,7 @@ __device__ __forceinline__ void tma_load_1d(const void *smem_ptr,
500510
"r"(mbar_int_ptr),
501511
"l"(cache_hint)
502512
: "memory");
513+
#endif
503514
}
504515

505516
__device__ __forceinline__ void tma_store_1d(const void *smem_ptr,
@@ -508,6 +519,7 @@ __device__ __forceinline__ void tma_store_1d(const void *smem_ptr,
508519
bool evict_first = true) {
509520
auto smem_int_ptr = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
510521
const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal;
522+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
511523
asm volatile(
512524
"cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%0], [%1], "
513525
"%2, %3;\n" ::"l"(gmem_ptr),
@@ -516,14 +528,15 @@ __device__ __forceinline__ void tma_store_1d(const void *smem_ptr,
516528
"l"(cache_hint)
517529
: "memory");
518530
asm volatile("cp.async.bulk.commit_group;");
531+
#endif
519532
}
520533

521534
template <int N = 0>
522535
__device__ __forceinline__ void tma_store_wait() {
536+
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
523537
asm volatile("cp.async.bulk.wait_group.read %0;" ::"n"(N) : "memory");
524-
}
525-
526538
#endif
539+
}
527540

528541
template <typename dtype_t>
529542
__host__ __device__ constexpr dtype_t ceil_div(dtype_t a, dtype_t b) {

paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x,
140140
}
141141

142142
// Forward API Call
143-
auto api_result = paddle::experimental::multiply(x, y);
143+
auto api_result = paddle::experimental::multiply(x, y, input_out);
144144
// Check NaN and Inf if needed
145145

146146
if (FLAGS_check_nan_inf) {

paddle/fluid/pybind/eager_method.cc

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -707,13 +707,31 @@ static PyObject* tensor_method_copy_(TensorObject* self,
707707
PyObject* args,
708708
PyObject* kwargs) {
709709
EAGER_TRY
710-
paddle::Tensor& src_tensor = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0);
710+
PyObject* other_tensor = nullptr;
711+
bool blocking = true;
712+
bool non_blocking = false;
713+
static char* kwlist[] = {const_cast<char*>("other"),
714+
const_cast<char*>("blocking"),
715+
const_cast<char*>("non_blocking"),
716+
nullptr};
717+
bool flag = PyArg_ParseTupleAndKeywords(
718+
args, kwargs, "|Obb", kwlist, &other_tensor, &blocking, &non_blocking);
719+
blocking = !blocking || non_blocking ? false : true;
720+
PADDLE_ENFORCE_EQ(flag,
721+
true,
722+
common::errors::PreconditionNotMet(
723+
"Could not parse args and kwargs successfully, "
724+
"please check your input first and make "
725+
"sure you are on the right way. "
726+
"The expected arguments as follow: ("
727+
"other, blocking, non_blocking)"));
728+
729+
paddle::Tensor& src_tensor = CastPyArg2Tensor(other_tensor, 0);
711730
const phi::distributed::ProcessMesh* mesh = nullptr;
712731
if (InputsContainDistTensor(&mesh, src_tensor, self->tensor)) {
713732
ConvertAllInputsToDistTensor(mesh, src_tensor, self->tensor);
714733
}
715734

716-
bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
717735
VLOG(6) << "Start Copy Tensor " << src_tensor.name() << " to "
718736
<< self->tensor.name();
719737
if (!self->tensor.initialized()) {
@@ -742,7 +760,7 @@ static PyObject* tensor_method_copy_(TensorObject* self,
742760

743761
VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to "
744762
<< self->tensor.name();
745-
RETURN_PY_NONE
763+
return ToPyObject(self->tensor);
746764

747765
EAGER_CATCH_AND_THROW_RETURN_NULL
748766
}

python/paddle/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ def new_init(self, *args, **kwargs):
166166
hub as hub,
167167
linalg as linalg,
168168
signal as signal,
169+
special as special,
169170
tensor as tensor,
170171
utils as utils,
171172
)
@@ -885,6 +886,12 @@ def __dir__(self):
885886
e = math.e
886887

887888
# API alias
889+
cat = concat
890+
concatenate = concat
891+
take_along_dim = take_along_axis
892+
clamp = clip
893+
ger = outer
894+
888895
div = divide
889896
div_ = divide_
890897

@@ -1001,6 +1008,7 @@ def __dir__(self):
10011008
'less_',
10021009
'kron',
10031010
'clip',
1011+
'clamp',
10041012
'Tensor',
10051013
'FloatTensor',
10061014
'DoubleTensor',
@@ -1145,6 +1153,7 @@ def __dir__(self):
11451153
'erfinv',
11461154
'inner',
11471155
'outer',
1156+
'ger',
11481157
'square',
11491158
'square_',
11501159
'divide',
@@ -1262,6 +1271,8 @@ def __dir__(self):
12621271
'log10',
12631272
'log10_',
12641273
'concat',
1274+
'cat',
1275+
'concatenate',
12651276
'check_shape',
12661277
'trunc',
12671278
'trunc_',
@@ -1293,6 +1304,7 @@ def __dir__(self):
12931304
'renorm',
12941305
'renorm_',
12951306
'take_along_axis',
1307+
'take_along_dim',
12961308
'scatter_reduce',
12971309
'put_along_axis',
12981310
'scatter_add',

python/paddle/linalg.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
lu,
3434
lu_solve,
3535
lu_unpack,
36+
matmul,
3637
matrix_exp,
3738
matrix_norm,
3839
matrix_power,
@@ -71,6 +72,7 @@
7172
'multi_dot',
7273
'matrix_rank',
7374
'matrix_transpose',
75+
'matmul',
7476
'svd',
7577
'svdvals',
7678
'qr',

python/paddle/special.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from .tensor.math import logsumexp
16+
17+
__all__ = [
18+
"logsumexp",
19+
]

python/paddle/tensor/linalg.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,8 @@ def matmul(
266266
transpose_x: bool = False,
267267
transpose_y: bool = False,
268268
name: str | None = None,
269+
*,
270+
out: Tensor | None = None,
269271
) -> Tensor:
270272
"""
271273
Applies matrix multiplication to two tensors. `matmul` follows
@@ -313,6 +315,7 @@ def matmul(
313315
transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default is False.
314316
transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default is False.
315317
name (str|None, optional): If set None, the layer will be named automatically. For more information, please refer to :ref:`api_guide_Name`. Default is None.
318+
out (Tensor, optional): The output tensor. If set, the result will be stored in this tensor. Default is None.
316319
317320
Returns:
318321
Tensor: The output Tensor.
@@ -360,7 +363,7 @@ def matmul(
360363
361364
"""
362365
if in_dynamic_or_pir_mode():
363-
return _C_ops.matmul(x, y, transpose_x, transpose_y)
366+
return _C_ops.matmul(x, y, transpose_x, transpose_y, out=out)
364367
else:
365368
attrs = {
366369
'trans_x': transpose_x,

python/paddle/tensor/manipulation.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1351,7 +1351,11 @@ def tolist(x: Tensor) -> NestedList[int | float | complex]:
13511351

13521352
@ParamAliasDecorator({"x": ["tensors"], "axis": ["dim"]})
13531353
def concat(
1354-
x: Sequence[Tensor], axis: int | Tensor = 0, name: str | None = None
1354+
x: Sequence[Tensor],
1355+
axis: int | Tensor = 0,
1356+
name: str | None = None,
1357+
*,
1358+
out: Tensor | None = None,
13551359
) -> Tensor:
13561360
"""
13571361
@@ -1380,6 +1384,7 @@ def concat(
13801384
it works the same way as ``axis+R``. Default is 0.
13811385
alias: ``dim``.
13821386
name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
1387+
out (Tensor|None, optional): The output Tensor. If set, the result will be stored in this Tensor. Default is None.
13831388
13841389
Returns:
13851390
Tensor, A Tensor with the same data type as ``x``.
@@ -1422,7 +1427,7 @@ def concat(
14221427
if in_dynamic_mode():
14231428
if isinstance(axis, Variable):
14241429
axis = axis.item(0)
1425-
return _C_ops.concat(input, axis)
1430+
return _C_ops.concat(input, axis, out=out)
14261431
elif in_pir_mode():
14271432

14281433
def is_in_amp_mode():
@@ -4918,7 +4923,9 @@ def expand_as(x: Tensor, y: Tensor, name: str | None = None) -> Tensor:
49184923

49194924
@ParamAliasDecorator({"x": ["input"], "shape": ["size"]})
49204925
def broadcast_to(
4921-
x: Tensor, shape: ShapeLike, name: str | None = None
4926+
x: Tensor,
4927+
shape: ShapeLike,
4928+
name: str | None = None,
49224929
) -> Tensor:
49234930
"""
49244931
@@ -6944,7 +6951,12 @@ def scatter_add(
69446951

69456952
@ParamAliasDecorator({"arr": ["input"], "axis": ["dim"]})
69466953
def take_along_axis(
6947-
arr: Tensor, indices: Tensor, axis: int, broadcast: bool = True
6954+
arr: Tensor,
6955+
indices: Tensor,
6956+
axis: int,
6957+
broadcast: bool = True,
6958+
*,
6959+
out: Tensor | None = None,
69486960
) -> Tensor:
69496961
"""
69506962
Take values from the input array by given indices matrix along the designated axis.
@@ -6962,9 +6974,10 @@ def take_along_axis(
69626974
axis (int) : The axis to take 1d slices along.
69636975
alias: ``dim``.
69646976
broadcast (bool, optional): Whether to broadcast the indices. Default is True.
6977+
out (Tensor|None, optional): The output Tensor. If set, the output will be written to this Tensor. Default is None.
69656978
69666979
Returns:
6967-
Tensor, The indexed element, same dtype with arr
6980+
Tensor, The indexed element, same dtype with arr.
69686981
69696982
Examples:
69706983
.. code-block:: python
@@ -7011,7 +7024,7 @@ def take_along_axis(
70117024
)
70127025

70137026
if in_dynamic_or_pir_mode():
7014-
return _C_ops.take_along_axis(arr, indices, axis)
7027+
return _C_ops.take_along_axis(arr, indices, axis, out=out)
70157028
else:
70167029
check_variable_and_dtype(
70177030
arr,

0 commit comments

Comments
 (0)