
Commit dfe9e6b

Merge branch 'develop' into h58

2 parents: a7cad7b + 8c44b3e


52 files changed (+633 / -397 lines)

paddle/cinn/hlir/op/contrib/gather_nd.cc

Lines changed: 6 additions & 25 deletions

@@ -42,23 +42,6 @@ namespace op {
 using cinn::common::CINNValue;
 using cinn::common::CINNValuePack;

-Expr CastShapeElementType(Expr shape_element) {
-  // constant values should already been cast to int64_t for shape calc
-  // make sure shape expr does not contain non-matched int type for min and max
-  if (shape_element.is_constant()) return shape_element;
-  if (auto max_elem = shape_element.As<ir::Max>()) {
-    max_elem->a() = CastShapeElementType(max_elem->a());
-    max_elem->b() = CastShapeElementType(max_elem->b());
-  } else if (auto min_elem = shape_element.As<ir::Min>()) {
-    min_elem->a() = CastShapeElementType(min_elem->a());
-    min_elem->b() = CastShapeElementType(min_elem->b());
-  } else {
-    shape_element = ir::Call::Make(
-        ir::Int(64), "int64_t", {shape_element}, {}, ir::CallType::Intrinsic);
-  }
-  return shape_element;
-}
-
 ir::Tensor GatherNdSymbolic(const ir::Tensor &x,
                             const ir::Tensor &index,
                             const std::string &name) {
@@ -87,14 +70,12 @@ ir::Tensor GatherNdSymbolic(const ir::Tensor &x,
       indices_position[indices_position_size - 1] =
           ir::Cast::Make(cinn::common::Int(64), Expr(i));
       // support negative indices
-      auto idx_expr =
-          ir::Cast::Make(cinn::common::Int(64), index(indices_position));
-      auto real_expr =
-          ir::Select::Make(ir::GE::Make(idx_expr, Expr(0)),
-                           idx_expr,
-                           CastShapeElementType(x_shape[i]) + idx_expr);
-      real_indices.push_back(
-          ir::Cast::Make(cinn::common::Int(64), real_expr));
+      auto idx_expr = index(indices_position);
+      auto real_expr = ir::Select::Make(
+          ir::GE::Make(idx_expr, Expr(0)),
+          idx_expr,
+          ir::Cast::Make(idx_expr.type(), x_shape[i]) + idx_expr);
+      real_indices.push_back(real_expr);
     }
     if (real_indices.size() == x_shape_size) {
       return x(real_indices);
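
The removed CastShapeElementType helper forced every shape sub-expression to int64_t before adding it to a negative index; the new code instead casts the dimension extent to the index expression's own type. The normalization rule itself is unchanged. A minimal standalone sketch of that rule, on plain integers rather than CINN ir::Expr nodes (names are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative helper: map a possibly negative index into [0, dim_size),
// mirroring the Select(idx >= 0, idx, dim_size + idx) expression built in
// GatherNdSymbolic above.
int64_t NormalizeIndex(int64_t idx, int64_t dim_size) {
  int64_t real = idx >= 0 ? idx : dim_size + idx;
  assert(real >= 0 && real < dim_size);  // anything else is an invalid index
  return real;
}
// NormalizeIndex(-1, 5) == 4; NormalizeIndex(2, 5) == 2.
```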

paddle/fluid/pybind/compiled_program.cc

Lines changed: 2 additions & 2 deletions

@@ -387,7 +387,7 @@ void BindCompiledProgram(pybind11::module &m) {  // NOLINT
           self.fuse_gemm_epilogue_ = b;
         },
         R"DOC((bool, optional): fuse_gemm_epilogue indicate whether
-                to fuse matmul_op, elemenewist_add_op and activation_op,
+                to fuse matmul_op, elementwise_add_op and activation_op,
                 it may make the execution faster. Default is False.

                 Examples:
@@ -410,7 +410,7 @@ void BindCompiledProgram(pybind11::module &m) {  // NOLINT
           PADDLE_ENFORCE_NE(self.IsFinalized(),
                             true,
                             common::errors::PreconditionNotMet(
-                                "BuildStrategy has been finlaized, cannot be "
+                                "BuildStrategy has been finalized, cannot be "
                                 "configured again."));
           self.fuse_dot_product_attention_ = b;
         },

paddle/phi/api/include/tensor.h

Lines changed: 1 addition & 1 deletion

@@ -672,7 +672,7 @@ class PADDLE_API Tensor final {
    *
    * @return Tensor
    */
-  Tensor contiguous();
+  Tensor contiguous() const;

 private:
  /**

paddle/phi/api/lib/tensor.cc

Lines changed: 1 addition & 1 deletion

@@ -549,7 +549,7 @@ bool Tensor::is_contiguous() const {
   }
 }

-Tensor Tensor::contiguous() {
+Tensor Tensor::contiguous() const {
   if (is_dense_tensor() || is_dist_tensor()) {
     phi::DenseTensor *dense_tensor = nullptr;
     if (is_dist_tensor()) {
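
Const-qualifying contiguous() (here and in the header above) lets it be called through const references, which is how tensors are usually passed in the C++ API. A small sketch of the call pattern this enables; the ToContiguous helper is hypothetical, and only Tensor::is_contiguous() and Tensor::contiguous() are taken from this diff (assuming the usual paddle::Tensor class from this header):

```cpp
#include "paddle/phi/api/include/tensor.h"

// Hypothetical utility: with a non-const contiguous(), this would not
// compile, because `t` is bound by const reference.
paddle::Tensor ToContiguous(const paddle::Tensor &t) {
  if (t.is_contiguous()) {
    return t;             // already contiguous; nothing to do
  }
  return t.contiguous();  // legal now that the method is const-qualified
}
```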

paddle/phi/api/profiler/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -26,4 +26,4 @@ if(WITH_PYTHON AND EXISTS ${PADDLE_BINARY_DIR})
   endif()
 endif()

-collect_srcs(api_srcs SRCS device_tracer.cc profiler.cc)
+collect_srcs(api_srcs SRCS device_tracer.cc event.cc profiler.cc)

paddle/phi/api/profiler/event.cc

Lines changed: 81 additions & 0 deletions

@@ -0,0 +1,81 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/api/profiler/event.h"
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "glog/logging.h"
+#endif
+
+namespace phi {
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+CudaEvent::CudaEvent() {
+#ifdef PADDLE_WITH_HIP
+  hipEventCreateWithFlags(&event_, flags_);
+#else
+  cudaEventCreateWithFlags(&event_, flags_);
+#endif
+  VLOG(4) << "CudaEvent " << event_;
+}
+
+CudaEvent::CudaEvent(unsigned int flags) : flags_(flags) {
+#ifdef PADDLE_WITH_HIP
+  hipEventCreateWithFlags(&event_, flags_);
+#else
+  cudaEventCreateWithFlags(&event_, flags_);
+#endif
+  VLOG(4) << "CudaEvent " << event_;
+}
+
+bool CudaEvent::Query() {
+#ifdef PADDLE_WITH_HIP
+  gpuError_t err = hipEventQuery(event_);
+  if (err == hipSuccess) {
+    return true;
+  }
+  if (err == hipErrorNotReady) {
+    return false;
+  }
+#else
+  gpuError_t err = cudaEventQuery(event_);
+  if (err == cudaSuccess) {
+    return true;
+  }
+  if (err == cudaErrorNotReady) {
+    return false;
+  }
+#endif
+  PADDLE_ENFORCE_GPU_SUCCESS(err);
+  return false;
+}
+
+float CudaEvent::ElapsedTime(CudaEvent *end_event) {
+  float milliseconds = 0;
+#ifdef PADDLE_WITH_HIP
+  hipEventSynchronize(end_event->GetRawCudaEvent());
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      hipEventElapsedTime(&milliseconds, event_, end_event->GetRawCudaEvent()));
+#else
+  cudaEventSynchronize(end_event->GetRawCudaEvent());
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime(
+      &milliseconds, event_, end_event->GetRawCudaEvent()));
+#endif
+  return milliseconds;
+}
+
+#endif
+
+}  // namespace phi
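
With these definitions moved into the .cc file, event.h (below) only declares CudaEvent's constructors, Query(), and ElapsedTime(). A rough usage sketch of the timing pair; it assumes GetRawCudaEvent() is publicly accessible and that the caller records the events with the plain CUDA runtime API, since recording is not part of this diff:

```cpp
#include <cuda_runtime.h>

#include "paddle/phi/api/profiler/event.h"

// Sketch only (CUDA build): time work submitted to `stream`, in milliseconds.
float TimeRegionMs(cudaStream_t stream) {
  phi::CudaEvent start, stop;
  cudaEventRecord(start.GetRawCudaEvent(), stream);
  // ... enqueue the kernels to be measured on `stream` here ...
  cudaEventRecord(stop.GetRawCudaEvent(), stream);
  // ElapsedTime synchronizes on the end event before reading the timers.
  return start.ElapsedTime(&stop);
}
```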

paddle/phi/api/profiler/event.h

Lines changed: 4 additions & 50 deletions

@@ -140,23 +140,9 @@ class CudaEvent {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

  public:
-  CudaEvent() {
-#ifdef PADDLE_WITH_HIP
-    hipEventCreateWithFlags(&event_, flags_);
-#else
-    cudaEventCreateWithFlags(&event_, flags_);
-#endif
-    VLOG(4) << "CudaEvent " << event_;
-  }
+  CudaEvent();

-  explicit CudaEvent(unsigned int flags) : flags_(flags) {
-#ifdef PADDLE_WITH_HIP
-    hipEventCreateWithFlags(&event_, flags_);
-#else
-    cudaEventCreateWithFlags(&event_, flags_);
-#endif
-    VLOG(4) << "CudaEvent " << event_;
-  }
+  explicit CudaEvent(unsigned int flags);

   ~CudaEvent() {
 #ifdef PADDLE_WITH_HIP
@@ -174,41 +160,9 @@ class CudaEvent {
 #endif
   }

-  bool Query() {
-#ifdef PADDLE_WITH_HIP
-    gpuError_t err = hipEventQuery(event_);
-    if (err == hipSuccess) {
-      return true;
-    }
-    if (err == hipErrorNotReady) {
-      return false;
-    }
-#else
-    gpuError_t err = cudaEventQuery(event_);
-    if (err == cudaSuccess) {
-      return true;
-    }
-    if (err == cudaErrorNotReady) {
-      return false;
-    }
-#endif
-    PADDLE_ENFORCE_GPU_SUCCESS(err);
-    return false;
-  }
+  bool Query();

-  float ElapsedTime(CudaEvent *end_event) {
-    float milliseconds = 0;
-#ifdef PADDLE_WITH_HIP
-    hipEventSynchronize(end_event->GetRawCudaEvent());
-    PADDLE_ENFORCE_GPU_SUCCESS(hipEventElapsedTime(
-        &milliseconds, event_, end_event->GetRawCudaEvent()));
-#else
-    cudaEventSynchronize(end_event->GetRawCudaEvent());
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime(
-        &milliseconds, event_, end_event->GetRawCudaEvent()));
-#endif
-    return milliseconds;
-  }
+  float ElapsedTime(CudaEvent *end_event);

   void Synchronize() {
 #ifdef PADDLE_WITH_HIP

paddle/phi/backends/device_ext.h

Lines changed: 4 additions & 4 deletions

@@ -395,7 +395,7 @@ struct C_DeviceInterface {
       size_t size);

   /**
-   * @brief Asynchonrize memory copy from host to device
+   * @brief Asynchronize memory copy from host to device
    *
    * @param[C_Device] device Core fill it with a physical id
    * @param[C_Stream] stream
@@ -410,7 +410,7 @@ struct C_DeviceInterface {
       size_t size);

   /**
-   * @brief Asynchonrize memory copy from device to host
+   * @brief Asynchronize memory copy from device to host
    *
    * @param[C_Device] device Core fill it with a physical id
    * @param[C_Stream] stream
@@ -425,7 +425,7 @@ struct C_DeviceInterface {
       size_t size);

   /**
-   * @brief Asynchonrize memory copy from device to device
+   * @brief Asynchronize memory copy from device to device
    *
    * @param[C_Device] device Core fill it with a physical id
    * @param[C_Stream] stream
@@ -440,7 +440,7 @@ struct C_DeviceInterface {
       size_t size);

   /**
-   * @brief Peer asynchonrize memory copy from host to device
+   * @brief Peer asynchronize memory copy from host to device
    *
    * @param[C_Device] device Core fill it with a physical id
    * @param[C_Stream] stream

paddle/phi/backends/onednn/axpy_handler.h

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ class OneDNNAXPYHandler {
   // Private implementation idiom to hide dependency on oneDNN headers.
   class Impl;
   // We need custom deleter, since the compiler is unable to parameterize
-  // an allocator's default deleter due to incomple type.
+  // an allocator's default deleter due to incomplete type.
   std::unique_ptr<Impl, void (*)(Impl*)> pimpl_;
 };
 }  // namespace funcs
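
The comment fixed here describes the standard pimpl caveat: std::unique_ptr's default deleter needs Impl to be a complete type wherever the pointer is destroyed, so a header that only forward-declares Impl supplies a custom function-pointer deleter instead. A generic sketch of the pattern, with illustrative names rather than Paddle's:

```cpp
#include <memory>

// Header side: Impl is only forward-declared, so the default deleter of
// std::unique_ptr<Impl> could not be instantiated here; a function-pointer
// deleter pushes the actual delete into the source file.
class Widget {
 public:
  Widget();

 private:
  class Impl;  // incomplete type in the header
  std::unique_ptr<Impl, void (*)(Impl *)> pimpl_;
};

// Source side: Impl is complete, so deleting it is well-defined.
class Widget::Impl { /* library-specific state lives here */ };

Widget::Widget()
    : pimpl_(new Impl, [](Impl *p) { delete p; }) {}
```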

paddle/phi/backends/onednn/matmul_utils.h

Lines changed: 6 additions & 6 deletions

@@ -59,7 +59,7 @@ class MatmulOneDNNHandler : public OneDNNHandlerNoCachingT<XT, dnnl::matmul> {
     std::vector<int64_t> x_strides(x_dims.size() - 3, 1);
     std::vector<int64_t> y_strides(x_dims.size() - 3, 1);
     std::vector<int64_t> out_strides(x_dims.size() - 3, 1);
-    std::vector<int64_t> out_ddims(x_dims.size() - 3, 1);
+    std::vector<int64_t> out_dims(x_dims.size() - 3, 1);

     x_strides.reserve(x_dims.size());
     y_strides.reserve(x_dims.size());
@@ -78,20 +78,20 @@ class MatmulOneDNNHandler : public OneDNNHandlerNoCachingT<XT, dnnl::matmul> {
     }

     out_strides.insert(out_strides.end(), {M * N, N, 1});
-    out_ddims.insert(out_ddims.end(),
-                     {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N});
+    out_dims.insert(out_dims.end(),
+                    {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N});

     for (int i = x_dims.size() - 4; i >= 0; --i) {
-      out_ddims[i] = std::max(x_dims[i], y_dims[i]);
+      out_dims[i] = std::max(x_dims[i], y_dims[i]);
       x_strides[i] = x_dims[i + 1] * x_strides[i + 1];
       y_strides[i] = y_dims[i + 1] * y_strides[i + 1];

-      out_strides[i] = out_ddims[i + 1] * out_strides[i + 1];
+      out_strides[i] = out_dims[i + 1] * out_strides[i + 1];
     }

     auto x_md = memory::desc(x_dims, OneDNNGetDataType<XT>(), x_strides);
     auto y_md = memory::desc(y_dims, OneDNNGetDataType<YT>(), y_strides);
-    auto out_md = memory::desc(out_ddims, OneDNNGetDataType<OT>(), out_strides);
+    auto out_md = memory::desc(out_dims, OneDNNGetDataType<OT>(), out_strides);

     this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md);
   }
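
This hunk only renames out_ddims to out_dims; the computation is unchanged. The loop fills in dense row-major strides, where each stride is the product of all trailing dimension extents. A standalone sketch of that rule, separate from the oneDNN handler:

```cpp
#include <cstdint>
#include <vector>

// Dense row-major strides: the innermost stride is 1 and
// stride[i] = dims[i + 1] * stride[i + 1] for the outer dimensions.
std::vector<int64_t> RowMajorStrides(const std::vector<int64_t> &dims) {
  std::vector<int64_t> strides(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }
  return strides;
}
// e.g. dims {B, M, N} -> strides {M * N, N, 1}, matching out_strides above.
```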
