
Commit dfe9e6b

Merge branch 'develop' into h58

2 parents: a7cad7b + 8c44b3e


52 files changed (+633 / -397 lines)

paddle/cinn/hlir/op/contrib/gather_nd.cc

Lines changed: 6 additions & 25 deletions

@@ -42,23 +42,6 @@ namespace op {
 using cinn::common::CINNValue;
 using cinn::common::CINNValuePack;

-Expr CastShapeElementType(Expr shape_element) {
-  // constant values should already been cast to int64_t for shape calc
-  // make sure shape expr does not contain non-matched int type for min and max
-  if (shape_element.is_constant()) return shape_element;
-  if (auto max_elem = shape_element.As<ir::Max>()) {
-    max_elem->a() = CastShapeElementType(max_elem->a());
-    max_elem->b() = CastShapeElementType(max_elem->b());
-  } else if (auto min_elem = shape_element.As<ir::Min>()) {
-    min_elem->a() = CastShapeElementType(min_elem->a());
-    min_elem->b() = CastShapeElementType(min_elem->b());
-  } else {
-    shape_element = ir::Call::Make(
-        ir::Int(64), "int64_t", {shape_element}, {}, ir::CallType::Intrinsic);
-  }
-  return shape_element;
-}
-
 ir::Tensor GatherNdSymbolic(const ir::Tensor &x,
                             const ir::Tensor &index,
                             const std::string &name) {
@@ -87,14 +70,12 @@ ir::Tensor GatherNdSymbolic(const ir::Tensor &x,
       indices_position[indices_position_size - 1] =
           ir::Cast::Make(cinn::common::Int(64), Expr(i));
       // support negative indices
-      auto idx_expr =
-          ir::Cast::Make(cinn::common::Int(64), index(indices_position));
-      auto real_expr =
-          ir::Select::Make(ir::GE::Make(idx_expr, Expr(0)),
-                           idx_expr,
-                           CastShapeElementType(x_shape[i]) + idx_expr);
-      real_indices.push_back(
-          ir::Cast::Make(cinn::common::Int(64), real_expr));
+      auto idx_expr = index(indices_position);
+      auto real_expr = ir::Select::Make(
+          ir::GE::Make(idx_expr, Expr(0)),
+          idx_expr,
+          ir::Cast::Make(idx_expr.type(), x_shape[i]) + idx_expr);
+      real_indices.push_back(real_expr);
     }
     if (real_indices.size() == x_shape_size) {
       return x(real_indices);
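
The removed CastShapeElementType helper forced every shape sub-expression to int64_t before adding it to a negative index; the new code instead casts the dimension extent to the index expression's own type. The normalization rule itself is unchanged. A minimal standalone sketch of that rule, on plain integers rather than CINN ir::Expr nodes (names are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative helper: map a possibly negative index into [0, dim_size),
// mirroring the Select(idx >= 0, idx, dim_size + idx) expression built in
// GatherNdSymbolic above.
int64_t NormalizeIndex(int64_t idx, int64_t dim_size) {
  int64_t real = idx >= 0 ? idx : dim_size + idx;
  assert(real >= 0 && real < dim_size);  // anything else is an invalid index
  return real;
}
// NormalizeIndex(-1, 5) == 4; NormalizeIndex(2, 5) == 2.
```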

paddle/fluid/pybind/compiled_program.cc

Lines changed: 2 additions & 2 deletions

@@ -387,7 +387,7 @@ void BindCompiledProgram(pybind11::module &m) {  // NOLINT
           self.fuse_gemm_epilogue_ = b;
         },
         R"DOC((bool, optional): fuse_gemm_epilogue indicate whether
-                to fuse matmul_op, elemenewist_add_op and activation_op,
+                to fuse matmul_op, elementwise_add_op and activation_op,
                 it may make the execution faster. Default is False.

                 Examples:
@@ -410,7 +410,7 @@ void BindCompiledProgram(pybind11::module &m) {  // NOLINT
           PADDLE_ENFORCE_NE(self.IsFinalized(),
                             true,
                             common::errors::PreconditionNotMet(
-                                "BuildStrategy has been finlaized, cannot be "
+                                "BuildStrategy has been finalized, cannot be "
                                 "configured again."));
           self.fuse_dot_product_attention_ = b;
         },

paddle/phi/api/include/tensor.h

Lines changed: 1 addition & 1 deletion

@@ -672,7 +672,7 @@ class PADDLE_API Tensor final {
    *
    * @return Tensor
    */
-  Tensor contiguous();
+  Tensor contiguous() const;

 private:
  /**

paddle/phi/api/lib/tensor.cc

Lines changed: 1 addition & 1 deletion

@@ -549,7 +549,7 @@ bool Tensor::is_contiguous() const {
   }
 }

-Tensor Tensor::contiguous() {
+Tensor Tensor::contiguous() const {
   if (is_dense_tensor() || is_dist_tensor()) {
     phi::DenseTensor *dense_tensor = nullptr;
     if (is_dist_tensor()) {
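
Const-qualifying contiguous() (here and in the header above) lets it be called through const references, which is how tensors are usually passed in the C++ API. A small sketch of the call pattern this enables; the ToContiguous helper is hypothetical, and only Tensor::is_contiguous() and Tensor::contiguous() are taken from this diff (assuming the usual paddle::Tensor class from this header):

```cpp
#include "paddle/phi/api/include/tensor.h"

// Hypothetical utility: with a non-const contiguous(), this would not
// compile, because `t` is bound by const reference.
paddle::Tensor ToContiguous(const paddle::Tensor &t) {
  if (t.is_contiguous()) {
    return t;             // already contiguous; nothing to do
  }
  return t.contiguous();  // legal now that the method is const-qualified
}
```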

paddle/phi/api/profiler/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -26,4 +26,4 @@ if(WITH_PYTHON AND EXISTS ${PADDLE_BINARY_DIR})
   endif()
 endif()

-collect_srcs(api_srcs SRCS device_tracer.cc profiler.cc)
+collect_srcs(api_srcs SRCS device_tracer.cc event.cc profiler.cc)

paddle/phi/api/profiler/event.cc

Lines changed: 81 additions & 0 deletions

@@ -0,0 +1,81 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/api/profiler/event.h"
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "glog/logging.h"
+#endif
+
+namespace phi {
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+CudaEvent::CudaEvent() {
+#ifdef PADDLE_WITH_HIP
+  hipEventCreateWithFlags(&event_, flags_);
+#else
+  cudaEventCreateWithFlags(&event_, flags_);
+#endif
+  VLOG(4) << "CudaEvent " << event_;
+}
+
+CudaEvent::CudaEvent(unsigned int flags) : flags_(flags) {
+#ifdef PADDLE_WITH_HIP
+  hipEventCreateWithFlags(&event_, flags_);
+#else
+  cudaEventCreateWithFlags(&event_, flags_);
+#endif
+  VLOG(4) << "CudaEvent " << event_;
+}
+
+bool CudaEvent::Query() {
+#ifdef PADDLE_WITH_HIP
+  gpuError_t err = hipEventQuery(event_);
+  if (err == hipSuccess) {
+    return true;
+  }
+  if (err == hipErrorNotReady) {
+    return false;
+  }
+#else
+  gpuError_t err = cudaEventQuery(event_);
+  if (err == cudaSuccess) {
+    return true;
+  }
+  if (err == cudaErrorNotReady) {
+    return false;
+  }
+#endif
+  PADDLE_ENFORCE_GPU_SUCCESS(err);
+  return false;
+}
+
+float CudaEvent::ElapsedTime(CudaEvent *end_event) {
+  float milliseconds = 0;
+#ifdef PADDLE_WITH_HIP
+  hipEventSynchronize(end_event->GetRawCudaEvent());
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      hipEventElapsedTime(&milliseconds, event_, end_event->GetRawCudaEvent()));
+#else
+  cudaEventSynchronize(end_event->GetRawCudaEvent());
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime(
+      &milliseconds, event_, end_event->GetRawCudaEvent()));
+#endif
+  return milliseconds;
+}
+
+#endif
+
+}  // namespace phi
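
With these definitions moved into the .cc file, event.h (below) only declares CudaEvent's constructors, Query(), and ElapsedTime(). A rough usage sketch of the timing pair; it assumes GetRawCudaEvent() is publicly accessible and that the caller records the events with the plain CUDA runtime API, since recording is not part of this diff:

```cpp
#include <cuda_runtime.h>

#include "paddle/phi/api/profiler/event.h"

// Sketch only (CUDA build): time work submitted to `stream`, in milliseconds.
float TimeRegionMs(cudaStream_t stream) {
  phi::CudaEvent start, stop;
  cudaEventRecord(start.GetRawCudaEvent(), stream);
  // ... enqueue the kernels to be measured on `stream` here ...
  cudaEventRecord(stop.GetRawCudaEvent(), stream);
  // ElapsedTime synchronizes on the end event before reading the timers.
  return start.ElapsedTime(&stop);
}
```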

paddle/phi/api/profiler/event.h

Lines changed: 4 additions & 50 deletions

@@ -140,23 +140,9 @@ class CudaEvent {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

  public:
-  CudaEvent() {
-#ifdef PADDLE_WITH_HIP
-    hipEventCreateWithFlags(&event_, flags_);
-#else
-    cudaEventCreateWithFlags(&event_, flags_);
-#endif
-    VLOG(4) << "CudaEvent " << event_;
-  }
+  CudaEvent();

-  explicit CudaEvent(unsigned int flags) : flags_(flags) {
-#ifdef PADDLE_WITH_HIP
-    hipEventCreateWithFlags(&event_, flags_);
-#else
-    cudaEventCreateWithFlags(&event_, flags_);
-#endif
-    VLOG(4) << "CudaEvent " << event_;
-  }
+  explicit CudaEvent(unsigned int flags);

   ~CudaEvent() {
 #ifdef PADDLE_WITH_HIP
@@ -174,41 +160,9 @@ class CudaEvent {
 #endif
   }

-  bool Query() {
-#ifdef PADDLE_WITH_HIP
-    gpuError_t err = hipEventQuery(event_);
-    if (err == hipSuccess) {
-      return true;
-    }
-    if (err == hipErrorNotReady) {
-      return false;
-    }
-#else
-    gpuError_t err = cudaEventQuery(event_);
-    if (err == cudaSuccess) {
-      return true;
-    }
-    if (err == cudaErrorNotReady) {
-      return false;
-    }
-#endif
-    PADDLE_ENFORCE_GPU_SUCCESS(err);
-    return false;
-  }
+  bool Query();

-  float ElapsedTime(CudaEvent *end_event) {
-    float milliseconds = 0;
-#ifdef PADDLE_WITH_HIP
-    hipEventSynchronize(end_event->GetRawCudaEvent());
-    PADDLE_ENFORCE_GPU_SUCCESS(hipEventElapsedTime(
-        &milliseconds, event_, end_event->GetRawCudaEvent()));
-#else
-    cudaEventSynchronize(end_event->GetRawCudaEvent());
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime(
-        &milliseconds, event_, end_event->GetRawCudaEvent()));
-#endif
-    return milliseconds;
-  }
+  float ElapsedTime(CudaEvent *end_event);

   void Synchronize() {
 #ifdef PADDLE_WITH_HIP

paddle/phi/backends/device_ext.h

Lines changed: 4 additions & 4 deletions

@@ -395,7 +395,7 @@ struct C_DeviceInterface {
       size_t size);

   /**
-   * @brief Asynchonrize memory copy from host to device
+   * @brief Asynchronize memory copy from host to device
    *
    * @param[C_Device] device Core fill it with a physical id
    * @param[C_Stream] stream
@@ -410,7 +410,7 @@ struct C_DeviceInterface {
       size_t size);

   /**
-   * @brief Asynchonrize memory copy from device to host
+   * @brief Asynchronize memory copy from device to host
    *
    * @param[C_Device] device Core fill it with a physical id
    * @param[C_Stream] stream
@@ -425,7 +425,7 @@ struct C_DeviceInterface {
       size_t size);

   /**
-   * @brief Asynchonrize memory copy from device to device
+   * @brief Asynchronize memory copy from device to device
    *
    * @param[C_Device] device Core fill it with a physical id
    * @param[C_Stream] stream
@@ -440,7 +440,7 @@ struct C_DeviceInterface {
       size_t size);

   /**
-   * @brief Peer asynchonrize memory copy from host to device
+   * @brief Peer asynchronize memory copy from host to device
    *
    * @param[C_Device] device Core fill it with a physical id
    * @param[C_Stream] stream

paddle/phi/backends/onednn/axpy_handler.h

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ class OneDNNAXPYHandler {
   // Private implementation idiom to hide dependency on oneDNN headers.
   class Impl;
   // We need custom deleter, since the compiler is unable to parameterize
-  // an allocator's default deleter due to incomple type.
+  // an allocator's default deleter due to incomplete type.
   std::unique_ptr<Impl, void (*)(Impl*)> pimpl_;
 };
 }  // namespace funcs
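
The comment fixed here describes the standard pimpl caveat: std::unique_ptr's default deleter needs Impl to be a complete type wherever the pointer is destroyed, so a header that only forward-declares Impl supplies a custom function-pointer deleter instead. A generic sketch of the pattern, with illustrative names rather than Paddle's:

```cpp
#include <memory>

// Header side: Impl is only forward-declared, so the default deleter of
// std::unique_ptr<Impl> could not be instantiated here; a function-pointer
// deleter pushes the actual delete into the source file.
class Widget {
 public:
  Widget();

 private:
  class Impl;  // incomplete type in the header
  std::unique_ptr<Impl, void (*)(Impl *)> pimpl_;
};

// Source side: Impl is complete, so deleting it is well-defined.
class Widget::Impl { /* library-specific state lives here */ };

Widget::Widget()
    : pimpl_(new Impl, [](Impl *p) { delete p; }) {}
```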

paddle/phi/backends/onednn/matmul_utils.h

Lines changed: 6 additions & 6 deletions

@@ -59,7 +59,7 @@ class MatmulOneDNNHandler : public OneDNNHandlerNoCachingT<XT, dnnl::matmul> {
     std::vector<int64_t> x_strides(x_dims.size() - 3, 1);
     std::vector<int64_t> y_strides(x_dims.size() - 3, 1);
     std::vector<int64_t> out_strides(x_dims.size() - 3, 1);
-    std::vector<int64_t> out_ddims(x_dims.size() - 3, 1);
+    std::vector<int64_t> out_dims(x_dims.size() - 3, 1);

     x_strides.reserve(x_dims.size());
     y_strides.reserve(x_dims.size());
@@ -78,20 +78,20 @@ class MatmulOneDNNHandler : public OneDNNHandlerNoCachingT<XT, dnnl::matmul> {
     }

     out_strides.insert(out_strides.end(), {M * N, N, 1});
-    out_ddims.insert(out_ddims.end(),
-                     {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N});
+    out_dims.insert(out_dims.end(),
+                    {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N});

     for (int i = x_dims.size() - 4; i >= 0; --i) {
-      out_ddims[i] = std::max(x_dims[i], y_dims[i]);
+      out_dims[i] = std::max(x_dims[i], y_dims[i]);
       x_strides[i] = x_dims[i + 1] * x_strides[i + 1];
       y_strides[i] = y_dims[i + 1] * y_strides[i + 1];

-      out_strides[i] = out_ddims[i + 1] * out_strides[i + 1];
+      out_strides[i] = out_dims[i + 1] * out_strides[i + 1];
     }

     auto x_md = memory::desc(x_dims, OneDNNGetDataType<XT>(), x_strides);
     auto y_md = memory::desc(y_dims, OneDNNGetDataType<YT>(), y_strides);
-    auto out_md = memory::desc(out_ddims, OneDNNGetDataType<OT>(), out_strides);
+    auto out_md = memory::desc(out_dims, OneDNNGetDataType<OT>(), out_strides);

     this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md);
   }
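
This hunk only renames out_ddims to out_dims; the computation is unchanged. The loop fills in dense row-major strides, where each stride is the product of all trailing dimension extents. A standalone sketch of that rule, separate from the oneDNN handler:

```cpp
#include <cstdint>
#include <vector>

// Dense row-major strides: the innermost stride is 1 and
// stride[i] = dims[i + 1] * stride[i + 1] for the outer dimensions.
std::vector<int64_t> RowMajorStrides(const std::vector<int64_t> &dims) {
  std::vector<int64_t> strides(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }
  return strides;
}
// e.g. dims {B, M, N} -> strides {M * N, N, 1}, matching out_strides above.
```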
