[PHI decoupling] move "gpu_primitives.h" to phi #48015

Merged · 1 commit · Nov 16, 2022
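Across the touched files the change is mechanical: the fluid include and the `paddle::platform::` qualifiers are replaced by their phi counterparts. A minimal sketch of the before/after pattern (the kernel below is illustrative only, not taken from this PR):

```cpp
// Before this PR, call sites pulled the primitives from fluid:
//   #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
//   paddle::platform::CudaAtomicAdd(dst + i, src[i]);

// After this PR, the same call sites use the phi header and namespace:
#include "paddle/phi/backends/gpu/gpu_primitives.h"

template <typename T>
__global__ void ExampleScatterAddKernel(const T* src, T* dst, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // The atomic-add wrapper now lives in the phi namespace.
    phi::CudaAtomicAdd(dst + i, src[i]);
  }
}
```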
610 changes: 610 additions & 0 deletions paddle/phi/backends/gpu/gpu_primitives.h

Large diffs are not rendered by default.
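The new header itself is collapsed above (610 added lines). Judging only from the call sites updated in this PR, it carries the CUDA atomic wrappers and the thread-count constant into the `phi` namespace. A rough sketch of the kind of wrappers it exposes — inferred from usage in this diff, not an excerpt from the actual header:

```cpp
// Sketch only: approximate shape of paddle/phi/backends/gpu/gpu_primitives.h,
// inferred from how this PR's call sites use it (names real, bodies assumed).
#include <cuda_runtime.h>

namespace phi {

// Assumed value; the real constant is defined in the moved header.
constexpr int PADDLE_CUDA_NUM_THREADS = 512;

__device__ inline float CudaAtomicAdd(float* address, float val) {
  return atomicAdd(address, val);  // native float atomicAdd
}

__device__ inline float CudaAtomicMax(float* address, float val) {
  // Typical CAS loop for a float max that has no native atomic.
  int* addr_as_int = reinterpret_cast<int*>(address);
  int old = *addr_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(addr_as_int, assumed,
                    __float_as_int(fmaxf(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

}  // namespace phi
```

The header also provides `CudaAtomicMin` and integer/half-precision overloads used elsewhere in this diff; for callers, only the include path and namespace change.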

2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once
#include <type_traits>

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
#include "paddle/phi/kernels/funcs/gru_compute.h"

11 changes: 4 additions & 7 deletions paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once
#include <type_traits>

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
#include "paddle/phi/kernels/funcs/lstm_compute.h"

@@ -202,15 +202,12 @@ __global__ void KeLstmBackward(Op op,
if (is_batch) {
if (value.prev_state_value) {
if (grad.check_ig_grad)
-paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx,
-    r_checkIGrad);
+phi::CudaAtomicAdd(grad.check_ig_grad + frame_idx, r_checkIGrad);
if (grad.check_fg_grad)
-paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx,
-    r_checkFGrad);
+phi::CudaAtomicAdd(grad.check_fg_grad + frame_idx, r_checkFGrad);
}
if (grad.check_og_grad)
-paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx,
-    r_checkOGrad);
+phi::CudaAtomicAdd(grad.check_og_grad + frame_idx, r_checkOGrad);
} else {
if (value.prev_state_value) {
if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/gather.cu.h
@@ -18,8 +18,8 @@ limitations under the License. */

#include "paddle/fluid/memory/memcpy.h"
// TODO(paddle-dev): move gpu_primitives.h to phi
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
@@ -217,7 +217,7 @@ __global__ void GatherGradGPUKernel(const T* input,
int64_t out_index =
inner_dim_index * (outer_dim_size * out_index_dim_size) +
index[index_dim_index] * outer_dim_size + out_dim_index;
-paddle::platform::CudaAtomicAdd(out + out_index, *(input + idx));
+phi::CudaAtomicAdd(out + out_index, *(input + idx));
}
}

9 changes: 4 additions & 5 deletions paddle/phi/kernels/funcs/pooling.cu
@@ -15,8 +15,8 @@ limitations under the License. */
#include <algorithm>
#include <vector>

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/primitive/datamover_primitives.h"
@@ -428,8 +428,7 @@ __global__ void KernelMaxPool2DGrad(const int nthreads,

if (maxIndex != -1) {
// atomic add
-paddle::platform::CudaAtomicAdd(input_grad + maxIndex,
-    output_grad[index]);
+phi::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]);
}
}
}
@@ -1330,7 +1329,7 @@ __global__ void KernelMaxPool3DGrad(const int nthreads,
}
if (maxIdx != -1) {
// atomic add
-paddle::platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
+phi::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
}
}
}
@@ -2359,7 +2358,7 @@ __global__ void KernelMaxPool3DWithIdxGrad(
w_offset;
int max_index = mask[output_index];
if (max_index != -1) {
-paddle::platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
&input_grad[nc_offset * input_depth * input_height * input_width +
max_index],
output_grad[output_index]);
6 changes: 3 additions & 3 deletions paddle/phi/kernels/funcs/scatter.cu.h
@@ -16,8 +16,8 @@ limitations under the License. */
#include <unordered_set>
#include <vector>

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
@@ -70,7 +70,7 @@ __global__ void ScatterCUDAKernel(const T* params,
if (overwrite) {
*(output + out_i) = *(params + i);
} else {
-paddle::platform::CudaAtomicAdd(output + out_i, *(params + i));
+phi::CudaAtomicAdd(output + out_i, *(params + i));
}
}
}
@@ -104,7 +104,7 @@ __global__ void ScatterNdCUDAKernel(const T* update,
temp *= output_dims[j];
}
int64_t output_i = gather_i + slice_i;
-paddle::platform::CudaAtomicAdd(output + output_i, *(update + i));
+phi::CudaAtomicAdd(output + output_i, *(update + i));
}
}

20 changes: 10 additions & 10 deletions paddle/phi/kernels/funcs/segment_pooling.cu
@@ -14,9 +14,9 @@ limitations under the License. */

#include <algorithm>

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/segment_pooling.h"
@@ -60,7 +60,7 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids,
}
if (j > 0) {
if (last_segment_id == first_segment_id) {
-paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+phi::CudaAtomicAdd(summed_ids + last_segment_id, sum);
} else {
*(summed_ids + last_segment_id) = sum;
}
@@ -70,7 +70,7 @@
sum += T(1);
last_segment_id = current_segment_id;
}
-paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+phi::CudaAtomicAdd(summed_ids + last_segment_id, sum);
}
}

@@ -111,8 +111,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids,
last_segment_id * inner_dim_size + segment_offset;

if (last_segment_id == first_segment_id) {
-paddle::platform::CudaAtomicAdd(
-    output + output_index, sum / *(summed_ids + last_segment_id));
+phi::CudaAtomicAdd(output + output_index,
+    sum / *(summed_ids + last_segment_id));
} else {
*(output + output_index) = sum / *(summed_ids + last_segment_id);
}
@@ -123,8 +123,8 @@
last_segment_id = current_segment_id;
}
Index output_index = last_segment_id * inner_dim_size + segment_offset;
-paddle::platform::CudaAtomicAdd(output + output_index,
-    sum / *(summed_ids + last_segment_id));
+phi::CudaAtomicAdd(output + output_index,
+    sum / *(summed_ids + last_segment_id));
}
}

@@ -215,7 +215,7 @@ class MaxPool {
DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
DEVICE inline T atomic(T* address, const T val) {
-return paddle::platform::CudaAtomicMax(address, val);
+return phi::CudaAtomicMax(address, val);
}
};

@@ -225,7 +225,7 @@ class MinPool {
DEVICE inline T initial() { return static_cast<T>(FLT_MAX); }
DEVICE inline void compute(const T& x, T* y) { *y = *y < x ? *y : x; }
DEVICE inline T atomic(T* address, const T val) {
-return paddle::platform::CudaAtomicMin(address, val);
+return phi::CudaAtomicMin(address, val);
}
};

@@ -235,7 +235,7 @@ class SumPool {
DEVICE inline T initial() { return static_cast<T>(0); }
DEVICE inline void compute(const T& x, T* y) { *y = *y + x; }
DEVICE inline T atomic(T* address, const T val) {
-return paddle::platform::CudaAtomicAdd(address, val);
+return phi::CudaAtomicAdd(address, val);
}
};

12 changes: 6 additions & 6 deletions paddle/phi/kernels/funcs/selected_rows_functor.cu
@@ -15,7 +15,7 @@ limitations under the License. */
#include <set>
#include <vector>

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/kernels/funcs/math_function.h"
@@ -127,7 +127,7 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
// Since index in rows of SelectedRows can be duplicate, we can not use
// tensor_out[index] += selected_rows[index]; Instead, we have to use
// AtomicAdd to avoid concurrent write error.
-paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
+phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
}
}
} // namespace
@@ -279,7 +279,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
for (int index = tid; index < row_numel; index += block_size) {
// Since index in rows of SelectedRows can be duplicate, we have to use
// Atomic Operation to avoid concurrent write error.
-paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
+phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
}
}
} // namespace
@@ -360,7 +360,7 @@ __global__ void MergeAddKernel(const T* input,
input += ty * row_numel;
out += out_idx * row_numel;
for (int index = tid; index < row_numel; index += block_size) {
-paddle::platform::CudaAtomicAdd(out + index, input[index]);
+phi::CudaAtomicAdd(out + index, input[index]);
}
}

@@ -623,9 +623,9 @@ struct UpdateToTensor<phi::GPUContext, T> {
auto* in1_data = in1_value.template data<T>();
auto* in2_data = input2->data<T>();

-dim3 threads(paddle::platform::PADDLE_CUDA_NUM_THREADS, 1);
+dim3 threads(phi::PADDLE_CUDA_NUM_THREADS, 1);
dim3 grid(in1_rows.size(), 1);
-UpdateToTensorKernel<T, paddle::platform::PADDLE_CUDA_NUM_THREADS>
+UpdateToTensorKernel<T, phi::PADDLE_CUDA_NUM_THREADS>
<<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.cuda_data(), op, in2_data, in1_row_numel);
}
4 changes: 2 additions & 2 deletions paddle/phi/kernels/gpu/accuracy_kernel.cu
@@ -17,14 +17,14 @@
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;

template <int BlockSize>
__global__ void AccuracyCudaKernel(const int N,
10 changes: 5 additions & 5 deletions paddle/phi/kernels/gpu/adagrad_kernel.cu
@@ -14,8 +14,8 @@

#include "paddle/phi/kernels/adagrad_kernel.h"

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/selected_rows_functor.h"
@@ -47,7 +47,7 @@ __global__ void MergeGradKernel(const T* grad,
grad += ty * row_numel;
grad_merge += grad_merge_idx * row_numel;
for (int index = tid; index < row_numel; index += block_size) {
-paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]);
+phi::CudaAtomicAdd(grad_merge + index, grad[index]);
}
}

@@ -69,9 +69,9 @@ __global__ void SparseAdagradFunctorKernel(const T* grad,
for (int index = tid; index < row_numel; index += block_size) {
// Since index in rows of SelectedRows can be duplicate, we have to use
// Atomic Operation to avoid concurrent write error.
-paddle::platform::CudaAtomicAdd(param + index,
-    -1.0 * learning_rate[0] * grad[index] /
-    (sqrt(moment[index]) + epsilon));
+phi::CudaAtomicAdd(param + index,
+    -1.0 * learning_rate[0] * grad[index] /
+    (sqrt(moment[index]) + epsilon));
}
}

51 changes: 19 additions & 32 deletions paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu
@@ -18,9 +18,9 @@

#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/affine_grid_utils.h"
@@ -75,18 +75,14 @@ __global__ void affine_grid_grad_kernel_4d(const int count,

int theta_offset = n * 6; // 2 * 3;
T out_grad_x = out_grad[index * 2];
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset,
-    out_grad_x * w_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 1,
-    out_grad_x * h_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x);
+phi::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x);

T out_grad_y = out_grad[index * 2 + 1];
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 3,
-    out_grad_y * w_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 4,
-    out_grad_y * h_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * w_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * h_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y);
}
}

@@ -116,31 +112,22 @@ __global__ void affine_grid_grad_kernel_5d(const int count,

int theta_offset = n * 12; // 3 * 4;
T out_grad_x = out_grad[index * 3];
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset,
-    out_grad_x * w_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 1,
-    out_grad_x * h_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 2,
-    out_grad_x * d_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_x);
+phi::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x * d_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_x);

T out_grad_y = out_grad[index * 3 + 1];
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 4,
-    out_grad_y * w_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 5,
-    out_grad_y * h_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 6,
-    out_grad_y * d_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 7, out_grad_y);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y * h_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 6, out_grad_y * d_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 7, out_grad_y);

T out_grad_z = out_grad[index * 3 + 2];
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 8,
-    out_grad_z * w_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 9,
-    out_grad_z * h_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 10,
-    out_grad_z * d_coor);
-paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 11, out_grad_z);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 8, out_grad_z * w_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 9, out_grad_z * h_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 10, out_grad_z * d_coor);
+phi::CudaAtomicAdd(theta_grad + theta_offset + 11, out_grad_z);
}
}

2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/affine_grid_kernel.cu
@@ -18,9 +18,9 @@

#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/affine_grid_utils.h"