[MTAI-484] fix(build): replace "MUSAAA" with "MUSA"

hanhaowen-mt · Aug 4, 2023 · 4445bc0 · 4445bc0
1 parent ffec2fc
commit 4445bc0
Show file tree

Hide file tree

Showing 21 changed files with 46 additions and 26 deletions.
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
@@ -46,13 +46,16 @@ file(
   "gpu/c*.cu"
   "gpu/s*.cu"
   "gpu/abs_kernel.cu"
+  "gpu/uniform_kernel.cu"
   "gpu/activation_kernel.cu"
   "gpu/full_kernel.cu"
   "gpu/p_norm_grad_kernel.cu"
   "gpu/matmul_kernel.cu"
   "gpu/expand_kernel.cu"
   "gpu/isfinite_kernel.cu"
   "kps/*.cu"
+  "legacy/gpu/uniform_kernel.cu"
+  "sparse/gpu/mask_kernel.cu"
   "legacy/kps/*.cu"
   )
 list(REMOVE_ITEM kernel_cu

diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h
@@ -192,15 +192,15 @@ struct normal_distribution<double> {
 #elif defined(__MUSACC__)
 template <typename T>
 struct uniform_distribution {
-  __device__ inline T operator()(murandStatePhilox4_32_10_t *state) const {
+  __device__ inline T operator()(murand_state_philox4x32_10 *state) const {
     return static_cast<T>(murand_uniform(state));
   }
   static constexpr int kReturnsCount = 1;
 };
 
 template <>
 struct uniform_distribution<float> {
-  __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const {
+  __device__ inline float4 operator()(murand_state_philox4x32_10 *state) const {
     return murand_uniform4(state);
   }
   static constexpr int kReturnsCount = 4;
@@ -209,15 +209,15 @@ struct uniform_distribution<float> {
 template <>
 struct uniform_distribution<double> {
   __device__ inline double2 operator()(
-      murandStatePhilox4_32_10_t *state) const {
-    return murand_uniform2_double(state);
+      murand_state_philox4x32_10 *state) const {
+    return murand_uniform_double2(state);
   }
   static constexpr int kReturnsCount = 2;
 };
 
 template <>
 struct uniform_distribution<uint32_t> {
-  __device__ inline uint4 operator()(murandStatePhilox4_32_10_t *state) const {
+  __device__ inline uint4 operator()(murand_state_philox4x32_10 *state) const {
     return murand4(state);
   }
   static constexpr int kReturnsCount = 4;
@@ -226,7 +226,7 @@ struct uniform_distribution<uint32_t> {
 template <>
 struct uniform_distribution<uint64_t> {
   __device__ inline ulonglong2 operator()(
-      murandStatePhilox4_32_10_t *state) const {
+      murand_state_philox4x32_10 *state) const {
     ulonglong2 result;
     uint4 rand = murand4(state);
     result.x = (uint64_t)rand.x << 32 | rand.y;
@@ -238,7 +238,7 @@ struct uniform_distribution<uint64_t> {
 
 template <>
 struct normal_distribution<float> {
-  __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const {
+  __device__ inline float4 operator()(murand_state_philox4x32_10 *state) const {
     return murand_normal4(state);
   }
   static constexpr int kReturnsCount = 4;
@@ -247,8 +247,8 @@ struct normal_distribution<float> {
 template <>
 struct normal_distribution<double> {
   __device__ inline double2 operator()(
-      murandStatePhilox4_32_10_t *state) const {
-    return murand_normal2_double(state);
+      murand_state_philox4x32_10 *state) const {
+    return murand_normal_double2(state);
   }
   static constexpr int kReturnsCount = 2;
 };
@@ -334,6 +334,10 @@ __global__ void DistributionKernel(size_t size,
   curandStatePhilox4_32_10_t state;
   curand_init(seed, idx + THREAD_ID_X, offset, &state);
   using SType = curandStatePhilox4_32_10_t;
+#elif defined(__MUSACC__)
+  murand_state_philox4x32_10 state;
+  murand_init(seed, idx + THREAD_ID_X, offset, &state);
+  using SType = murand_state_philox4x32_10;
 #else
   hiprandStatePhilox4_32_10_t state;
   hiprand_init(seed, idx + THREAD_ID_X, offset, &state);

diff --git a/paddle/phi/kernels/funcs/sparse/softmax.cu.h b/paddle/phi/kernels/funcs/sparse/softmax.cu.h
@@ -27,6 +27,8 @@ inline DenseTensor GetOffsets(const Context& dev_ctx,
                               const IntT dim) {
 #ifdef __HIPCC__
   const auto& policy = thrust::hip::par.on(dev_ctx.stream());
+#elif defined(__MUSACC__)
+  const auto& policy = thrust::musa::par.on(dev_ctx.stream());
 #else
   const auto& policy = thrust::cuda::par.on(dev_ctx.stream());
 #endif
@@ -87,6 +89,8 @@ std::tuple<DenseTensor, DenseTensor, DenseTensor, DenseTensor> ComputePoolMax(
     const IntT dim) {
 #ifdef __HIPCC__
   const auto& policy = thrust::hip::par.on(dev_ctx.stream());
+#elif defined(__MUSACC__)
+  const auto& policy = thrust::musa::par.on(dev_ctx.stream());
 #else
   const auto& policy = thrust::cuda::par.on(dev_ctx.stream());
 #endif

diff --git a/paddle/phi/kernels/reduce_amax_kernel.cc b/paddle/phi/kernels/reduce_amax_kernel.cc
@@ -34,7 +34,7 @@ void AMaxKernel(const Context& dev_ctx,
 PD_REGISTER_KERNEL(
     amax, CPU, ALL_LAYOUT, phi::AMaxKernel, float, double, int, int64_t) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(
     amax, GPU, ALL_LAYOUT, phi::AMaxKernel, float, double, int, int64_t) {}
 #endif

diff --git a/paddle/phi/kernels/reduce_amin_kernel.cc b/paddle/phi/kernels/reduce_amin_kernel.cc
@@ -34,7 +34,7 @@ void AMinKernel(const Context& dev_ctx,
 PD_REGISTER_KERNEL(
     amin, CPU, ALL_LAYOUT, phi::AMinKernel, float, double, int, int64_t) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(
     amin, GPU, ALL_LAYOUT, phi::AMinKernel, float, double, int, int64_t) {}
 #endif

diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc
@@ -41,7 +41,7 @@ PD_REGISTER_KERNEL(mean,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(mean,
                    GPU,
                    ALL_LAYOUT,

diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc
@@ -57,7 +57,7 @@ PD_REGISTER_KERNEL(
     min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
 #endif
 
-#if defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(
     min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t, phi::dtype::float16) {}
 #endif

diff --git a/paddle/phi/kernels/selected_rows/activation_kernel.cc b/paddle/phi/kernels/selected_rows/activation_kernel.cc
@@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(
 PD_REGISTER_KERNEL(
     sqrt_sr, CPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 
 PD_REGISTER_KERNEL(square_sr,
                    GPU,

diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc
@@ -41,7 +41,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_sr,
   kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_sr,
                                  GPU,
                                  ALL_LAYOUT,

diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc
@@ -54,7 +54,7 @@ PD_REGISTER_KERNEL(full_sr,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(full_sr,
                    GPU,
                    ALL_LAYOUT,

diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc
@@ -51,7 +51,7 @@ PD_REGISTER_KERNEL(isfinite_sr,
                    int,
                    int64_t) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(isinf_sr,
                    GPU,
                    ALL_LAYOUT,

diff --git a/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc b/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc
@@ -41,7 +41,7 @@ PD_REGISTER_KERNEL(merge_selected_rows,
                    float,
                    double) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(merge_selected_rows,
                    GPU,
                    ALL_LAYOUT,

diff --git a/paddle/phi/kernels/selected_rows/scale_kernel.cc b/paddle/phi/kernels/selected_rows/scale_kernel.cc
@@ -54,7 +54,7 @@ PD_REGISTER_KERNEL(scale_sr,
                    int,
                    int64_t) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(scale_sr,
                    GPU,
                    ALL_LAYOUT,

diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc
@@ -52,7 +52,7 @@ PD_REGISTER_KERNEL(shape_sr,
   kernel->OutputAt(0).SetDataType(phi::DataType::INT32);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(shape_sr,
                    GPU,
                    ALL_LAYOUT,

diff --git a/paddle/phi/kernels/selected_rows/uniform_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_kernel.cc
@@ -77,7 +77,7 @@ PD_REGISTER_KERNEL(uniform_sr,
                    double,
                    phi::dtype::bfloat16) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 
 PD_REGISTER_KERNEL(uniform_raw_sr,
                    GPU,

diff --git a/paddle/phi/kernels/sparse/gpu/conv.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h
@@ -606,7 +606,7 @@ inline void CallThrustScan(const GPUContext& dev_ctx,
                            int* h_offsets_ptr) {
 #ifdef PADDLE_WITH_HIP
   thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
-#elif definfed(PADDLE_WITH_MUSA)
+#elif defined(PADDLE_WITH_MUSA)
   thrust::exclusive_scan(thrust::musa::par.on(dev_ctx.stream()),
 #else
   thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),

diff --git a/paddle/phi/kernels/sparse/gpu/slice_kernel.cu b/paddle/phi/kernels/sparse/gpu/slice_kernel.cu
@@ -324,7 +324,7 @@ void SliceCsrTensor2D(const Context& dev_ctx,
                                                         out_crows_data);
 #ifdef PADDLE_WITH_HIP
   thrust::inclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
-#ifdef PADDLE_WITH_MUSA
+#elif defined(PADDLE_WITH_MUSA)
   thrust::inclusive_scan(thrust::musa::par.on(dev_ctx.stream()),
 #else
   thrust::inclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),

diff --git a/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu
@@ -218,7 +218,7 @@ void SoftmaxCooGradGPUKernel(const Context& dev_ctx,
   bool is_same_offset = thrust::equal(thrust::hip::par.on(dev_ctx.stream()),
 #elif defined(PADDLE_WITH_MUSA)
   const auto& policy = thrust::musa::par.on(dev_ctx.stream());
-  bool is_same_offset = thrust::equal(thrust::hip::par.on(dev_ctx.stream()),
+  bool is_same_offset = thrust::equal(thrust::musa::par.on(dev_ctx.stream()),
 #else
   const auto& policy = thrust::cuda::par.on(dev_ctx.stream());
   bool is_same_offset = thrust::equal(thrust::cuda::par.on(dev_ctx.stream()),

diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu
@@ -282,6 +282,15 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx,
     PADDLE_THROW(
         phi::errors::Unimplemented("'rocsparse_csr2coo' only supports batches "
                                    "with a value of 1 currently."));
+#elif defined(PADDLE_WITH_MUSA)
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1);
+    GetBatchSizes<IntT><<<config.block_per_grid.x, config.thread_per_block.x>>>(
+        csr_crows_data, rows, batches, offsets_ptr);
+
+    thrust::exclusive_scan(thrust::musa::par.on(dev_ctx.stream()),
+                           offsets_ptr,
+                           offsets_ptr + batches,
+                           offsets_ptr);
 #else
     auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1);
     GetBatchSizes<IntT><<<config.block_per_grid.x, config.thread_per_block.x>>>(

diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc
@@ -81,7 +81,7 @@ PD_REGISTER_KERNEL(sparse_coo_tensor_grad,
   kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL(values_coo_grad,
                    GPU,
                    ALL_LAYOUT,

diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc
@@ -221,7 +221,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE(transfer_layout,
                                  CPU,
                                  ALL_LAYOUT,
                                  phi::TransferLayoutKernel<phi::CPUContext>) {}
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
 PD_REGISTER_KERNEL_FOR_ALL_DTYPE(transfer_layout,
                                  GPU,
                                  ALL_LAYOUT,