Commit

Merge pull request PaddlePaddle#35 from mthreads/optimize_bulid_musa
[MTAI-484] fix(build): fix compiling error
caizhi-mt authored and mt-robot committed Aug 13, 2023
2 parents 5f2a054 + 145977d commit e0be7af
Showing 5 changed files with 44 additions and 43 deletions.
1 change: 1 addition & 0 deletions cmake/musa.cmake
@@ -91,6 +91,7 @@ list(APPEND MUSA_MCC_FLAGS -Wno-self-assign)
 list(APPEND MUSA_MCC_FLAGS -Wno-literal-conversion)
 list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option)
 list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable)
+list(APPEND MUSA_MCC_FLAGS -Wno-unused-value)
 list(APPEND MUSA_MCC_FLAGS -Wno-unused-local-typedef)
 list(APPEND MUSA_MCC_FLAGS -Wno-unused-lambda-capture)
 list(APPEND MUSA_MCC_FLAGS -Wno-reorder-ctor)
1 change: 1 addition & 0 deletions paddle/fluid/framework/data_type.h
@@ -130,6 +130,7 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
 #define VisitDataTypeCallback(cpp_type, proto_type) \
   do {                                              \
     if (type == proto_type) {                       \
+      visitor.template apply<cpp_type>();           \
       return;                                       \
     }                                               \
   } while (0)
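
Note: the restored line calls the visitor's templated member. Because the visitor's type is a template parameter of VisitDataType, the dependent call has to be written visitor.template apply<cpp_type>(). A minimal standalone sketch of the same tag-then-dispatch pattern (TypeId, PrintSizeVisitor, and DispatchByType are illustrative names, not Paddle APIs):

#include <cstdint>
#include <cstdio>

// Stand-in for the runtime type tag (Paddle uses proto::VarType::Type).
enum class TypeId { kFloat32, kInt64 };

struct PrintSizeVisitor {
  template <typename T>
  void apply() const {
    std::printf("selected type is %zu bytes wide\n", sizeof(T));
  }
};

template <typename Visitor>
void DispatchByType(TypeId type, Visitor visitor) {
  // Mirrors VisitDataTypeCallback: compare the runtime tag, then call the
  // templated member. Since Visitor is a dependent type here, the call
  // needs the `template` disambiguator.
  if (type == TypeId::kFloat32) {
    visitor.template apply<float>();
    return;
  }
  if (type == TypeId::kInt64) {
    visitor.template apply<int64_t>();
    return;
  }
}

int main() {
  DispatchByType(TypeId::kFloat32, PrintSizeVisitor{});  // 4 bytes
  DispatchByType(TypeId::kInt64, PrintSizeVisitor{});    // 8 bytes
  return 0;
}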
78 changes: 36 additions & 42 deletions paddle/phi/backends/gpu/gpu_primitives.h
@@ -231,23 +231,21 @@ __device__ __forceinline__ void fastAtomicAdd(T *arr,
 
 // NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16.
 inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 low_half;
-  //// the bfloat16 in lower 16bits
-  //low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
-  //low_half =
-  //    static_cast<phi::dtype::bfloat16>(static_cast<float>(low_half) + x);
-  //return (val & 0xFFFF0000u) | low_half.x;
+  phi::dtype::bfloat16 low_half;
+  // the bfloat16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(static_cast<float>(low_half) + x);
+  return (val & 0xFFFF0000u) | low_half.x;
 }
 
 inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 high_half;
-  //// the bfloat16 in higher 16bits
-  //high_half.x = static_cast<uint16_t>(val >> 16);
-  //high_half =
-  //    static_cast<phi::dtype::bfloat16>(static_cast<float>(high_half) + x);
-  //return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+  phi::dtype::bfloat16 high_half;
+  // the bfloat16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(static_cast<float>(high_half) + x);
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
 #if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
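
Note: as the NOTE line above says, CUDA offers no atomicCAS overload for __nv_bfloat16, so these helpers rewrite one 16-bit half of the aligned 32-bit word that contains the value. The wrapper that consumes them lies outside this hunk; below is a minimal sketch of the usual CAS-retry pattern, assuming only the two helpers restored above (atomic_bf16_add_sketch is an illustrative name, not Paddle's actual wrapper):

// Sketch only: emulate a 16-bit bfloat16 atomic add with a 32-bit atomicCAS
// loop, using helpers shaped like bf16_add_to_low_half/bf16_add_to_high_half.
__device__ inline phi::dtype::bfloat16 atomic_bf16_add_sketch(
    phi::dtype::bfloat16 *address, float val) {
  // Step down to the 4-byte-aligned word that contains this 2-byte value.
  uint32_t *aligned = reinterpret_cast<uint32_t *>(
      reinterpret_cast<uintptr_t>(address) & ~uintptr_t(0x3));
  bool in_low_half = ((reinterpret_cast<uintptr_t>(address) & 0x2) == 0);

  uint32_t old = *aligned;
  uint32_t assumed;
  do {
    assumed = old;
    uint32_t updated = in_low_half ? bf16_add_to_low_half(assumed, val)
                                   : bf16_add_to_high_half(assumed, val);
    // atomicCAS returns the word's previous value; retry if another thread
    // changed it between our read and the compare-and-swap.
    old = atomicCAS(aligned, assumed, updated);
  } while (assumed != old);

  phi::dtype::bfloat16 before;  // value stored before this thread's add
  before.x = in_low_half ? static_cast<uint16_t>(old & 0xFFFFu)
                         : static_cast<uint16_t>(old >> 16);
  return before;
}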
@@ -451,23 +449,21 @@ CUDA_ATOMIC_WRAPPER(Max, phi::dtype::float16) {
 #endif
 
 inline static __device__ uint32_t bf16_max_to_low_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 low_half;
-  //// The bfloat16 in lower 16bits
-  //low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
-  //low_half =
-  //    static_cast<phi::dtype::bfloat16>(max(static_cast<float>(low_half), x));
-  //return (val & 0xFFFF0000u) | low_half.x;
+  phi::dtype::bfloat16 low_half;
+  // The bfloat16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(max(static_cast<float>(low_half), x));
+  return (val & 0xFFFF0000u) | low_half.x;
 }
 
 inline static __device__ uint32_t bf16_max_to_high_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 high_half;
-  //// The bfloat16 in higher 16bits
-  //high_half.x = static_cast<uint16_t>(val >> 16);
-  //high_half =
-  //    static_cast<phi::dtype::bfloat16>(max(static_cast<float>(high_half), x));
-  //return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+  phi::dtype::bfloat16 high_half;
+  // The bfloat16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(max(static_cast<float>(high_half), x));
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
 CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) {
@@ -639,23 +635,21 @@ CUDA_ATOMIC_WRAPPER(Min, phi::dtype::float16) {
 #endif
 
 inline static __device__ uint32_t bf16_min_to_low_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 low_half;
-  //// The bfloat16 in lower 16bits
-  //low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
-  //low_half =
-  //    static_cast<phi::dtype::bfloat16>(min(static_cast<float>(low_half), x));
-  //return (val & 0xFFFF0000u) | low_half.x;
+  phi::dtype::bfloat16 low_half;
+  // The bfloat16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(min(static_cast<float>(low_half), x));
+  return (val & 0xFFFF0000u) | low_half.x;
 }
 
 inline static __device__ uint32_t bf16_min_to_high_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 high_half;
-  //// The bfloat16 in higher 16bits
-  //high_half.x = static_cast<uint16_t>(val >> 16);
-  //high_half =
-  //    static_cast<phi::dtype::bfloat16>(min(static_cast<float>(high_half), x));
-  //return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+  phi::dtype::bfloat16 high_half;
+  // The bfloat16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(min(static_cast<float>(high_half), x));
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
 CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) {
1 change: 0 additions & 1 deletion paddle/phi/kernels/CMakeLists.txt
@@ -48,7 +48,6 @@ if(WITH_MUSA)
     "fusion/gpu/fused_softmax_mask_kernel.cu"
     "gpu/batch_norm_grad_kernel.cu"
     "gpu/batch_norm_kernel.cu"
-    "gpu/check_numerics_kernel.cu"
     "gpu/cholesky_grad_kernel.cu"
     "gpu/cholesky_solve_grad_kernel.cu"
     "gpu/conv_grad_kernel.cu"
6 changes: 6 additions & 0 deletions paddle/phi/kernels/gpu/check_numerics_kernel.cu
@@ -500,6 +500,11 @@ void CheckNumericsKernel(const Context& ctx,
                          const std::string& output_dir,
                          DenseTensor* stats,
                          DenseTensor* values) {
+#ifdef PADDLE_WITH_MUSA
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "OP check_numerics is unsupported for MUSA backend now!"));
+  return;
+#else
   int dev_id = tensor.place().device;
   VLOG(6) << "op_type=" << op_type << ", var_name=" << var_name
           << ", dev_id=gpu:" << dev_id << ", numel=" << tensor.numel()
@@ -598,6 +603,7 @@ void CheckNumericsKernel(const Context& ctx,
     PrintStack<T>(ctx, *stats, op_type, var_name, dev_id);
   }
 #endif
+#endif
 }
 
 }  // namespace phi
