Commit

Merge pull request PaddlePaddle#35 from mthreads/optimize_bulid_musa
[MTAI-484] fix(build): fix compiling error
caizhi-mt authored and mt-robot committed Aug 13, 2023
2 parents 5f2a054 + 145977d commit e0be7af
Showing 5 changed files with 44 additions and 43 deletions.
1 change: 1 addition & 0 deletions cmake/musa.cmake
@@ -91,6 +91,7 @@ list(APPEND MUSA_MCC_FLAGS -Wno-self-assign)
 list(APPEND MUSA_MCC_FLAGS -Wno-literal-conversion)
 list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option)
 list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable)
+list(APPEND MUSA_MCC_FLAGS -Wno-unused-value)
 list(APPEND MUSA_MCC_FLAGS -Wno-unused-local-typedef)
 list(APPEND MUSA_MCC_FLAGS -Wno-unused-lambda-capture)
 list(APPEND MUSA_MCC_FLAGS -Wno-reorder-ctor)
1 change: 1 addition & 0 deletions paddle/fluid/framework/data_type.h
@@ -130,6 +130,7 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
 #define VisitDataTypeCallback(cpp_type, proto_type) \
   do {                                              \
     if (type == proto_type) {                       \
+      visitor.template apply<cpp_type>();           \
       return;                                       \
     }                                               \
   } while (0)
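
Note: the restored line calls the visitor's templated member. Because the visitor's type is a template parameter of VisitDataType, the dependent call has to be written visitor.template apply<cpp_type>(). A minimal standalone sketch of the same tag-then-dispatch pattern (TypeId, PrintSizeVisitor, and DispatchByType are illustrative names, not Paddle APIs):

#include <cstdint>
#include <cstdio>

// Stand-in for the runtime type tag (Paddle uses proto::VarType::Type).
enum class TypeId { kFloat32, kInt64 };

struct PrintSizeVisitor {
  template <typename T>
  void apply() const {
    std::printf("selected type is %zu bytes wide\n", sizeof(T));
  }
};

template <typename Visitor>
void DispatchByType(TypeId type, Visitor visitor) {
  // Mirrors VisitDataTypeCallback: compare the runtime tag, then call the
  // templated member. Since Visitor is a dependent type here, the call
  // needs the `template` disambiguator.
  if (type == TypeId::kFloat32) {
    visitor.template apply<float>();
    return;
  }
  if (type == TypeId::kInt64) {
    visitor.template apply<int64_t>();
    return;
  }
}

int main() {
  DispatchByType(TypeId::kFloat32, PrintSizeVisitor{});  // 4 bytes
  DispatchByType(TypeId::kInt64, PrintSizeVisitor{});    // 8 bytes
  return 0;
}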
78 changes: 36 additions & 42 deletions paddle/phi/backends/gpu/gpu_primitives.h
@@ -231,23 +231,21 @@ __device__ __forceinline__ void fastAtomicAdd(T *arr,
 
 // NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16.
 inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 low_half;
-  //// the bfloat16 in lower 16bits
-  //low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
-  //low_half =
-  //    static_cast<phi::dtype::bfloat16>(static_cast<float>(low_half) + x);
-  //return (val & 0xFFFF0000u) | low_half.x;
+  phi::dtype::bfloat16 low_half;
+  // the bfloat16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(static_cast<float>(low_half) + x);
+  return (val & 0xFFFF0000u) | low_half.x;
 }
 
 inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 high_half;
-  //// the bfloat16 in higher 16bits
-  //high_half.x = static_cast<uint16_t>(val >> 16);
-  //high_half =
-  //    static_cast<phi::dtype::bfloat16>(static_cast<float>(high_half) + x);
-  //return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+  phi::dtype::bfloat16 high_half;
+  // the bfloat16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(static_cast<float>(high_half) + x);
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
 #if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
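
Note: as the NOTE line above says, CUDA offers no atomicCAS overload for __nv_bfloat16, so these helpers rewrite one 16-bit half of the aligned 32-bit word that contains the value. The wrapper that consumes them lies outside this hunk; below is a minimal sketch of the usual CAS-retry pattern, assuming only the two helpers restored above (atomic_bf16_add_sketch is an illustrative name, not Paddle's actual wrapper):

// Sketch only: emulate a 16-bit bfloat16 atomic add with a 32-bit atomicCAS
// loop, using helpers shaped like bf16_add_to_low_half/bf16_add_to_high_half.
__device__ inline phi::dtype::bfloat16 atomic_bf16_add_sketch(
    phi::dtype::bfloat16 *address, float val) {
  // Step down to the 4-byte-aligned word that contains this 2-byte value.
  uint32_t *aligned = reinterpret_cast<uint32_t *>(
      reinterpret_cast<uintptr_t>(address) & ~uintptr_t(0x3));
  bool in_low_half = ((reinterpret_cast<uintptr_t>(address) & 0x2) == 0);

  uint32_t old = *aligned;
  uint32_t assumed;
  do {
    assumed = old;
    uint32_t updated = in_low_half ? bf16_add_to_low_half(assumed, val)
                                   : bf16_add_to_high_half(assumed, val);
    // atomicCAS returns the word's previous value; retry if another thread
    // changed it between our read and the compare-and-swap.
    old = atomicCAS(aligned, assumed, updated);
  } while (assumed != old);

  phi::dtype::bfloat16 before;  // value stored before this thread's add
  before.x = in_low_half ? static_cast<uint16_t>(old & 0xFFFFu)
                         : static_cast<uint16_t>(old >> 16);
  return before;
}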
@@ -451,23 +449,21 @@ CUDA_ATOMIC_WRAPPER(Max, phi::dtype::float16) {
 #endif
 
 inline static __device__ uint32_t bf16_max_to_low_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 low_half;
-  //// The bfloat16 in lower 16bits
-  //low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
-  //low_half =
-  //    static_cast<phi::dtype::bfloat16>(max(static_cast<float>(low_half), x));
-  //return (val & 0xFFFF0000u) | low_half.x;
+  phi::dtype::bfloat16 low_half;
+  // The bfloat16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(max(static_cast<float>(low_half), x));
+  return (val & 0xFFFF0000u) | low_half.x;
 }
 
 inline static __device__ uint32_t bf16_max_to_high_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 high_half;
-  //// The bfloat16 in higher 16bits
-  //high_half.x = static_cast<uint16_t>(val >> 16);
-  //high_half =
-  //    static_cast<phi::dtype::bfloat16>(max(static_cast<float>(high_half), x));
-  //return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+  phi::dtype::bfloat16 high_half;
+  // The bfloat16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(max(static_cast<float>(high_half), x));
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
 CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) {
@@ -639,23 +635,21 @@ CUDA_ATOMIC_WRAPPER(Min, phi::dtype::float16) {
 #endif
 
 inline static __device__ uint32_t bf16_min_to_low_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 low_half;
-  //// The bfloat16 in lower 16bits
-  //low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
-  //low_half =
-  //    static_cast<phi::dtype::bfloat16>(min(static_cast<float>(low_half), x));
-  //return (val & 0xFFFF0000u) | low_half.x;
+  phi::dtype::bfloat16 low_half;
+  // The bfloat16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(min(static_cast<float>(low_half), x));
+  return (val & 0xFFFF0000u) | low_half.x;
 }
 
 inline static __device__ uint32_t bf16_min_to_high_half(uint32_t val, float x) {
-  return 0;
-  //phi::dtype::bfloat16 high_half;
-  //// The bfloat16 in higher 16bits
-  //high_half.x = static_cast<uint16_t>(val >> 16);
-  //high_half =
-  //    static_cast<phi::dtype::bfloat16>(min(static_cast<float>(high_half), x));
-  //return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+  phi::dtype::bfloat16 high_half;
+  // The bfloat16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(min(static_cast<float>(high_half), x));
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
 CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) {
1 change: 0 additions & 1 deletion paddle/phi/kernels/CMakeLists.txt
@@ -48,7 +48,6 @@ if(WITH_MUSA)
     "fusion/gpu/fused_softmax_mask_kernel.cu"
     "gpu/batch_norm_grad_kernel.cu"
     "gpu/batch_norm_kernel.cu"
-    "gpu/check_numerics_kernel.cu"
     "gpu/cholesky_grad_kernel.cu"
     "gpu/cholesky_solve_grad_kernel.cu"
     "gpu/conv_grad_kernel.cu"
6 changes: 6 additions & 0 deletions paddle/phi/kernels/gpu/check_numerics_kernel.cu
@@ -500,6 +500,11 @@ void CheckNumericsKernel(const Context& ctx,
                          const std::string& output_dir,
                          DenseTensor* stats,
                          DenseTensor* values) {
+#ifdef PADDLE_WITH_MUSA
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "OP check_numerics is unsupported for MUSA backend now!"));
+  return;
+#else
   int dev_id = tensor.place().device;
   VLOG(6) << "op_type=" << op_type << ", var_name=" << var_name
           << ", dev_id=gpu:" << dev_id << ", numel=" << tensor.numel()
@@ -598,6 +603,7 @@ void CheckNumericsKernel(const Context& ctx,
     PrintStack<T>(ctx, *stats, op_type, var_name, dev_id);
   }
 #endif
+#endif
 }
 
 }  // namespace phi
