diff --git a/csrc/includes/conversion_utils.h b/csrc/includes/conversion_utils.h index 1c2d3b4b7a47..3a90a3e91ddf 100644 --- a/csrc/includes/conversion_utils.h +++ b/csrc/includes/conversion_utils.h @@ -265,7 +265,7 @@ DS_D_INLINE float2 to(__nv_bfloat162 val) template <> DS_D_INLINE __half to(double val) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ float val_f = __double2float_rn(val); return __float2half(val_f); #else diff --git a/csrc/includes/cublas_wrappers.h b/csrc/includes/cublas_wrappers.h index b016832dc9b3..1f359fcccf1b 100644 --- a/csrc/includes/cublas_wrappers.h +++ b/csrc/includes/cublas_wrappers.h @@ -10,7 +10,7 @@ #include #include #include -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ #include #endif #include @@ -26,7 +26,7 @@ int cublas_gemm_ex(cublasHandle_t handle, const float* A, const float* B, float* C, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo algo = rocblas_gemm_algo_standard); #else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); @@ -43,7 +43,7 @@ int cublas_gemm_ex(cublasHandle_t handle, const __half* A, const __half* B, __half* C, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo algo = rocblas_gemm_algo_standard); #else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -64,7 +64,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, int stride_B, int stride_C, int batch, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo algo = rocblas_gemm_algo_standard); #else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); @@ -85,7 +85,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, int stride_B, int stride_C, int batch, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo algo = rocblas_gemm_algo_standard); #else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); diff --git a/csrc/includes/ds_kernel_utils.h b/csrc/includes/ds_kernel_utils.h index dfcf7dff7f29..a3af561adfe5 100644 --- a/csrc/includes/ds_kernel_utils.h +++ b/csrc/includes/ds_kernel_utils.h @@ -15,7 +15,7 @@ used throughout the codebase. #define DS_HD_INLINE __host__ __device__ __forceinline__ #define DS_D_INLINE __device__ __forceinline__ -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ // constexpr variant of warpSize for templating constexpr int hw_warp_size = 64; @@ -23,7 +23,7 @@ constexpr int hw_warp_size = 64; #include #include -#else // !__HIP_PLATFORM_HCC__ +#else // !__HIP_PLATFORM_AMD__ // constexpr variant of warpSize for templating constexpr int hw_warp_size = 32; @@ -40,7 +40,7 @@ constexpr int hw_warp_size = 32; #include #include -#endif //__HIP_PLATFORM_HCC__ +#endif //__HIP_PLATFORM_AMD__ inline int next_pow2(const int val) { diff --git a/csrc/includes/feed_forward.h b/csrc/includes/feed_forward.h index 8cf9ee9ef594..46e3ba748d52 100644 --- a/csrc/includes/feed_forward.h +++ b/csrc/includes/feed_forward.h @@ -48,7 +48,7 @@ class FeedForward { weights, input_ptr, out, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo(config_.gemm_algos[0])); #else cublasGemmAlgo_t(config_.gemm_algos[0])); @@ -77,7 +77,7 @@ class FeedForward { input_ptr, out_grad, weights_grad, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo(config_.gemm_algos[1])); #else cublasGemmAlgo_t(config_.gemm_algos[1])); @@ -94,7 +94,7 @@ class FeedForward { weights, out_grad, inp_grad_out, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo(config_.gemm_algos[2])); #else cublasGemmAlgo_t(config_.gemm_algos[2])); diff --git a/csrc/includes/gemm_test.h b/csrc/includes/gemm_test.h index 6d04921c2e81..278515174523 100644 --- a/csrc/includes/gemm_test.h +++ b/csrc/includes/gemm_test.h @@ -6,10 +6,10 @@ #pragma once #include -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ #include #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ #include #endif #include @@ -67,7 +67,7 @@ class GemmTest { B, A, C, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ static_cast(algo)); #else static_cast(algo)); @@ -86,7 +86,7 @@ class GemmTest { A, C, B, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ static_cast(algo)); #else static_cast(algo)); @@ -105,7 +105,7 @@ class GemmTest { B, C, A, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ static_cast(algo)); #else static_cast(algo)); @@ -121,7 +121,7 @@ class GemmTest { float fast_latency = (std::numeric_limits::max)(); int fast_algo = 0; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard; #else for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; @@ -211,7 +211,7 @@ class StridedGemmTest { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ static_cast(algo)); #else static_cast(algo)); @@ -245,7 +245,7 @@ class StridedGemmTest { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ static_cast(algo)); #else static_cast(algo)); @@ -276,7 +276,7 @@ class StridedGemmTest { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ static_cast(algo)); #else static_cast(algo)); @@ -292,7 +292,7 @@ class StridedGemmTest { float fast_latency = (std::numeric_limits::max)(); int fast_algo = 0; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard; #else for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; diff --git a/csrc/includes/general_kernels.h b/csrc/includes/general_kernels.h index 28e2cbf2984f..bd621d3c4329 100644 --- a/csrc/includes/general_kernels.h +++ b/csrc/includes/general_kernels.h @@ -8,7 +8,7 @@ #include #include -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ #include #else #include diff --git a/csrc/includes/quantizer.h b/csrc/includes/quantizer.h index 2ae10bad3527..f4f63160d79b 100644 --- a/csrc/includes/quantizer.h +++ b/csrc/includes/quantizer.h @@ -5,7 +5,7 @@ #pragma once -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ #include #else #include diff --git a/csrc/includes/reduction_utils.h b/csrc/includes/reduction_utils.h index bb55377b1dcf..8d0c2d6986a1 100644 --- a/csrc/includes/reduction_utils.h +++ b/csrc/includes/reduction_utils.h @@ -280,7 +280,7 @@ DS_D_INLINE __half init() template <> DS_D_INLINE __half2 init() { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ return __half2{_Float16_2{0x0000, 0x0000}}; #else constexpr __half2_raw zero = {0x0000, 0x0000}; @@ -291,7 +291,7 @@ DS_D_INLINE __half2 init() template <> DS_D_INLINE __half2 init() { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ return __half2{_Float16_2{0x7C00, 0x7C00}}; #else constexpr __half2_raw inf = {0x7C00, 0x7C00}; @@ -302,7 +302,7 @@ DS_D_INLINE __half2 init() template <> DS_D_INLINE __half2 init() { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ return __half2{_Float16_2{0xFC00, 0xFC00}}; #else constexpr __half2_raw neg_inf = {0xFC00, 0xFC00}; @@ -414,7 +414,7 @@ DS_D_INLINE void _block(cg::thread_block& tb, // Unused when `partition_size == 1` or total_warps == 1 __shared__ float reduce_buffer[max_warps * elems]; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ const int total_threads = blockDim.x * blockDim.y * blockDim.z; const int running_warps = total_threads / hw_warp_size; #else diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h index cd126f4b0584..86d1e3dea11a 100644 --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -77,7 +77,7 @@ class StridedBatchGemm { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo(_config.gemm_algos[0])); #else cublasGemmAlgo_t(_config.gemm_algos[0])); @@ -105,7 +105,7 @@ class StridedBatchGemm { stride_b, stride_c, _config.batch_size, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo(_config.gemm_algos[0])); #else cublasGemmAlgo_t(_config.gemm_algos[0])); @@ -149,7 +149,7 @@ class StridedBatchGemm { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo(_config.gemm_algos[1])); #else cublasGemmAlgo_t(_config.gemm_algos[1])); @@ -178,7 +178,7 @@ class StridedBatchGemm { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo(_config.gemm_algos[2])); #else cublasGemmAlgo_t(_config.gemm_algos[2])); diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu index ca94a8e5ec2c..d9bacae73457 100644 --- a/csrc/lamb/fused_lamb_cuda_kernel.cu +++ b/csrc/lamb/fused_lamb_cuda_kernel.cu @@ -17,7 +17,7 @@ #include // #include -#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305 +#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION > 305 #include #else #include @@ -109,7 +109,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b) cg::sync(cta); -#if (__CUDA_ARCH__ >= 300) || (defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 502) +#if (__CUDA_ARCH__ >= 300) || (defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 502) if (tid < 32) { cg::coalesced_group active = cg::coalesced_threads(); diff --git a/csrc/quantization/fake_quantizer.cu b/csrc/quantization/fake_quantizer.cu index 0f53e5235921..4c08cd4cc3d2 100644 --- a/csrc/quantization/fake_quantizer.cu +++ b/csrc/quantization/fake_quantizer.cu @@ -11,7 +11,7 @@ namespace cg = cooperative_groups; __global__ void fake_quantize_kernel(__half* vals, int group_size, int num_bits) { -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__) cg::thread_block b = cg::this_thread_block(); // tb cg::thread_block_tile<32> g = @@ -197,7 +197,7 @@ __global__ void sr_fake_quantize_kernel(__half* vals, int num_bits, std::pair seed) { -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__) cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -475,7 +475,7 @@ template void launch_sr_fake_quantize_kernel(__half* vals, __global__ void fake_quantize_kernel_asym(__half* vals, int group_size, int num_bits) { -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__) cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -720,7 +720,7 @@ __global__ void sr_fake_quantize_kernel_asym(__half* vals, int num_bits, std::pair seed) { -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__) cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); diff --git a/csrc/random_ltd/token_sort.cu b/csrc/random_ltd/token_sort.cu index aae1b93e1c24..3049471cfe34 100644 --- a/csrc/random_ltd/token_sort.cu +++ b/csrc/random_ltd/token_sort.cu @@ -15,7 +15,7 @@ constexpr int granularity = 16; constexpr int mem_vals = granularity / sizeof(int32_t); constexpr int max_buffer_size = (threads + 1) * mem_vals; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ constexpr int warp_size = 64; #else constexpr int warp_size = 32; diff --git a/csrc/spatial/includes/spatial_cuda_layers.h b/csrc/spatial/includes/spatial_cuda_layers.h index bf6ac0cf18c2..4f56f89f16cd 100644 --- a/csrc/spatial/includes/spatial_cuda_layers.h +++ b/csrc/spatial/includes/spatial_cuda_layers.h @@ -9,7 +9,7 @@ #define HALF_PRECISION_AVAILABLE = 1 #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ #include #else #include diff --git a/csrc/transformer/cublas_wrappers.cu b/csrc/transformer/cublas_wrappers.cu index b48b7086921a..7821a8759ab0 100644 --- a/csrc/transformer/cublas_wrappers.cu +++ b/csrc/transformer/cublas_wrappers.cu @@ -5,7 +5,7 @@ #include "cublas_wrappers.h" -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ int cublas_gemm_ex(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, @@ -33,7 +33,7 @@ int cublas_gemm_ex(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_status status = rocblas_gemm_ex(handle, transa, transb, @@ -80,7 +80,7 @@ int cublas_gemm_ex(cublasHandle_t handle, algo); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -96,7 +96,7 @@ int cublas_gemm_ex(cublasHandle_t handle, return 0; } -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ int cublas_gemm_ex(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, @@ -124,7 +124,7 @@ int cublas_gemm_ex(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_status status = rocblas_gemm_ex(handle, transa, transb, @@ -171,7 +171,7 @@ int cublas_gemm_ex(cublasHandle_t handle, algo); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -187,7 +187,7 @@ int cublas_gemm_ex(cublasHandle_t handle, return 0; } -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ int cublas_strided_batched_gemm(rocblas_handle handle, int m, int n, @@ -223,7 +223,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_status status = rocblas_gemm_strided_batched_ex(handle, op_A, @@ -280,7 +280,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, algo); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -297,7 +297,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, return 0; } -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ int cublas_strided_batched_gemm(rocblas_handle handle, int m, int n, @@ -333,7 +333,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_status status = rocblas_gemm_strided_batched_ex(handle, op_A, @@ -390,7 +390,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, algo); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index 11afa74fa008..b637bb710c67 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -145,7 +145,7 @@ BertTransformerLayer::~BertTransformerLayer() template void BertTransformerLayer::Initialize() { -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ if (std::is_same::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); #endif } diff --git a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu index e326c762c0f3..a06dbb48fd33 100644 --- a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu +++ b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu @@ -4,7 +4,7 @@ // DeepSpeed Team #include "conversion_utils.h" -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ #include "hip/hip_cooperative_groups.h" #else #include "cooperative_groups.h" @@ -13,7 +13,7 @@ #include "inference_cuda_layers.h" #include "memory_access_utils.h" -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ #include #endif @@ -99,7 +99,7 @@ __global__ void apply_rotary_pos_half(T* mixed_query, rope_theta, \ max_out_tokens); -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ #define LAUNCH_FOR_ALIGNMENT(ALIGNMENT) \ if (threads_per_head == 4) { \ LAUNCH_ROT_POS_EMB_HALF(4, ALIGNMENT); \ diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 4fd64112e148..b7277d1e1678 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -163,7 +163,7 @@ at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W) (T*)W.data_ptr(), (T*)Q.data_ptr(), (T*)O.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -216,7 +216,7 @@ void attention_unfused(at::Tensor& prev_key_cont, seq_len * k, seq_len * soft_len, bsz * heads, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -253,7 +253,7 @@ void attention_unfused(at::Tensor& prev_key_cont, seq_len * soft_len, seq_len * k, bsz * heads, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -388,7 +388,7 @@ void attention_unfused(T* prev_key_cont, seq_len * k, seq_len * soft_len, bsz * heads, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -421,7 +421,7 @@ void attention_unfused(T* prev_key_cont, seq_len * soft_len, seq_len * k, bsz * heads, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -886,7 +886,7 @@ void quantized_gemm(void* output, weight16, (T*)input, (T*)output, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -931,7 +931,7 @@ at::Tensor qkv_unfused_cublas(at::Tensor& output, (T*)weight.data_ptr(), workspace, (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1003,7 +1003,7 @@ std::vector ds_rms_qkv(at::Tensor& input, (T*)weight.data_ptr(), (T*)rms_norm.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1089,7 +1089,7 @@ void quantized_gemm(at::Tensor& output, (T*)weight16.data_ptr(), (T*)input.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1135,7 +1135,7 @@ at::Tensor ds_linear_layer(at::Tensor& input, (T*)weight.data_ptr(), (T*)input_cont.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1353,7 +1353,7 @@ at::Tensor ds_vector_matmul(at::Tensor& input, (T*)weight.data_ptr(), (T*)input.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1439,7 +1439,7 @@ at::Tensor mlp_unfused_cublas(at::Tensor& output, (T*)weight.data_ptr(), inp_norm, intermediate, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1483,7 +1483,7 @@ at::Tensor mlp_unfused_cublas(at::Tensor& output, (T*)weight1.data_ptr(), intermediate, (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1617,7 +1617,7 @@ std::vector ds_rms_mlp_gemm(at::Tensor& input, (T*)weight_interm.data_ptr(), (T*)inp_norm.data_ptr(), intermediate_ptr, -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1680,7 +1680,7 @@ std::vector ds_rms_mlp_gemm(at::Tensor& input, (T*)weight_out.data_ptr(), intermediate_ptr, (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard, #else CUBLAS_GEMM_DEFAULT_TENSOR_OP, @@ -1742,7 +1742,7 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, (T*)weight.data_ptr(), (T*)input.data_ptr(), (T*)intermediate.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1776,7 +1776,7 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, (T*)weight_out.data_ptr(), (T*)intermediate.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index c0e98e10b5ad..bb06cc149ef4 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -7,7 +7,7 @@ #include "conversion_utils.h" #include "inference_cuda_layers.h" -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ #include #endif #include diff --git a/csrc/transformer/inference/csrc/transform.cu b/csrc/transformer/inference/csrc/transform.cu index 06b29647ab2a..8bc5a94e16ee 100644 --- a/csrc/transformer/inference/csrc/transform.cu +++ b/csrc/transformer/inference/csrc/transform.cu @@ -3,7 +3,7 @@ // DeepSpeed Team -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ #include #endif #include "conversion_utils.h" diff --git a/csrc/transformer/inference/includes/inference_context.h b/csrc/transformer/inference/includes/inference_context.h index aaf568553560..378fd4e5e990 100644 --- a/csrc/transformer/inference/includes/inference_context.h +++ b/csrc/transformer/inference/includes/inference_context.h @@ -70,7 +70,7 @@ class InferenceContext { std::cerr << message << std::endl; throw std::runtime_error(message); } -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); #endif cudaEventCreate(&_comp1_event); diff --git a/csrc/transformer/inference/includes/inference_cublas_wrappers.h b/csrc/transformer/inference/includes/inference_cublas_wrappers.h index 3471ba04254f..640751b12c8f 100644 --- a/csrc/transformer/inference/includes/inference_cublas_wrappers.h +++ b/csrc/transformer/inference/includes/inference_cublas_wrappers.h @@ -13,12 +13,12 @@ #endif #include #include -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ #include #endif #include -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ int cublas_gemm_ex(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, @@ -49,7 +49,7 @@ int cublas_gemm_ex(cublasHandle_t handle, #endif { const int ldb = (b_stride == -1) ? ((transb == CUBLAS_OP_N) ? k : n) : b_stride; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_status status = rocblas_gemm_ex(handle, transa, transb, @@ -96,7 +96,7 @@ int cublas_gemm_ex(cublasHandle_t handle, algo); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -113,7 +113,7 @@ int cublas_gemm_ex(cublasHandle_t handle, } template -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ int cublas_gemm_ex(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, @@ -144,7 +144,7 @@ int cublas_gemm_ex(cublasHandle_t handle, #endif { const int ldb = (b_stride == -1) ? ((transb == CUBLAS_OP_N) ? k : n) : b_stride; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ constexpr auto rocblas_dtype_16 = std::is_same::value ? rocblas_datatype_f16_r : rocblas_datatype_bf16_r; rocblas_status status = rocblas_gemm_ex(handle, @@ -194,7 +194,7 @@ int cublas_gemm_ex(cublasHandle_t handle, algo); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -210,7 +210,7 @@ int cublas_gemm_ex(cublasHandle_t handle, return 0; } -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ int cublas_strided_batched_gemm(rocblas_handle handle, int m, int n, @@ -246,7 +246,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_status status = rocblas_gemm_strided_batched_ex(handle, op_A, @@ -303,7 +303,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, algo); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -321,7 +321,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, } template -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ int cublas_strided_batched_gemm(rocblas_handle handle, int m, int n, @@ -357,7 +357,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ constexpr auto rocblas_dtype_16 = std::is_same::value ? rocblas_datatype_f16_r : rocblas_datatype_bf16_r; rocblas_status status = @@ -417,7 +417,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, algo); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) {