csrc/includes/conversion_utils.h (1 addition, 1 deletion)

@@ -265,7 +265,7 @@ DS_D_INLINE float2 to(__nv_bfloat162 val)
 template <>
 DS_D_INLINE __half to(double val)
 {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     float val_f = __double2float_rn(val);
     return __float2half(val_f);
 #else
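Every hunk in this PR is the same mechanical rename: ROCm deprecated the __HIP_PLATFORM_HCC__ macro in favor of __HIP_PLATFORM_AMD__, so each platform guard is updated one-for-one. For code that must still build against older ROCm toolchains defining only the legacy spelling, a shim along these lines would accept either macro (a hypothetical sketch, not part of this change; DS_PLATFORM_AMD is an invented project-local name):

    // Hypothetical shim (not in this PR): fold the deprecated ROCm platform
    // macro into one project-local flag so guards only test a single name.
    #if defined(__HIP_PLATFORM_AMD__) || defined(__HIP_PLATFORM_HCC__)
    #define DS_PLATFORM_AMD 1
    #else
    #define DS_PLATFORM_AMD 0
    #endif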
csrc/includes/cublas_wrappers.h (5 additions, 5 deletions)

@@ -10,7 +10,7 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
 #include <mma.h>
 #endif
 #include <stdio.h>
@@ -26,7 +26,7 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const float* A,
                    const float* B,
                    float* C,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                    rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
@@ -43,7 +43,7 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const __half* A,
                    const __half* B,
                    __half* C,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                    rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
@@ -64,7 +64,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_B,
                                 int stride_C,
                                 int batch,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                 rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                                 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
@@ -85,7 +85,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_B,
                                 int stride_C,
                                 int batch,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                 rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                                 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
csrc/includes/ds_kernel_utils.h (3 additions, 3 deletions)

@@ -15,15 +15,15 @@ used throughout the codebase.
 #define DS_HD_INLINE __host__ __device__ __forceinline__
 #define DS_D_INLINE __device__ __forceinline__

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__

 // constexpr variant of warpSize for templating
 constexpr int hw_warp_size = 64;
 #define HALF_PRECISION_AVAILABLE = 1
 #include <hip/hip_cooperative_groups.h>
 #include <hip/hip_fp16.h>

-#else  // !__HIP_PLATFORM_HCC__
+#else  // !__HIP_PLATFORM_AMD__

 // constexpr variant of warpSize for templating
 constexpr int hw_warp_size = 32;
@@ -40,7 +40,7 @@ constexpr int hw_warp_size = 32;
 #include <cooperative_groups.h>
 #include <cuda_fp16.h>

-#endif  //__HIP_PLATFORM_HCC__
+#endif  //__HIP_PLATFORM_AMD__

 inline int next_pow2(const int val)
 {
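The hw_warp_size constant exists because warpSize is not constexpr in either CUDA or HIP, yet cooperative-groups tile widths must be compile-time constants. A minimal sketch of how such a constant parameterizes a warp reduction, assuming ds_kernel_utils.h is included so hw_warp_size is 64 on AMD and 32 on NVIDIA (warp_sum is an invented helper, not from this PR):

    namespace cg = cooperative_groups;

    // Sketch: a full-warp sum whose tile width is fixed at compile time by
    // hw_warp_size instead of the runtime warpSize builtin. A 64-wide tile
    // is only valid with HIP's cooperative groups, which is exactly why the
    // constant is platform-dependent.
    DS_D_INLINE float warp_sum(cg::thread_block& tb, float val)
    {
        cg::thread_block_tile<hw_warp_size> warp = cg::tiled_partition<hw_warp_size>(tb);
        for (int stride = hw_warp_size / 2; stride > 0; stride /= 2) {
            val += warp.shfl_down(val, stride);
        }
        return val;  // lane 0 holds the warp total
    }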
csrc/includes/feed_forward.h (3 additions, 3 deletions)

@@ -48,7 +48,7 @@ class FeedForward {
                         weights,
                         input_ptr,
                         out,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                         rocblas_gemm_algo(config_.gemm_algos[0]));
 #else
                         cublasGemmAlgo_t(config_.gemm_algos[0]));
@@ -77,7 +77,7 @@ class FeedForward {
                         input_ptr,
                         out_grad,
                         weights_grad,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                         rocblas_gemm_algo(config_.gemm_algos[1]));
 #else
                         cublasGemmAlgo_t(config_.gemm_algos[1]));
@@ -94,7 +94,7 @@ class FeedForward {
                         weights,
                         out_grad,
                         inp_grad_out,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                         rocblas_gemm_algo(config_.gemm_algos[2]));
 #else
                         cublasGemmAlgo_t(config_.gemm_algos[2]));
csrc/includes/gemm_test.h (10 additions, 10 deletions)

@@ -6,10 +6,10 @@
 #pragma once

 #include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
 #include <cuda_profiler_api.h>
 #endif
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 #include <rocblas/rocblas.h>
 #endif
 #include <array>
@@ -67,7 +67,7 @@ class GemmTest {
                        B,
                        A,
                        C,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -86,7 +86,7 @@ class GemmTest {
                        A,
                        C,
                        B,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -105,7 +105,7 @@ class GemmTest {
                        B,
                        C,
                        A,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -121,7 +121,7 @@ class GemmTest {
         float fast_latency = (std::numeric_limits<float>::max)();
         int fast_algo = 0;

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
         for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
 #else
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
@@ -211,7 +211,7 @@ class StridedGemmTest {
                        stride_b,
                        stride_c,
                        bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -245,7 +245,7 @@ class StridedGemmTest {
                        stride_b,
                        stride_c,
                        bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -276,7 +276,7 @@ class StridedGemmTest {
                        stride_b,
                        stride_c,
                        bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -292,7 +292,7 @@ class StridedGemmTest {
         float fast_latency = (std::numeric_limits<float>::max)();
         int fast_algo = 0;

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
         for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
 #else
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
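GemmTest and StridedGemmTest brute-force the fastest GEMM algorithm by timing every candidate and keeping the quickest; on ROCm the loops above collapse to the single rocblas_gemm_algo_standard entry, while on CUDA they sweep the tensor-op algorithm range. A condensed sketch of that selection pattern, assuming nothing beyond the CUDA runtime (run_gemm_with and select_fastest_algo are hypothetical stand-ins, not the file's actual interface):

    #include <cuda_runtime.h>
    #include <limits>

    // Hypothetical stand-in for one cublas_gemm_ex call with the given algo.
    static void run_gemm_with(int /*algo*/) { /* launch one GEMM here */ }

    // Sketch: time `loops` GEMMs per algorithm with CUDA events and keep the
    // fastest, mirroring the fast_latency / fast_algo logic in gemm_test.h.
    int select_fastest_algo(int first_algo, int last_algo, int loops)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = first_algo;
        for (int algo = first_algo; algo <= last_algo; ++algo) {
            cudaEvent_t start, stop;
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start);
            for (int i = 0; i < loops; ++i) run_gemm_with(algo);
            cudaEventRecord(stop);
            cudaEventSynchronize(stop);
            float ms = 0.0f;
            cudaEventElapsedTime(&ms, start, stop);
            if (ms < fast_latency) {
                fast_latency = ms;
                fast_algo = algo;
            }
            cudaEventDestroy(start);
            cudaEventDestroy(stop);
        }
        return fast_algo;
    }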
csrc/includes/general_kernels.h (1 addition, 1 deletion)

@@ -8,7 +8,7 @@
 #include <stdio.h>
 #include <stdlib.h>

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 #include <hip/hip_cooperative_groups.h>
 #else
 #include <cooperative_groups.h>
csrc/includes/quantizer.h (1 addition, 1 deletion)

@@ -5,7 +5,7 @@

 #pragma once

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 #include <hip/hip_cooperative_groups.h>
 #else
 #include <cooperative_groups.h>
csrc/includes/reduction_utils.h (4 additions, 4 deletions)

@@ -280,7 +280,7 @@ DS_D_INLINE __half init<ROpType::Max>()
 template <>
 DS_D_INLINE __half2 init<ROpType::Add>()
 {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     return __half2{_Float16_2{0x0000, 0x0000}};
 #else
     constexpr __half2_raw zero = {0x0000, 0x0000};
@@ -291,7 +291,7 @@ DS_D_INLINE __half2 init<ROpType::Add>()
 template <>
 DS_D_INLINE __half2 init<ROpType::Min>()
 {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     return __half2{_Float16_2{0x7C00, 0x7C00}};
 #else
     constexpr __half2_raw inf = {0x7C00, 0x7C00};
@@ -302,7 +302,7 @@ DS_D_INLINE __half2 init<ROpType::Min>()
 template <>
 DS_D_INLINE __half2 init<ROpType::Max>()
 {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     return __half2{_Float16_2{0xFC00, 0xFC00}};
 #else
     constexpr __half2_raw neg_inf = {0xFC00, 0xFC00};
@@ -414,7 +414,7 @@ DS_D_INLINE void _block(cg::thread_block& tb,
     // Unused when `partition_size == 1` or total_warps == 1
     __shared__ float reduce_buffer[max_warps * elems];

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     const int total_threads = blockDim.x * blockDim.y * blockDim.z;
     const int running_warps = total_threads / hw_warp_size;
 #else
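The guarded branches here differ because the two toolchains expose different __half2 internals: CUDA's __half2_raw admits IEEE-754 binary16 bit patterns directly (0x7C00 is +inf and 0xFC00 is -inf, the identity elements for min- and max-reductions), while ROCm's __half2 wraps a native _Float16_2 vector. A minimal sketch of the CUDA-side pattern, assuming only cuda_fp16.h (neg_inf_half2 is an invented helper name):

    #include <cuda_fp16.h>

    // Sketch (CUDA path): build a reduction identity from raw binary16 bit
    // patterns; 0xFC00 is -inf, so any operand wins a max() against it.
    __device__ __half2 neg_inf_half2()
    {
        constexpr __half2_raw neg_inf = {0xFC00, 0xFC00};
        return __half2(neg_inf);
    }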
csrc/includes/strided_batch_gemm.h (4 additions, 4 deletions)

@@ -77,7 +77,7 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                     rocblas_gemm_algo(_config.gemm_algos[0]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[0]));
@@ -105,7 +105,7 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     _config.batch_size,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                     rocblas_gemm_algo(_config.gemm_algos[0]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[0]));
@@ -149,7 +149,7 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                     rocblas_gemm_algo(_config.gemm_algos[1]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[1]));
@@ -178,7 +178,7 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                     rocblas_gemm_algo(_config.gemm_algos[2]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[2]));
csrc/lamb/fused_lamb_cuda_kernel.cu (2 additions, 2 deletions)

@@ -17,7 +17,7 @@
 #include <iostream>

 // #include <helper_functions.h>
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION > 305
 #include <hip/hip_cooperative_groups.h>
 #else
 #include <cooperative_groups.h>
@@ -109,7 +109,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)

     cg::sync(cta);

-#if (__CUDA_ARCH__ >= 300) || (defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 502)
+#if (__CUDA_ARCH__ >= 300) || (defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 502)
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();

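The second hunk guards the shuffle-based tail of the block reduction, which relies on cooperative-groups coalesced_threads(); per the guard, that path needs sm_30 or newer on CUDA and HIP 5.02 or newer on ROCm. A rough sketch of the pattern being guarded, assuming the cooperative-groups header included above and a cg namespace alias (s_a holding one partial per warp and the finish_reduction name are assumptions for illustration):

    // Sketch: after a block-wide reduction into shared memory, the first 32
    // threads finish with shuffles over a coalesced group.
    __device__ void finish_reduction(float* s_a, float* g_a, int tid)
    {
        if (tid < 32) {
            cg::coalesced_group active = cg::coalesced_threads();
            float v = s_a[tid];
            // Halve the active span each step; lane 0 accumulates the total.
            for (int offset = active.size() / 2; offset > 0; offset /= 2) {
                v += active.shfl_down(v, offset);
            }
            if (active.thread_rank() == 0) g_a[blockIdx.x] = v;
        }
    }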
csrc/quantization/fake_quantizer.cu (4 additions, 4 deletions)

@@ -11,7 +11,7 @@ namespace cg = cooperative_groups;

 __global__ void fake_quantize_kernel(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__)

     cg::thread_block b = cg::this_thread_block();  // tb
     cg::thread_block_tile<32> g =
@@ -197,7 +197,7 @@ __global__ void sr_fake_quantize_kernel(__half* vals,
                                         int num_bits,
                                         std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__)

     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -475,7 +475,7 @@ template void launch_sr_fake_quantize_kernel(__half* vals,

 __global__ void fake_quantize_kernel_asym(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__)

     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -720,7 +720,7 @@ __global__ void sr_fake_quantize_kernel_asym(__half* vals,
                                              int num_bits,
                                              std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__)

     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
csrc/random_ltd/token_sort.cu (1 addition, 1 deletion)

@@ -15,7 +15,7 @@ constexpr int granularity = 16;
 constexpr int mem_vals = granularity / sizeof(int32_t);
 constexpr int max_buffer_size = (threads + 1) * mem_vals;

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 constexpr int warp_size = 64;
 #else
 constexpr int warp_size = 32;
csrc/spatial/includes/spatial_cuda_layers.h (1 addition, 1 deletion)

@@ -9,7 +9,7 @@
 #define HALF_PRECISION_AVAILABLE = 1
 #endif

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 #include <hip/hip_cooperative_groups.h>
 #else
 #include <cooperative_groups.h>