csrc/includes/conversion_utils.h (1 addition, 1 deletion)

@@ -265,7 +265,7 @@ DS_D_INLINE float2 to(__nv_bfloat162 val)
 template <>
 DS_D_INLINE __half to(double val)
 {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     float val_f = __double2float_rn(val);
     return __float2half(val_f);
 #else
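Every hunk in this PR is the same mechanical rename: ROCm deprecated the __HIP_PLATFORM_HCC__ macro in favor of __HIP_PLATFORM_AMD__, so each platform guard is updated one-for-one. For code that must still build against older ROCm toolchains defining only the legacy spelling, a shim along these lines would accept either macro (a hypothetical sketch, not part of this change; DS_PLATFORM_AMD is an invented project-local name):

    // Hypothetical shim (not in this PR): fold the deprecated ROCm platform
    // macro into one project-local flag so guards only test a single name.
    #if defined(__HIP_PLATFORM_AMD__) || defined(__HIP_PLATFORM_HCC__)
    #define DS_PLATFORM_AMD 1
    #else
    #define DS_PLATFORM_AMD 0
    #endif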
csrc/includes/cublas_wrappers.h (5 additions, 5 deletions)

@@ -10,7 +10,7 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
 #include <mma.h>
 #endif
 #include <stdio.h>
@@ -26,7 +26,7 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const float* A,
                    const float* B,
                    float* C,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                    rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
@@ -43,7 +43,7 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const __half* A,
                    const __half* B,
                    __half* C,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                    rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
@@ -64,7 +64,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_B,
                                 int stride_C,
                                 int batch,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                 rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                                 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
@@ -85,7 +85,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_B,
                                 int stride_C,
                                 int batch,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                 rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
 #else
                                 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
csrc/includes/ds_kernel_utils.h (3 additions, 3 deletions)

@@ -15,15 +15,15 @@ used throughout the codebase.
 #define DS_HD_INLINE __host__ __device__ __forceinline__
 #define DS_D_INLINE __device__ __forceinline__

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__

 // constexpr variant of warpSize for templating
 constexpr int hw_warp_size = 64;
 #define HALF_PRECISION_AVAILABLE = 1
 #include <hip/hip_cooperative_groups.h>
 #include <hip/hip_fp16.h>

-#else  // !__HIP_PLATFORM_HCC__
+#else  // !__HIP_PLATFORM_AMD__

 // constexpr variant of warpSize for templating
 constexpr int hw_warp_size = 32;
@@ -40,7 +40,7 @@ constexpr int hw_warp_size = 32;
 #include <cooperative_groups.h>
 #include <cuda_fp16.h>

-#endif  //__HIP_PLATFORM_HCC__
+#endif  //__HIP_PLATFORM_AMD__

 inline int next_pow2(const int val)
 {
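The hw_warp_size constant exists because warpSize is not constexpr in either CUDA or HIP, yet cooperative-groups tile widths must be compile-time constants. A minimal sketch of how such a constant parameterizes a warp reduction, assuming ds_kernel_utils.h is included so hw_warp_size is 64 on AMD and 32 on NVIDIA (warp_sum is an invented helper, not from this PR):

    namespace cg = cooperative_groups;

    // Sketch: a full-warp sum whose tile width is fixed at compile time by
    // hw_warp_size instead of the runtime warpSize builtin. A 64-wide tile
    // is only valid with HIP's cooperative groups, which is exactly why the
    // constant is platform-dependent.
    DS_D_INLINE float warp_sum(cg::thread_block& tb, float val)
    {
        cg::thread_block_tile<hw_warp_size> warp = cg::tiled_partition<hw_warp_size>(tb);
        for (int stride = hw_warp_size / 2; stride > 0; stride /= 2) {
            val += warp.shfl_down(val, stride);
        }
        return val;  // lane 0 holds the warp total
    }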
csrc/includes/feed_forward.h (3 additions, 3 deletions)

@@ -48,7 +48,7 @@ class FeedForward {
                         weights,
                         input_ptr,
                         out,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                         rocblas_gemm_algo(config_.gemm_algos[0]));
 #else
                         cublasGemmAlgo_t(config_.gemm_algos[0]));
@@ -77,7 +77,7 @@ class FeedForward {
                         input_ptr,
                         out_grad,
                         weights_grad,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                         rocblas_gemm_algo(config_.gemm_algos[1]));
 #else
                         cublasGemmAlgo_t(config_.gemm_algos[1]));
@@ -94,7 +94,7 @@ class FeedForward {
                         weights,
                         out_grad,
                         inp_grad_out,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                         rocblas_gemm_algo(config_.gemm_algos[2]));
 #else
                         cublasGemmAlgo_t(config_.gemm_algos[2]));
csrc/includes/gemm_test.h (10 additions, 10 deletions)

@@ -6,10 +6,10 @@
 #pragma once

 #include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
 #include <cuda_profiler_api.h>
 #endif
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 #include <rocblas/rocblas.h>
 #endif
 #include <array>
@@ -67,7 +67,7 @@ class GemmTest {
                        B,
                        A,
                        C,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -86,7 +86,7 @@ class GemmTest {
                        A,
                        C,
                        B,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -105,7 +105,7 @@ class GemmTest {
                        B,
                        C,
                        A,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -121,7 +121,7 @@ class GemmTest {
         float fast_latency = (std::numeric_limits<float>::max)();
         int fast_algo = 0;

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
         for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
 #else
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
@@ -211,7 +211,7 @@ class StridedGemmTest {
                        stride_b,
                        stride_c,
                        bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -245,7 +245,7 @@ class StridedGemmTest {
                        stride_b,
                        stride_c,
                        bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -276,7 +276,7 @@ class StridedGemmTest {
                        stride_b,
                        stride_c,
                        bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                        static_cast<rocblas_gemm_algo>(algo));
 #else
                        static_cast<cublasGemmAlgo_t>(algo));
@@ -292,7 +292,7 @@ class StridedGemmTest {
         float fast_latency = (std::numeric_limits<float>::max)();
         int fast_algo = 0;

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
         for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
 #else
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
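GemmTest and StridedGemmTest brute-force the fastest GEMM algorithm by timing every candidate and keeping the quickest; on ROCm the loops above collapse to the single rocblas_gemm_algo_standard entry, while on CUDA they sweep the tensor-op algorithm range. A condensed sketch of that selection pattern, assuming nothing beyond the CUDA runtime (run_gemm_with and select_fastest_algo are hypothetical stand-ins, not the file's actual interface):

    #include <cuda_runtime.h>
    #include <limits>

    // Hypothetical stand-in for one cublas_gemm_ex call with the given algo.
    static void run_gemm_with(int /*algo*/) { /* launch one GEMM here */ }

    // Sketch: time `loops` GEMMs per algorithm with CUDA events and keep the
    // fastest, mirroring the fast_latency / fast_algo logic in gemm_test.h.
    int select_fastest_algo(int first_algo, int last_algo, int loops)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = first_algo;
        for (int algo = first_algo; algo <= last_algo; ++algo) {
            cudaEvent_t start, stop;
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start);
            for (int i = 0; i < loops; ++i) run_gemm_with(algo);
            cudaEventRecord(stop);
            cudaEventSynchronize(stop);
            float ms = 0.0f;
            cudaEventElapsedTime(&ms, start, stop);
            if (ms < fast_latency) {
                fast_latency = ms;
                fast_algo = algo;
            }
            cudaEventDestroy(start);
            cudaEventDestroy(stop);
        }
        return fast_algo;
    }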
csrc/includes/general_kernels.h (1 addition, 1 deletion)

@@ -8,7 +8,7 @@
 #include <stdio.h>
 #include <stdlib.h>

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 #include <hip/hip_cooperative_groups.h>
 #else
 #include <cooperative_groups.h>
csrc/includes/quantizer.h (1 addition, 1 deletion)

@@ -5,7 +5,7 @@

 #pragma once

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 #include <hip/hip_cooperative_groups.h>
 #else
 #include <cooperative_groups.h>
csrc/includes/reduction_utils.h (4 additions, 4 deletions)

@@ -280,7 +280,7 @@ DS_D_INLINE __half init<ROpType::Max>()
 template <>
 DS_D_INLINE __half2 init<ROpType::Add>()
 {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     return __half2{_Float16_2{0x0000, 0x0000}};
 #else
     constexpr __half2_raw zero = {0x0000, 0x0000};
@@ -291,7 +291,7 @@ DS_D_INLINE __half2 init<ROpType::Add>()
 template <>
 DS_D_INLINE __half2 init<ROpType::Min>()
 {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     return __half2{_Float16_2{0x7C00, 0x7C00}};
 #else
     constexpr __half2_raw inf = {0x7C00, 0x7C00};
@@ -302,7 +302,7 @@ DS_D_INLINE __half2 init<ROpType::Min>()
 template <>
 DS_D_INLINE __half2 init<ROpType::Max>()
 {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     return __half2{_Float16_2{0xFC00, 0xFC00}};
 #else
     constexpr __half2_raw neg_inf = {0xFC00, 0xFC00};
@@ -414,7 +414,7 @@ DS_D_INLINE void _block(cg::thread_block& tb,
     // Unused when `partition_size == 1` or total_warps == 1
     __shared__ float reduce_buffer[max_warps * elems];

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
     const int total_threads = blockDim.x * blockDim.y * blockDim.z;
     const int running_warps = total_threads / hw_warp_size;
 #else
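The guarded branches here differ because the two toolchains expose different __half2 internals: CUDA's __half2_raw admits IEEE-754 binary16 bit patterns directly (0x7C00 is +inf and 0xFC00 is -inf, the identity elements for min- and max-reductions), while ROCm's __half2 wraps a native _Float16_2 vector. A minimal sketch of the CUDA-side pattern, assuming only cuda_fp16.h (neg_inf_half2 is an invented helper name):

    #include <cuda_fp16.h>

    // Sketch (CUDA path): build a reduction identity from raw binary16 bit
    // patterns; 0xFC00 is -inf, so any operand wins a max() against it.
    __device__ __half2 neg_inf_half2()
    {
        constexpr __half2_raw neg_inf = {0xFC00, 0xFC00};
        return __half2(neg_inf);
    }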
csrc/includes/strided_batch_gemm.h (4 additions, 4 deletions)

@@ -77,7 +77,7 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                     rocblas_gemm_algo(_config.gemm_algos[0]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[0]));
@@ -105,7 +105,7 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     _config.batch_size,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                     rocblas_gemm_algo(_config.gemm_algos[0]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[0]));
@@ -149,7 +149,7 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                     rocblas_gemm_algo(_config.gemm_algos[1]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[1]));
@@ -178,7 +178,7 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
                                     rocblas_gemm_algo(_config.gemm_algos[2]));
 #else
                                     cublasGemmAlgo_t(_config.gemm_algos[2]));
csrc/lamb/fused_lamb_cuda_kernel.cu (2 additions, 2 deletions)

@@ -17,7 +17,7 @@
 #include <iostream>

 // #include <helper_functions.h>
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION > 305
 #include <hip/hip_cooperative_groups.h>
 #else
 #include <cooperative_groups.h>
@@ -109,7 +109,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)

     cg::sync(cta);

-#if (__CUDA_ARCH__ >= 300) || (defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 502)
+#if (__CUDA_ARCH__ >= 300) || (defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 502)
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();

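The second hunk guards the shuffle-based tail of the block reduction, which relies on cooperative-groups coalesced_threads(); per the guard, that path needs sm_30 or newer on CUDA and HIP 5.02 or newer on ROCm. A rough sketch of the pattern being guarded, assuming the cooperative-groups header included above and a cg namespace alias (s_a holding one partial per warp and the finish_reduction name are assumptions for illustration):

    // Sketch: after a block-wide reduction into shared memory, the first 32
    // threads finish with shuffles over a coalesced group.
    __device__ void finish_reduction(float* s_a, float* g_a, int tid)
    {
        if (tid < 32) {
            cg::coalesced_group active = cg::coalesced_threads();
            float v = s_a[tid];
            // Halve the active span each step; lane 0 accumulates the total.
            for (int offset = active.size() / 2; offset > 0; offset /= 2) {
                v += active.shfl_down(v, offset);
            }
            if (active.thread_rank() == 0) g_a[blockIdx.x] = v;
        }
    }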
csrc/quantization/fake_quantizer.cu (4 additions, 4 deletions)

@@ -11,7 +11,7 @@ namespace cg = cooperative_groups;

 __global__ void fake_quantize_kernel(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__)

     cg::thread_block b = cg::this_thread_block();  // tb
     cg::thread_block_tile<32> g =
@@ -197,7 +197,7 @@ __global__ void sr_fake_quantize_kernel(__half* vals,
                                         int num_bits,
                                         std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__)

     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -475,7 +475,7 @@ template void launch_sr_fake_quantize_kernel(__half* vals,

 __global__ void fake_quantize_kernel_asym(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__)

     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -720,7 +720,7 @@ __global__ void sr_fake_quantize_kernel_asym(__half* vals,
                                              int num_bits,
                                              std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_AMD__)

     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
csrc/random_ltd/token_sort.cu (1 addition, 1 deletion)

@@ -15,7 +15,7 @@ constexpr int granularity = 16;
 constexpr int mem_vals = granularity / sizeof(int32_t);
 constexpr int max_buffer_size = (threads + 1) * mem_vals;

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 constexpr int warp_size = 64;
 #else
 constexpr int warp_size = 32;
csrc/spatial/includes/spatial_cuda_layers.h (1 addition, 1 deletion)

@@ -9,7 +9,7 @@
 #define HALF_PRECISION_AVAILABLE = 1
 #endif

-#ifdef __HIP_PLATFORM_HCC__
+#ifdef __HIP_PLATFORM_AMD__
 #include <hip/hip_cooperative_groups.h>
 #else
 #include <cooperative_groups.h>