diff --git a/mmcv/ops/csrc/ms_deform_attn_cuda_kernel.cuh b/mmcv/ops/csrc/ms_deform_attn_cuda_kernel.cuh
index baa46ae360..c888fabb3e 100644
--- a/mmcv/ops/csrc/ms_deform_attn_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/ms_deform_attn_cuda_kernel.cuh
@@ -8,15 +8,12 @@
 *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 **************************************************************************************************
 */
-#ifndef OPENMMLAB_PRS_ADD_MSD_ATTEN_MMCV_OPS_CSRC_MS_DEFORM_ATTN_CUDA_KERNEL_CUH_
-#define OPENMMLAB_PRS_ADD_MSD_ATTEN_MMCV_OPS_CSRC_MS_DEFORM_ATTN_CUDA_KERNEL_CUH_
+#ifndef DEFORM_ATTN_CUDA_KERNEL
+#define DEFORM_ATTN_CUDA_KERNEL

+#include "common_cuda_helper.hpp"
 #include "pytorch_cuda_helper.hpp"

-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 const int CUDA_NUM_THREADS = 1024;
 inline int GET_BLOCKS(const int N, const int num_threads) {
   return (N + num_threads - 1) / num_threads;
@@ -213,7 +210,7 @@ __global__ void ms_deformable_im2col_gpu_kernel(
     const int spatial_size, const int num_heads, const int channels,
     const int num_levels, const int num_query, const int num_point,
     scalar_t *data_col) {
-  CUDA_KERNEL_LOOP(index, n) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
     int _temp = index;
     const int c_col = _temp % channels;
     _temp /= channels;
@@ -271,7 +268,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
     __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
     __shared__ scalar_t cache_grad_attn_weight[blockSize];
     unsigned int tid = threadIdx.x;
@@ -362,7 +359,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
     __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
     __shared__ scalar_t cache_grad_attn_weight[blockSize];
     unsigned int tid = threadIdx.x;
@@ -455,7 +452,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
     extern __shared__ int _s[];
     scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
     scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
@@ -547,7 +544,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
     extern __shared__ int _s[];
     scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
     scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
@@ -650,7 +647,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
     extern __shared__ int _s[];
     scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
     scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
@@ -753,7 +750,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
     int _temp = index;
     const int c_col = _temp % channels;
     _temp /= channels;
@@ -807,4 +804,4 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
     }
   }
 }
-#endif // OPENMMLAB_PRS_ADD_MSD_ATTEN_MMCV_OPS_CSRC_MS_DEFORM_ATTN_CUDA_KERNEL_CUH_
+#endif // DEFORM_ATTN_CUDA_KERNEL
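
Note: with this change the grid-stride loop macro is taken from common_cuda_helper.hpp (as CUDA_1D_KERNEL_LOOP) instead of being defined locally. A minimal sketch of the pattern follows, assuming the shared header defines the macro the same way as the CUDA_KERNEL_LOOP removed above; scale_kernel and the launch shown are illustrative only and not part of this patch.

#include <cuda_runtime.h>

// Grid-stride loop: each thread walks the flattened index space in steps of
// the total thread count, so any grid size covers all n elements. Assumed to
// match the CUDA_1D_KERNEL_LOOP provided by common_cuda_helper.hpp.
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// Same launch helpers as in the header above.
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads) {
  return (N + num_threads - 1) / num_threads;
}

// Illustrative kernel (hypothetical, not from the patch): scales n elements in place.
template <typename scalar_t>
__global__ void scale_kernel(const int n, const scalar_t alpha,
                             scalar_t *data) {
  CUDA_1D_KERNEL_LOOP(index, n) { data[index] *= alpha; }
}

// Example launch using the header's helpers:
//   scale_kernel<float>
//       <<<GET_BLOCKS(n, CUDA_NUM_THREADS), CUDA_NUM_THREADS>>>(n, 2.0f, d_data);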