[PaddlePaddle Hackathon 4 No.35] Optimize the GPU compute performance of the prelu op for Paddle #51131
@@ -28,157 +28,83 @@ inline static int PADDLE_GET_BLOCKS(const int N) {
}

template <typename T>
__global__ void PReluChannelFirstWiseKernel(const T *input,
                                            const T *alpha,
                                            T *output,
                                            size_t channel_num,
                                            size_t plane_size,
                                            size_t numel) {
  CUDA_KERNEL_LOOP(index, numel) {
    size_t temp = index / plane_size;
    size_t channel_index = temp % channel_num;
    T scale = alpha[channel_index];
    T x = input[index];
    T zero = static_cast<T>(0);
    output[index] = (x > zero) ? x : scale * x;
struct PReluChannelFirstWiseCUDAFunctor {
  const T* x_;
  const T* alpha_;
  size_t channel_num_;
  size_t plane_size_;
  int numel_;
  const T zero = static_cast<T>(0);
Review comment: This does not need to be a member variable; it can be … [comment truncated]
Reply: @JamesLim-sy Fixed.
Reply: @JamesLim-sy Using constexpr or const here causes a compile-time error, so I have removed it for now.
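For context on the thread above, a minimal sketch of the alternative being discussed: materializing the zero constant locally inside operator() instead of keeping it as a struct member. This is illustrative only, not the merged code; it reuses the functor name and the HOSTDEVICE macro from the diff, and whether a const/constexpr member compiles for every dtype (e.g. phi::dtype::float16) is exactly the open question in the thread.

template <typename T>
struct PReluChannelFirstWiseCUDAFunctor {
  const T* x_;
  const T* alpha_;
  size_t channel_num_;
  size_t plane_size_;

  HOSTDEVICE inline T operator()(const unsigned int n) const {
    // Built locally on every call; no const/constexpr member is needed,
    // which sidesteps the in-class initialization issue mentioned above.
    const T zero = static_cast<T>(0);
    size_t channel_index = (n / plane_size_) % channel_num_;
    T scale = alpha_[channel_index];
    T x = x_[n];
    return (x > zero) ? x : scale * x;
  }
};

In practice a compiler should fold either form to the same device code, so the choice is mainly about style and about which variant compiles across all supported dtypes.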
  HOSTDEVICE inline PReluChannelFirstWiseCUDAFunctor(const T* x,
                                                     const T* alpha,
                                                     int numel,
                                                     size_t channel_num,
                                                     size_t plane_size)
      : x_(x),
        alpha_(alpha),
        numel_(numel),
        channel_num_(channel_num),
        plane_size_(plane_size) {}

  HOSTDEVICE inline T operator()(const unsigned int n) const {
    size_t temp = n / plane_size_;
    size_t channel_index = temp % channel_num_;
    T scale = alpha_[channel_index];
    T x = x_[n];
    return (x > zero) ? x : scale * x;
  }
}

template <typename T>
__global__ void PReluChannelLastWiseKernel(const T *input,
                                           const T *alpha,
                                           T *output,
                                           size_t channel_num,
                                           size_t numel) {
  CUDA_KERNEL_LOOP(index, numel) {
    size_t channel_index = index % channel_num;
    T scale = alpha[channel_index];
    T x = input[index];
    T zero = static_cast<T>(0);
    output[index] = (x > zero) ? x : scale * x;
  }
}

template <typename T>
__global__ void PReluElementWiseKernel(const T *input,
                                       const T *alpha,
                                       T *output,
                                       size_t spatial_size,
                                       size_t numel) {
  CUDA_KERNEL_LOOP(index, numel) {
    size_t element_index = index % spatial_size;
    T scale = alpha[element_index];
    T x = input[index];
    T zero = static_cast<T>(0);
    output[index] = (x > zero) ? x : scale * x;
  }
}

template <typename T>
__global__ void PReluScalarKernel(const T *input,
                                  const T *alpha,
                                  T *output,
                                  size_t numel) {
  T scale = alpha[0];
  CUDA_KERNEL_LOOP(index, numel) {
    T x = input[index];
    T zero = static_cast<T>(0);
    output[index] = (x > zero) ? x : scale * x;
  }
}

template <typename T>
class PreluChannelWiseDirectCUDAFunctor {
 public:
  void operator()(gpuStream_t stream,
                  const T *input,
                  const T *alpha,
                  T *output,
                  size_t batch_size,
                  size_t channel,
                  bool channel_last,
                  size_t numel);
};

template <typename T>
class PreluElementWiseDirectCUDAFunctor {
 public:
  void operator()(gpuStream_t stream,
                  const T *input,
                  const T *alpha,
                  T *output,
                  size_t batch_size,
                  size_t numel);
struct PReluChannelLastWiseCUDAFunctor {
  const T* x_;
  const T* alpha_;
  size_t channel_num_;
  const T zero = static_cast<T>(0);

  HOSTDEVICE inline PReluChannelLastWiseCUDAFunctor(const T* x,
                                                    const T* alpha,
                                                    size_t channel_num)
      : x_(x), alpha_(alpha), channel_num_(channel_num) {}

  HOSTDEVICE inline T operator()(const unsigned int n) const {
    size_t channel_index = n % channel_num_;
    T scale = alpha_[channel_index];
    T x = x_[n];
    return (x > zero) ? x : scale * x;
  }
};

template <typename T>
class PreluScalarDirectCUDAFunctor {
 public:
  void operator()(gpuStream_t stream,
                  const T *input,
                  const T *alpha,
                  T *output,
                  size_t numel);
struct PreluElementWiseDirectCUDAFunctor {
  const T* x_;
  const T* alpha_;
  size_t spatial_size_;
  T zero = static_cast<T>(0);

  HOSTDEVICE inline PreluElementWiseDirectCUDAFunctor(const T* x,
                                                      const T* alpha,
                                                      size_t spatial_size)
      : x_(x), alpha_(alpha), spatial_size_(spatial_size) {}

  HOSTDEVICE inline T operator()(const unsigned int n) const {
    size_t element_index = n % spatial_size_;
    T scale = alpha_[element_index];
    T x = x_[n];
    return (x > zero) ? x : scale * x;
  }
};

template <typename T>
void PreluChannelWiseDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
                                                      const T *input,
                                                      const T *alpha,
                                                      T *output,
                                                      size_t batch_size,
                                                      size_t channel,
                                                      bool channel_last,
                                                      size_t numel) {
  if (channel_last) {
    PReluChannelLastWiseKernel<<<PADDLE_GET_BLOCKS(numel),
                                 CUDA_NUM_THREADS,
                                 0,
                                 stream>>>(
        input, alpha, output, channel, numel);
  } else {
    PReluChannelFirstWiseKernel<<<PADDLE_GET_BLOCKS(numel),
                                  CUDA_NUM_THREADS,
                                  0,
                                  stream>>>(
        input, alpha, output, channel, numel / batch_size / channel, numel);
struct PreluScalarDirectCUDAFunctor {
  const T* scalar_;
  const T zero = static_cast<T>(0);
  HOSTDEVICE inline PreluScalarDirectCUDAFunctor(const T* scalar)
      : scalar_(scalar) {}
  HOSTDEVICE inline T operator()(const T x) const {
    return (x > zero) ? x : scalar_[0] * x;
  }
}
template <typename T>
void PreluElementWiseDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
                                                      const T *input,
                                                      const T *alpha,
                                                      T *output,
                                                      size_t batch_size,
                                                      size_t numel) {
  PReluElementWiseKernel<<<PADDLE_GET_BLOCKS(numel),
                           CUDA_NUM_THREADS,
                           0,
                           stream>>>(
      input, alpha, output, numel / batch_size, numel);
}

template <typename T>
void PreluScalarDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
                                                 const T *input,
                                                 const T *alpha,
                                                 T *output,
                                                 size_t numel) {
  PReluScalarKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
      input, alpha, output, numel);
}

template class PreluChannelWiseDirectCUDAFunctor<float>;
template class PreluChannelWiseDirectCUDAFunctor<phi::dtype::float16>;
template class PreluChannelWiseDirectCUDAFunctor<double>;

template class PreluElementWiseDirectCUDAFunctor<float>;
template class PreluElementWiseDirectCUDAFunctor<phi::dtype::float16>;
template class PreluElementWiseDirectCUDAFunctor<double>;

template class PreluScalarDirectCUDAFunctor<float>;
template class PreluScalarDirectCUDAFunctor<phi::dtype::float16>;
template class PreluScalarDirectCUDAFunctor<double>;
};

}  // namespace phi
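The net effect of this diff is that the hand-written __global__ kernels and the PADDLE_GET_BLOCKS launch logic go away: each new functor simply maps a flat element index (or, in the scalar case, an input value) to an output element, and Paddle's existing elementwise/index-kernel machinery performs the launch. A hand-rolled sketch of such a launcher, purely to illustrate the contract the functors satisfy (this is not the launcher Paddle actually uses):

template <typename T, typename Functor>
__global__ void ApplyIndexFunctor(Functor func, T* out, size_t numel) {
  // Grid-stride loop: each thread maps a subset of flat indices through
  // the functor and writes the results.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < numel;
       i += static_cast<size_t>(gridDim.x) * blockDim.x) {
    out[i] = func(static_cast<unsigned int>(i));
  }
}

// Usage sketch for the channel-first case, assuming x/alpha/out are device
// pointers and grid/block are chosen by the caller:
//   PReluChannelFirstWiseCUDAFunctor<float> func(
//       x, alpha, numel, channel_num, plane_size);
//   ApplyIndexFunctor<<<grid, block, 0, stream>>>(func, out, numel);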
Review comment: I don't think the static_cast conversion is needed here.
Reply: When I ran the benchmarks earlier, it took a long time to trace the recurring errors back to this spot, which is why I changed it. @JamesLim-sy
Reply: I see. In that case keep it as it is; no change needed.
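A plausible reading of the failure described above, sketched with a hypothetical functor (not Paddle code, and the exact error the author hit is not shown in the thread, so treat this as an assumption): with plain float the explicit cast is redundant, but for wrapper dtypes such as phi::dtype::float16 a comparison against a bare integer literal may not resolve, or may pick an unintended overload, so keeping the literal in T via static_cast is the safe form. HOSTDEVICE below is the same macro used in the diff.

template <typename T>
struct PReluScalarSketch {  // hypothetical name, for illustration only
  const T* scalar_;

  explicit PReluScalarSketch(const T* scalar) : scalar_(scalar) {}

  HOSTDEVICE inline T operator()(const T x) const {
    // Keep the comparison within T: `x > 0` is not guaranteed to compile
    // (or to mean the same thing) for every dtype T.
    const T zero = static_cast<T>(0);
    return (x > zero) ? x : scalar_[0] * x;
  }
};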