diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index 970f679ee8fb0..214e1a81e7dc0 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -226,14 +226,13 @@ nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType(
 }
 
 template <typename T>
-__global__ void apply_scale(T* data, T scale, int n) {
+__global__ void apply_scale(T *data, T scale, int n) {
 #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   data[tid] = data[tid] * scale;
 #endif
 }
-
 int QkvToContextPluginDynamic::enqueue(
     const nvinfer1::PluginTensorDesc *input_desc,
     const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
@@ -304,7 +303,8 @@ int QkvToContextPluginDynamic::enqueue(
     constexpr int threads = 128;
     int blocks = (n_q + threads - 1) / threads;
 
-    apply_scale<<<blocks, threads, 0, stream>>>(tptr, static_cast<half>(scale_), n_q);
+    apply_scale<<<blocks, threads, 0, stream>>>(tptr, static_cast<half>(scale_),
+                                                n_q);
 
     const platform::CUDADeviceContext &dev_ctx = *device_ctx;
     operators::math::MultiHeadGPUComputeFunctor<half> multihead_compute_func;
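
Note: as shown in the hunk above, apply_scale computes tid from blockIdx/blockDim but never checks tid < n, while the launch rounds blocks up with (n_q + threads - 1) / threads; whenever n_q is not a multiple of 128, the last block's extra threads write past the end of tptr on architectures where CUDA_ARCH_FP16_SUPPORTED holds. Below is a minimal standalone sketch of the same scaling step with a bounds guard added; everything outside apply_scale (kN, main, the float instantiation, the scale constant) is illustrative and not part of the plugin source.

// sketch.cu -- guarded variant of the elementwise scaling kernel above
#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
__global__ void apply_scale(T *data, T scale, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) {  // guard: blocks is rounded up, so tid can reach past n
    data[tid] = data[tid] * scale;
  }
}

int main() {
  constexpr int kN = 300;  // deliberately not a multiple of the block size
  float *d_data = nullptr;
  cudaMalloc(&d_data, kN * sizeof(float));
  cudaMemset(d_data, 0, kN * sizeof(float));  // stand-in for real Q*K data

  constexpr int threads = 128;
  int blocks = (kN + threads - 1) / threads;  // same round-up as the plugin
  apply_scale<<<blocks, threads>>>(d_data, 2.0f, kN);
  cudaDeviceSynchronize();

  cudaFree(d_data);
  return 0;
}

The diff itself only re-wraps the call and adjusts pointer style, so the reconstruction keeps the unguarded body verbatim; the guard in the sketch is the usual fix for round-up launches, not something this commit introduces.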