diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 1898f28c73ad0..9eed3af335f41 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -112,6 +112,18 @@ class Pool2dOpConverter : public OpConverter { nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); nvinfer1::ILayer *layer = nullptr; + nvinfer1::DimsHW g_pre_pad(0, 0); + nvinfer1::DimsHW g_post_pad(0, 0); + // paddle Non ceil_mode : Output size = (input size - filter size + 2 * + // padding) / stride (stride size) + 1 + // tensorrt EXPLICIT_ROUND_DOWN: O = floor((M - DK) / S) + 1 + // so if M - DK < 0 we need extra padding + if (input_shape.d[input_dims - 2] - ksize[0] + 2 * paddings[0] < 0) { + g_post_pad.h() = strides[0] - 1; + } + if (input_shape.d[input_dims - 1] - ksize[1] + 2 * paddings[1] < 0) { + g_post_pad.w() = strides[1] - 1; + } if (op_desc.HasAttr("enable_int8")) { #if IS_TRT_VERSION_GE(5000) @@ -123,6 +135,20 @@ class Pool2dOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { + // input_shape.d < 0 means we can't get shape info here. + // we may suffer from issue if shape is not met finally. + if ((padding_algorithm != "SAME") && + ((g_post_pad.w() > 0 && input_shape.d[input_dims - 2] > 0) || + (g_post_pad.h() > 0 && input_shape.d[input_dims - 1] > 0))) { + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + g_pre_pad, g_post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + } + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); pool_layer->setStride(nv_strides); @@ -157,9 +183,8 @@ class Pool2dOpConverter : public OpConverter { if (global_pooling == true) { nv_ksize.d[0] = input_shape.d[input_dims - 2]; nv_ksize.d[1] = input_shape.d[input_dims - 1]; - auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *const_cast(input1), - nv_pool_type, nv_ksize); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); @@ -181,28 +206,38 @@ class Pool2dOpConverter : public OpConverter { } if (!adaptive) { - // Under ceil mode, the pre_pad and post_pad are used to - // record the the padding size. In some ceil mode cases, - // we do not need padding, so we initialize the two vars to 0. - - nvinfer1::DimsHW pre_pad(0, 0); - nvinfer1::DimsHW post_pad(0, 0); if (ceil_mode) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); // If ceil mode is true, we will pad the appropriate size to the input. DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, input_dims); - auto *pad_layer = TRT_ENGINE_ADD_LAYER( - engine_, Padding, *const_cast(input1), pre_pad, - post_pad); + auto *pad_layer = + TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, pre_pad, post_pad); + PADDLE_ENFORCE_NOT_NULL( pad_layer, platform::errors::Fatal( "Pad layer in poolOp converter could not be " "created. The pointer to pad layer is `NULL`.")); input1 = pad_layer->getOutput(0); } - auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *const_cast(input1), - nv_pool_type, nv_ksize); +#if IS_TRT_VERSION_GE(8000) + // Exclude padding pixels from the average mean is not supported well by + // TRT + // so enable padding for trt8.0 above. + if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && + (padding_algorithm != "SAME") && !ceil_mode) { + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + g_pre_pad, g_post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + } +#endif + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index cbd6e3a2e4ffe..2b6541c5515ce 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -65,6 +65,7 @@ SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &starts_); DeserializeValue(&serial_data, &serial_length, &ends_); DeserializeValue(&serial_data, &serial_length, &axes_); + DeserializeValue(&serial_data, &serial_length, &with_fp16_); cudaEventCreate(©_event_); cudaStreamCreate(©_stream_); } @@ -187,17 +188,17 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, } size_t SlicePlugin::getSerializationSize() const TRT_NOEXCEPT { - return getBaseSerializationSize() + SerializedSize(getPluginType()) + - SerializedSize(starts_) + SerializedSize(ends_) + - SerializedSize(axes_); + return getBaseSerializationSize() + SerializedSize(starts_) + + SerializedSize(ends_) + SerializedSize(axes_) + + SerializedSize(with_fp16_); } void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { - SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, starts_); SerializeValue(&buffer, ends_); SerializeValue(&buffer, axes_); + SerializeValue(&buffer, with_fp16_); } // Dynamic Plugin below. diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py index 6ea2335c7a1b1..98232838ee08b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py @@ -86,5 +86,19 @@ def setUpTensorRTParams(self): self.enable_trt = True +class StaticSlicePluginTRTTestFp16(SlicePluginTRTTest): + def setUpTensorRTParams(self): + self.trt_parameters = SlicePluginTRTTest.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Half, True, False) + self.enable_trt = True + + +class StaticSlicePluginTRTTestFp32(SlicePluginTRTTest): + def setUpTensorRTParams(self): + self.trt_parameters = SlicePluginTRTTest.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, True, False) + self.enable_trt = True + + if __name__ == "__main__": unittest.main()