From a8458e67293910eac141e8d19f43518b614ca87f Mon Sep 17 00:00:00 2001 From: huangjianhui <852142024@qq.com> Date: Thu, 21 Jul 2022 15:38:21 +0800 Subject: [PATCH 1/9] Add new model PaddleSeg (#30) * Support new model PaddleSeg * Fix conflict * PaddleSeg add visulization function * fix bug * Fix BindPPSeg wrong name * Fix variable name * Update by comments * Add ppseg-unet example python version Co-authored-by: Jason --- examples/vision/ppseg_unet.cc | 59 ++++++++ fastdeploy/vision.h | 1 + fastdeploy/vision/__init__.py | 1 + fastdeploy/vision/common/result.cc | 22 +++ fastdeploy/vision/common/result.h | 13 ++ fastdeploy/vision/ppseg/__init__.py | 37 +++++ fastdeploy/vision/ppseg/model.cc | 140 ++++++++++++++++++ fastdeploy/vision/ppseg/model.h | 35 +++++ fastdeploy/vision/ppseg/ppseg_pybind.cc | 30 ++++ fastdeploy/vision/vision_pybind.cc | 8 + fastdeploy/vision/visualize/__init__.py | 5 + fastdeploy/vision/visualize/segmentation.cc | 46 ++++++ fastdeploy/vision/visualize/visualize.h | 7 +- .../vision/visualize/visualize_pybind.cc | 21 ++- model_zoo/vision/ppseg/ppseg_unet.py | 36 +++++ 15 files changed, 453 insertions(+), 8 deletions(-) create mode 100644 examples/vision/ppseg_unet.cc create mode 100644 fastdeploy/vision/ppseg/__init__.py create mode 100644 fastdeploy/vision/ppseg/model.cc create mode 100644 fastdeploy/vision/ppseg/model.h create mode 100644 fastdeploy/vision/ppseg/ppseg_pybind.cc create mode 100644 fastdeploy/vision/visualize/segmentation.cc create mode 100644 model_zoo/vision/ppseg/ppseg_unet.py diff --git a/examples/vision/ppseg_unet.cc b/examples/vision/ppseg_unet.cc new file mode 100644 index 0000000000..cb33611ad4 --- /dev/null +++ b/examples/vision/ppseg_unet.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" +#include "yaml-cpp/yaml.h" + +int main() { + namespace vis = fastdeploy::vision; + + std::string model_file = "../resources/models/unet_Cityscapes/model.pdmodel"; + std::string params_file = + "../resources/models/unet_Cityscapes/model.pdiparams"; + std::string config_file = "../resources/models/unet_Cityscapes/deploy.yaml"; + std::string img_path = "../resources/images/cityscapes_demo.png"; + std::string vis_path = "../resources/outputs/vis.jpeg"; + + auto model = vis::ppseg::Model(model_file, params_file, config_file); + if (!model.Initialized()) { + std::cerr << "Init Failed." << std::endl; + return -1; + } + + cv::Mat im = cv::imread(img_path); + cv::Mat vis_im; + + vis::SegmentationResult res; + if (!model.Predict(&im, &res)) { + std::cerr << "Prediction Failed." << std::endl; + return -1; + } else { + std::cout << "Prediction Done!" 
<< std::endl;
+  }
+
+  // Print the prediction result
+  std::cout << res.Str() << std::endl;
+
+  YAML::Node cfg = YAML::LoadFile(config_file);
+  int num_classes = 19;
+  if (cfg["Deploy"]["num_classes"]) {
+    num_classes = cfg["Deploy"]["num_classes"].as<int>();
+  }
+
+  // Visualize the prediction result
+  vis::Visualize::VisSegmentation(im, res, &vis_im, num_classes);
+  cv::imwrite(vis_path, vis_im);
+  std::cout << "Inference Done! Saved: " << vis_path << std::endl;
+  return 0;
+}
diff --git a/fastdeploy/vision.h b/fastdeploy/vision.h
index 68c0881cac..d539482a72 100644
--- a/fastdeploy/vision.h
+++ b/fastdeploy/vision.h
@@ -19,6 +19,7 @@
 #include "fastdeploy/vision/meituan/yolov6.h"
 #include "fastdeploy/vision/ppcls/model.h"
 #include "fastdeploy/vision/ppdet/ppyoloe.h"
+#include "fastdeploy/vision/ppseg/model.h"
 #include "fastdeploy/vision/ultralytics/yolov5.h"
 #include "fastdeploy/vision/wongkinyiu/yolor.h"
 #include "fastdeploy/vision/wongkinyiu/yolov7.h"
diff --git a/fastdeploy/vision/__init__.py b/fastdeploy/vision/__init__.py
index 6acbf0c376..08b0d68124 100644
--- a/fastdeploy/vision/__init__.py
+++ b/fastdeploy/vision/__init__.py
@@ -16,6 +16,7 @@
 from . import evaluation
 from . import ppcls
 from . import ppdet
+from . import ppseg
 from . import ultralytics
 from . import meituan
 from . import megvii
diff --git a/fastdeploy/vision/common/result.cc b/fastdeploy/vision/common/result.cc
index ece0973c0c..06a85ea454 100644
--- a/fastdeploy/vision/common/result.cc
+++ b/fastdeploy/vision/common/result.cc
@@ -72,5 +72,27 @@ std::string DetectionResult::Str() {
   return out;
 }
 
+void SegmentationResult::Clear() {
+  std::vector<std::vector<int64_t>>().swap(masks);
+}
+
+void SegmentationResult::Resize(int64_t height, int64_t width) {
+  masks.resize(height, std::vector<int64_t>(width));
+}
+
+std::string SegmentationResult::Str() {
+  std::string out;
+  out = "SegmentationResult Image masks 10 rows x 10 cols: \n";
+  for (size_t i = 0; i < 10; ++i) {
+    out += "[";
+    for (size_t j = 0; j < 10; ++j) {
+      out = out + std::to_string(masks[i][j]) + ", ";
+    }
+    out += ".....]\n";
+  }
+  out += "...........\n";
+  return out;
+}
+
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/result.h b/fastdeploy/vision/common/result.h
index 22227a26cb..7ff104250f 100644
--- a/fastdeploy/vision/common/result.h
+++ b/fastdeploy/vision/common/result.h
@@ -56,5 +56,18 @@ struct FASTDEPLOY_DECL DetectionResult : public BaseResult {
   std::string Str();
 };
 
+struct FASTDEPLOY_DECL SegmentationResult : public BaseResult {
+  // 2-D matrix of per-pixel class ids, masks[height][width]
+  std::vector<std::vector<int64_t>> masks;
+
+  ResultType type = ResultType::SEGMENTATION;
+
+  void Clear();
+
+  void Resize(int64_t height, int64_t width);
+
+  std::string Str();
+};
+
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/ppseg/__init__.py b/fastdeploy/vision/ppseg/__init__.py
new file mode 100644
index 0000000000..b580c01455
--- /dev/null
+++ b/fastdeploy/vision/ppseg/__init__.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
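For reference, a minimal sketch of how the `masks` field added to `SegmentationResult` above can be consumed from Python once the bindings later in this patch are in place. The helper name is illustrative, and the conversion assumes pybind11's default STL casting of the nested `int64` lists:

```python
import numpy as np

def masks_to_numpy(result):
    # SegmentationResult.masks is exposed as nested lists of int64 class
    # ids, one inner list per image row; stack them into an (H, W) array.
    return np.array(result.masks, dtype=np.int64)
```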
+
+from __future__ import absolute_import
+import logging
+from ... import FastDeployModel, Frontend
+from ... import fastdeploy_main as C
+
+
+class Model(FastDeployModel):
+    def __init__(self,
+                 model_file,
+                 params_file,
+                 config_file,
+                 backend_option=None,
+                 model_format=Frontend.PADDLE):
+        super(Model, self).__init__(backend_option)
+
+        assert model_format == Frontend.PADDLE, "PaddleSeg only supports model format of Frontend.PADDLE now."
+        self._model = C.vision.ppseg.Model(model_file, params_file,
+                                           config_file, self._runtime_option,
+                                           model_format)
+        assert self.initialized, "PaddleSeg model initialize failed."
+
+    def predict(self, input_image):
+        return self._model.predict(input_image)
diff --git a/fastdeploy/vision/ppseg/model.cc b/fastdeploy/vision/ppseg/model.cc
new file mode 100644
index 0000000000..268d85f7d3
--- /dev/null
+++ b/fastdeploy/vision/ppseg/model.cc
@@ -0,0 +1,140 @@
+#include "fastdeploy/vision/ppseg/model.h"
+#include "fastdeploy/vision.h"
+#include "fastdeploy/vision/utils/utils.h"
+#include "yaml-cpp/yaml.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace ppseg {
+
+Model::Model(const std::string& model_file, const std::string& params_file,
+             const std::string& config_file,
+             const RuntimeOption& custom_option,
+             const Frontend& model_format) {
+  config_file_ = config_file;
+  valid_cpu_backends = {Backend::ORT, Backend::PDINFER};
+  valid_gpu_backends = {Backend::ORT, Backend::PDINFER};
+  runtime_option = custom_option;
+  runtime_option.model_format = model_format;
+  runtime_option.model_file = model_file;
+  runtime_option.params_file = params_file;
+  initialized = Initialize();
+}
+
+bool Model::Initialize() {
+  if (!BuildPreprocessPipelineFromConfig()) {
+    FDERROR << "Failed to build preprocess pipeline from configuration file."
+            << std::endl;
+    return false;
+  }
+  if (!InitRuntime()) {
+    FDERROR << "Failed to initialize fastdeploy backend." << std::endl;
+    return false;
+  }
+  return true;
+}
+
+bool Model::BuildPreprocessPipelineFromConfig() {
+  processors_.clear();
+  YAML::Node cfg;
+  processors_.push_back(std::make_shared<BGR2RGB>());
+  try {
+    cfg = YAML::LoadFile(config_file_);
+  } catch (YAML::BadFile& e) {
+    FDERROR << "Failed to load yaml file " << config_file_
+            << ", maybe you should check this file." << std::endl;
+    return false;
+  }
+
+  if (cfg["Deploy"]["transforms"]) {
+    auto preprocess_cfg = cfg["Deploy"]["transforms"];
+    for (const auto& op : preprocess_cfg) {
+      FDASSERT(op.IsMap(),
+               "Require the transform information in yaml be Map type.");
+      if (op["type"].as<std::string>() == "Normalize") {
+        std::vector<float> mean = {0.5, 0.5, 0.5};
+        std::vector<float> std = {0.5, 0.5, 0.5};
+        if (op["mean"]) {
+          mean = op["mean"].as<std::vector<float>>();
+        }
+        if (op["std"]) {
+          std = op["std"].as<std::vector<float>>();
+        }
+        processors_.push_back(std::make_shared<Normalize>(mean, std));
+
+      } else if (op["type"].as<std::string>() == "Resize") {
+        const auto& target_size = op["target_size"];
+        int resize_width = target_size[0].as<int>();
+        int resize_height = target_size[1].as<int>();
+        processors_.push_back(
+            std::make_shared<Resize>(resize_width, resize_height));
+      }
+    }
+    processors_.push_back(std::make_shared<HWC2CHW>());
+  }
+  return true;
+}
+
+bool Model::Preprocess(Mat* mat, FDTensor* output) {
+  for (size_t i = 0; i < processors_.size(); ++i) {
+    if (!(*(processors_[i].get()))(mat)) {
+      FDERROR << "Failed to process image data in " << processors_[i]->Name()
+              << "."
<< std::endl;
+      return false;
+    }
+  }
+  mat->ShareWithTensor(output);
+  output->shape.insert(output->shape.begin(), 1);
+  output->name = InputInfoOfRuntime(0).name;
+  return true;
+}
+
+bool Model::Postprocess(const FDTensor& infer_result,
+                        SegmentationResult* result) {
+  FDASSERT(infer_result.dtype == FDDataType::INT64,
+           "Require the data type of output is int64, but now it's " +
+               Str(const_cast<FDDataType&>(infer_result.dtype)) + ".");
+  result->Clear();
+  std::vector<int64_t> output_shape = infer_result.shape;
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  const int64_t* infer_result_buffer =
+      reinterpret_cast<const int64_t*>(infer_result.data.data());
+  int64_t height = output_shape[1];
+  int64_t width = output_shape[2];
+  result->Resize(height, width);
+  for (int64_t i = 0; i < height; i++) {
+    int64_t begin = i * width;
+    // std::copy excludes the last iterator, so `end` must point one past
+    // the final element of the row.
+    int64_t end = (i + 1) * width;
+    std::copy(infer_result_buffer + begin, infer_result_buffer + end,
+              result->masks[i].begin());
+  }
+
+  return true;
+}
+
+bool Model::Predict(cv::Mat* im, SegmentationResult* result) {
+  Mat mat(*im);
+  std::vector<FDTensor> processed_data(1);
+  if (!Preprocess(&mat, &(processed_data[0]))) {
+    FDERROR << "Failed to preprocess input data while using model:"
+            << ModelName() << "." << std::endl;
+    return false;
+  }
+  std::vector<FDTensor> infer_result(1);
+  if (!Infer(processed_data, &infer_result)) {
+    FDERROR << "Failed to inference while using model:" << ModelName() << "."
+            << std::endl;
+    return false;
+  }
+  if (!Postprocess(infer_result[0], result)) {
+    FDERROR << "Failed to postprocess while using model:" << ModelName() << "."
+            << std::endl;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace ppseg
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/fastdeploy/vision/ppseg/model.h b/fastdeploy/vision/ppseg/model.h
new file mode 100644
index 0000000000..c0ca5a70d0
--- /dev/null
+++ b/fastdeploy/vision/ppseg/model.h
@@ -0,0 +1,35 @@
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+namespace vision {
+namespace ppseg {
+
+class FASTDEPLOY_DECL Model : public FastDeployModel {
+ public:
+  Model(const std::string& model_file, const std::string& params_file,
+        const std::string& config_file,
+        const RuntimeOption& custom_option = RuntimeOption(),
+        const Frontend& model_format = Frontend::PADDLE);
+
+  std::string ModelName() const { return "ppseg"; }
+
+  virtual bool Predict(cv::Mat* im, SegmentationResult* result);
+
+ private:
+  bool Initialize();
+
+  bool BuildPreprocessPipelineFromConfig();
+
+  bool Preprocess(Mat* mat, FDTensor* outputs);
+
+  bool Postprocess(const FDTensor& infer_result, SegmentationResult* result);
+
+  std::vector<std::shared_ptr<Processor>> processors_;
+  std::string config_file_;
+};
+}  // namespace ppseg
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/fastdeploy/vision/ppseg/ppseg_pybind.cc b/fastdeploy/vision/ppseg/ppseg_pybind.cc
new file mode 100644
index 0000000000..60022f914b
--- /dev/null
+++ b/fastdeploy/vision/ppseg/ppseg_pybind.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindPPSeg(pybind11::module& m) {
+  auto ppseg_module =
+      m.def_submodule("ppseg", "Module to deploy PaddleSegmentation.");
+  pybind11::class_<vision::ppseg::Model, FastDeployModel>(ppseg_module,
+                                                          "Model")
+      .def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
+                          Frontend>())
+      .def("predict", [](vision::ppseg::Model& self, pybind11::array& data) {
+        auto mat = PyArrayToCvMat(data);
+        vision::SegmentationResult res;
+        self.Predict(&mat, &res);
+        return res;
+      });
+}
+}  // namespace fastdeploy
diff --git a/fastdeploy/vision/vision_pybind.cc b/fastdeploy/vision/vision_pybind.cc
index 0334303ce6..22c4f0bc2e 100644
--- a/fastdeploy/vision/vision_pybind.cc
+++ b/fastdeploy/vision/vision_pybind.cc
@@ -19,6 +19,7 @@ namespace fastdeploy {
 void BindPPCls(pybind11::module& m);
 void BindPPDet(pybind11::module& m);
 void BindWongkinyiu(pybind11::module& m);
+void BindPPSeg(pybind11::module& m);
 void BindUltralytics(pybind11::module& m);
 void BindMeituan(pybind11::module& m);
 void BindMegvii(pybind11::module& m);
@@ -42,8 +43,15 @@ void BindVision(pybind11::module& m) {
       .def("__repr__", &vision::DetectionResult::Str)
       .def("__str__", &vision::DetectionResult::Str);
 
+  pybind11::class_<vision::SegmentationResult>(m, "SegmentationResult")
+      .def(pybind11::init())
+      .def_readwrite("masks", &vision::SegmentationResult::masks)
+      .def("__repr__", &vision::SegmentationResult::Str)
+      .def("__str__", &vision::SegmentationResult::Str);
+
   BindPPCls(m);
   BindPPDet(m);
+  BindPPSeg(m);
   BindUltralytics(m);
   BindWongkinyiu(m);
   BindMeituan(m);
diff --git a/fastdeploy/vision/visualize/__init__.py b/fastdeploy/vision/visualize/__init__.py
index 384ec2768f..7d1bcc8926 100644
--- a/fastdeploy/vision/visualize/__init__.py
+++ b/fastdeploy/vision/visualize/__init__.py
@@ -19,3 +19,8 @@
 def vis_detection(im_data, det_result, line_size=1, font_size=0.5):
     C.vision.Visualize.vis_detection(im_data, det_result, line_size,
                                      font_size)
+
+
+def vis_segmentation(im_data, seg_result, vis_im_data, num_classes=1000):
+    C.vision.Visualize.vis_segmentation(im_data, seg_result, vis_im_data,
+                                        num_classes)
diff --git a/fastdeploy/vision/visualize/segmentation.cc b/fastdeploy/vision/visualize/segmentation.cc
new file mode 100644
index 0000000000..b1b142fc08
--- /dev/null
+++ b/fastdeploy/vision/visualize/segmentation.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
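The `VisSegmentation` implementation that follows colors each pixel by its class id and blends the result 50/50 with the input image. A rough NumPy/OpenCV sketch of the same overlay, assuming `masks` is an (H, W) integer array and `color_map` is the flat per-class triplet list produced by `GetColorMap` (the function name and triplet layout reading are assumptions):

```python
import cv2
import numpy as np

def vis_segmentation_py(im, masks, color_map):
    # Map each class id to its 3-channel color, then blend with the
    # original image, mirroring the cv::addWeighted call below.
    colors = np.asarray(color_map, dtype=np.uint8).reshape(-1, 3)
    vis = colors[masks]  # (H, W) ids -> (H, W, 3) color image
    return cv2.addWeighted(im, 0.5, vis, 0.5, 0)
```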
+ +#ifdef ENABLE_VISION_VISUALIZE + +#include "fastdeploy/vision/visualize/visualize.h" +#include "opencv2/highgui.hpp" +#include "opencv2/imgproc/imgproc.hpp" + +namespace fastdeploy { +namespace vision { + +void Visualize::VisSegmentation(const cv::Mat& im, + const SegmentationResult& result, + cv::Mat* vis_img, const int& num_classes) { + auto color_map = GetColorMap(num_classes); + int64_t height = result.masks.size(); + int64_t width = result.masks[1].size(); + *vis_img = cv::Mat::zeros(height, width, CV_8UC3); + + int64_t index = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + int category_id = static_cast(result.masks[i][j]); + vis_img->at(i, j)[0] = color_map[3 * category_id + 0]; + vis_img->at(i, j)[1] = color_map[3 * category_id + 1]; + vis_img->at(i, j)[2] = color_map[3 * category_id + 2]; + } + } + cv::addWeighted(im, .5, *vis_img, .5, 0, *vis_img); +} + +} // namespace vision +} // namespace fastdeploy +#endif diff --git a/fastdeploy/vision/visualize/visualize.h b/fastdeploy/vision/visualize/visualize.h index 6fffa521a6..1eb212c2b9 100644 --- a/fastdeploy/vision/visualize/visualize.h +++ b/fastdeploy/vision/visualize/visualize.h @@ -27,8 +27,11 @@ class FASTDEPLOY_DECL Visualize { static const std::vector& GetColorMap(int num_classes = 1000); static void VisDetection(cv::Mat* im, const DetectionResult& result, int line_size = 2, float font_size = 0.5f); + static void VisSegmentation(const cv::Mat& im, + const SegmentationResult& result, + cv::Mat* vis_img, const int& num_classes = 1000); }; -} // namespace vision -} // namespace fastdeploy +} // namespace vision +} // namespace fastdeploy #endif diff --git a/fastdeploy/vision/visualize/visualize_pybind.cc b/fastdeploy/vision/visualize/visualize_pybind.cc index 66ffc74f9f..5d5eb2388d 100644 --- a/fastdeploy/vision/visualize/visualize_pybind.cc +++ b/fastdeploy/vision/visualize/visualize_pybind.cc @@ -18,11 +18,20 @@ namespace fastdeploy { void BindVisualize(pybind11::module& m) { pybind11::class_(m, "Visualize") .def(pybind11::init<>()) - .def_static("vis_detection", [](pybind11::array& im_data, - vision::DetectionResult& result, - int line_size, float font_size) { - auto im = PyArrayToCvMat(im_data); - vision::Visualize::VisDetection(&im, result, line_size, font_size); + .def_static("vis_detection", + [](pybind11::array& im_data, vision::DetectionResult& result, + int line_size, float font_size) { + auto im = PyArrayToCvMat(im_data); + vision::Visualize::VisDetection(&im, result, line_size, + font_size); + }) + .def_static("vis_segmentation", [](pybind11::array& im_data, + vision::SegmentationResult& result, + pybind11::array& vis_im_data, + const int& num_classes) { + cv::Mat im = PyArrayToCvMat(im_data); + cv::Mat vis_im = PyArrayToCvMat(vis_im_data); + vision::Visualize::VisSegmentation(im, result, &vis_im, num_classes); }); } -} // namespace fastdeploy +} // namespace fastdeploy diff --git a/model_zoo/vision/ppseg/ppseg_unet.py b/model_zoo/vision/ppseg/ppseg_unet.py new file mode 100644 index 0000000000..c279e0a8fd --- /dev/null +++ b/model_zoo/vision/ppseg/ppseg_unet.py @@ -0,0 +1,36 @@ +import fastdeploy as fd +import cv2 +import tarfile + +# 下载模型和测试图片 +model_url = "https://github.com/felixhjh/Fastdeploy-Models/raw/main/unet_Cityscapes.tar.gz" +test_jpg_url = "https://paddleseg.bj.bcebos.com/dygraph/demo/cityscapes_demo.png" +fd.download(model_url, ".", show_progress=True) +fd.download(test_jpg_url, ".", show_progress=True) + +try: + tar = tarfile.open("unet_Cityscapes.tar.gz", "r:gz") + 
file_names = tar.getnames() + for file_name in file_names: + tar.extract(file_name, ".") + tar.close() +except Exception as e: + raise Exception(e) + +# 加载模型 +model = fd.vision.ppseg.Model("./unet_Cityscapes/model.pdmodel", + "./unet_Cityscapes/model.pdiparams", + "./unet_Cityscapes/deploy.yaml") + +# 预测图片 +im = cv2.imread("./cityscapes_demo.png") +result = model.predict(im) + +vis_im = im.copy() +# 可视化结果 +fd.vision.visualize.vis_segmentation(im, result, vis_im) +cv2.imwrite("vis_img.png", vis_im) + +# 输出预测结果 +print(result) +print(model.runtime_option) From e2487817847e4bebc4f326214f8a88ede6d8184e Mon Sep 17 00:00:00 2001 From: DefTruth <31974251+DefTruth@users.noreply.github.com> Date: Fri, 22 Jul 2022 09:49:55 +0800 Subject: [PATCH 2/9] Add NanoDet-Plus Model support (#32) * update .gitignore * Added checking for cmake include dir * fixed missing trt_backend option bug when init from trt * remove un-need data layout and add pre-check for dtype * changed RGB2BRG to BGR2RGB in ppcls model * add model_zoo yolov6 c++/python demo * fixed CMakeLists.txt typos * update yolov6 cpp/README.md * add yolox c++/pybind and model_zoo demo * move some helpers to private * fixed CMakeLists.txt typos * add normalize with alpha and beta * add version notes for yolov5/yolov6/yolox * add copyright to yolov5.cc * revert normalize * fixed some bugs in yolox * Add NanoDet-Plus Model support Co-authored-by: Jason --- examples/CMakeLists.txt | 14 + examples/vision/rangilyu_nanodet_plus.cc | 53 +++ fastdeploy/vision.h | 1 + fastdeploy/vision/__init__.py | 1 + fastdeploy/vision/megvii/__init__.py | 14 +- fastdeploy/vision/rangilyu/__init__.py | 105 ++++++ fastdeploy/vision/rangilyu/nanodet_plus.cc | 355 ++++++++++++++++++ fastdeploy/vision/rangilyu/nanodet_plus.h | 101 +++++ fastdeploy/vision/rangilyu/rangilyu_pybind.cc | 41 ++ fastdeploy/vision/vision_pybind.cc | 2 + model_zoo/vision/nanodet_plus/README.md | 46 +++ model_zoo/vision/nanodet_plus/api.md | 71 ++++ .../vision/nanodet_plus/cpp/CMakeLists.txt | 17 + model_zoo/vision/nanodet_plus/cpp/README.md | 30 ++ .../vision/nanodet_plus/cpp/nanodet_plus.cc | 40 ++ model_zoo/vision/nanodet_plus/nanodet_plus.py | 23 ++ 16 files changed, 907 insertions(+), 7 deletions(-) create mode 100644 examples/vision/rangilyu_nanodet_plus.cc create mode 100644 fastdeploy/vision/rangilyu/__init__.py create mode 100644 fastdeploy/vision/rangilyu/nanodet_plus.cc create mode 100644 fastdeploy/vision/rangilyu/nanodet_plus.h create mode 100644 fastdeploy/vision/rangilyu/rangilyu_pybind.cc create mode 100644 model_zoo/vision/nanodet_plus/README.md create mode 100644 model_zoo/vision/nanodet_plus/api.md create mode 100644 model_zoo/vision/nanodet_plus/cpp/CMakeLists.txt create mode 100644 model_zoo/vision/nanodet_plus/cpp/README.md create mode 100644 model_zoo/vision/nanodet_plus/cpp/nanodet_plus.cc create mode 100644 model_zoo/vision/nanodet_plus/nanodet_plus.py diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 112193c86a..31ca40af3c 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,3 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + function(add_fastdeploy_executable FIELD CC_FILE) # temp target name/file var in function scope set(TEMP_TARGET_FILE ${CC_FILE}) diff --git a/examples/vision/rangilyu_nanodet_plus.cc b/examples/vision/rangilyu_nanodet_plus.cc new file mode 100644 index 0000000000..91dcd604ed --- /dev/null +++ b/examples/vision/rangilyu_nanodet_plus.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" + +int main() { + namespace vis = fastdeploy::vision; + + std::string model_file = "../resources/models/nanodet-plus-m_320.onnx"; + std::string img_path = "../resources/images/bus.jpg"; + std::string vis_path = + "../resources/outputs/rangilyu_nanodet_plus_vis_result.jpg"; + + auto model = vis::rangilyu::NanoDetPlus(model_file); + if (!model.Initialized()) { + std::cerr << "Init Failed! Model: " << model_file << std::endl; + return -1; + } else { + std::cout << "Init Done! Model:" << model_file << std::endl; + } + model.EnableDebug(); + + cv::Mat im = cv::imread(img_path); + cv::Mat vis_im = im.clone(); + + vis::DetectionResult res; + if (!model.Predict(&im, &res)) { + std::cerr << "Prediction Failed." << std::endl; + return -1; + } else { + std::cout << "Prediction Done!" << std::endl; + } + + // 输出预测框结果 + std::cout << res.Str() << std::endl; + + // 可视化预测结果 + vis::Visualize::VisDetection(&vis_im, res); + cv::imwrite(vis_path, vis_im); + std::cout << "Detect Done! Saved: " << vis_path << std::endl; + return 0; +} diff --git a/fastdeploy/vision.h b/fastdeploy/vision.h index d539482a72..b7836ca466 100644 --- a/fastdeploy/vision.h +++ b/fastdeploy/vision.h @@ -19,6 +19,7 @@ #include "fastdeploy/vision/meituan/yolov6.h" #include "fastdeploy/vision/ppcls/model.h" #include "fastdeploy/vision/ppdet/ppyoloe.h" +#include "fastdeploy/vision/rangilyu/nanodet_plus.h" #include "fastdeploy/vision/ppseg/model.h" #include "fastdeploy/vision/ultralytics/yolov5.h" #include "fastdeploy/vision/wongkinyiu/yolor.h" diff --git a/fastdeploy/vision/__init__.py b/fastdeploy/vision/__init__.py index 08b0d68124..09be1fa1b7 100644 --- a/fastdeploy/vision/__init__.py +++ b/fastdeploy/vision/__init__.py @@ -22,3 +22,4 @@ from . import megvii from . import visualize from . import wongkinyiu +from . 
import rangilyu diff --git a/fastdeploy/vision/megvii/__init__.py b/fastdeploy/vision/megvii/__init__.py index 67096e4fc8..8f96c97428 100644 --- a/fastdeploy/vision/megvii/__init__.py +++ b/fastdeploy/vision/megvii/__init__.py @@ -28,8 +28,8 @@ def __init__(self, # 初始化后的option保存在self._runtime_option super(YOLOX, self).__init__(runtime_option) - self._model = C.vision.megvii.YOLOX( - model_file, params_file, self._runtime_option, model_format) + self._model = C.vision.megvii.YOLOX(model_file, params_file, + self._runtime_option, model_format) # 通过self.initialized判断整个模型的初始化是否成功 assert self.initialized, "YOLOX initialize failed." @@ -53,8 +53,8 @@ def is_decode_exported(self): @property def downsample_strides(self): - return self._model.downsample_strides - + return self._model.downsample_strides + @property def max_wh(self): return self._model.max_wh @@ -78,16 +78,16 @@ def padding_value(self, value): @is_decode_exported.setter def is_decode_exported(self, value): assert isinstance( - value, + value, bool), "The value to set `is_decode_exported` must be type of bool." - self._model.max_wh = value + self._model.is_decode_exported = value @downsample_strides.setter def downsample_strides(self, value): assert isinstance( value, list), "The value to set `downsample_strides` must be type of list." - self._model.downsample_strides = value + self._model.downsample_strides = value @max_wh.setter def max_wh(self, value): diff --git a/fastdeploy/vision/rangilyu/__init__.py b/fastdeploy/vision/rangilyu/__init__.py new file mode 100644 index 0000000000..f2e8ace9fc --- /dev/null +++ b/fastdeploy/vision/rangilyu/__init__.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +import logging +from ... import FastDeployModel, Frontend +from ... import fastdeploy_main as C + + +class NanoDetPlus(FastDeployModel): + def __init__(self, + model_file, + params_file="", + runtime_option=None, + model_format=Frontend.ONNX): + # 调用基函数进行backend_option的初始化 + # 初始化后的option保存在self._runtime_option + super(NanoDetPlus, self).__init__(runtime_option) + + self._model = C.vision.rangilyu.NanoDetPlus( + model_file, params_file, self._runtime_option, model_format) + # 通过self.initialized判断整个模型的初始化是否成功 + assert self.initialized, "NanoDetPlus initialize failed." 
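For reference, a hedged usage sketch of the wrapper being defined here. File names and values are illustrative, and overriding `size` (via the properties defined just below) only takes effect when the exported ONNX has dynamic input shapes:

```python
import fastdeploy as fd
import cv2

model = fd.vision.rangilyu.NanoDetPlus("nanodet-plus-m_320.onnx")
# Resize target for preprocessing; only meaningful for a
# dynamic-shape ONNX export of NanoDet-Plus.
model.size = [416, 416]

im = cv2.imread("bus.jpg")
result = model.predict(im, conf_threshold=0.35, nms_iou_threshold=0.5)
```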
+
+    def predict(self, input_image, conf_threshold=0.25, nms_iou_threshold=0.5):
+        return self._model.predict(input_image, conf_threshold,
+                                   nms_iou_threshold)
+
+    # Property wrappers for the NanoDetPlus parameters below.
+    # Most are preprocess-related; e.g. setting model.size = [416, 416]
+    # changes the resize target (provided the model supports it).
+    @property
+    def size(self):
+        return self._model.size
+
+    @property
+    def padding_value(self):
+        return self._model.padding_value
+
+    @property
+    def keep_ratio(self):
+        return self._model.keep_ratio
+
+    @property
+    def downsample_strides(self):
+        return self._model.downsample_strides
+
+    @property
+    def max_wh(self):
+        return self._model.max_wh
+
+    @property
+    def reg_max(self):
+        return self._model.reg_max
+
+    @size.setter
+    def size(self, wh):
+        assert isinstance(wh, (list, tuple)),\
+            "The value to set `size` must be type of tuple or list."
+        assert len(wh) == 2,\
+            "The value to set `size` must contain 2 elements [width, height], but now it contains {} elements.".format(
+            len(wh))
+        self._model.size = wh
+
+    @padding_value.setter
+    def padding_value(self, value):
+        assert isinstance(
+            value,
+            list), "The value to set `padding_value` must be type of list."
+        self._model.padding_value = value
+
+    @keep_ratio.setter
+    def keep_ratio(self, value):
+        assert isinstance(
+            value, bool), "The value to set `keep_ratio` must be type of bool."
+        self._model.keep_ratio = value
+
+    @downsample_strides.setter
+    def downsample_strides(self, value):
+        assert isinstance(
+            value,
+            list), "The value to set `downsample_strides` must be type of list."
+        self._model.downsample_strides = value
+
+    @max_wh.setter
+    def max_wh(self, value):
+        assert isinstance(
+            value, float), "The value to set `max_wh` must be type of float."
+        self._model.max_wh = value
+
+    @reg_max.setter
+    def reg_max(self, value):
+        assert isinstance(
+            value, int), "The value to set `reg_max` must be type of int."
+        self._model.reg_max = value
diff --git a/fastdeploy/vision/rangilyu/nanodet_plus.cc b/fastdeploy/vision/rangilyu/nanodet_plus.cc
new file mode 100644
index 0000000000..678e131c41
--- /dev/null
+++ b/fastdeploy/vision/rangilyu/nanodet_plus.cc
@@ -0,0 +1,355 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
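The decoder in the implementation that follows recovers each box side via GFL-style regression: the side length, in stride units, is the expectation of a softmax distribution over `reg_max + 1` discrete bins, which `GFLRegression` computes. A small NumPy sketch of the same computation (the max-subtraction is a standard numerical-stability tweak, not present in the C++ code):

```python
import numpy as np

def gfl_regression(logits):
    # logits: shape (reg_max + 1,) raw scores for one box side (l, t, r or b).
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    # The offset is the expected bin index, later scaled by the stride.
    return float(np.dot(np.arange(len(probs)), probs))
```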
+ +#include "fastdeploy/vision/rangilyu/nanodet_plus.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { + +namespace vision { + +namespace rangilyu { + +struct NanoDetPlusCenterPoint { + int grid0; + int grid1; + int stride; +}; + +void GenerateNanoDetPlusCenterPoints( + const std::vector& size, const std::vector& downsample_strides, + std::vector* center_points) { + // size: tuple of input (width, height), e.g (320, 320) + // downsample_strides: downsample strides in NanoDet and + // NanoDet-Plus, e.g (8, 16, 32, 64) + const int width = size[0]; + const int height = size[1]; + for (const auto& ds : downsample_strides) { + int num_grid_w = width / ds; + int num_grid_h = height / ds; + for (int g1 = 0; g1 < num_grid_h; ++g1) { + for (int g0 = 0; g0 < num_grid_w; ++g0) { + (*center_points).emplace_back(NanoDetPlusCenterPoint{g0, g1, ds}); + } + } + } +} + +void WrapAndResize(Mat* mat, std::vector size, std::vector color, + bool keep_ratio = false) { + // Reference: nanodet/data/transform/warp.py#L139 + // size: tuple of input (width, height) + // The default value of `keep_ratio` is `fasle` in + // `config/nanodet-plus-m-1.5x_320.yml` for both + // train and val processes. So, we just let this + // option default `false` according to the official + // implementation in NanoDet and NanoDet-Plus. + // Note, this function will apply a normal resize + // operation to input Mat if the keep_ratio option + // is fasle and the behavior will be the same as + // yolov5's letterbox if keep_ratio is true. + + // with keep_ratio = false (default) + if (!keep_ratio) { + int resize_h = size[1]; + int resize_w = size[0]; + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } + return; + } + // with keep_ratio = true, same as yolov5's letterbox + float r = std::min(size[1] * 1.0f / static_cast(mat->Height()), + size[0] * 1.0f / static_cast(mat->Width())); + + int resize_h = int(round(static_cast(mat->Height()) * r)); + int resize_w = int(round(static_cast(mat->Width()) * r)); + + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } + + int pad_w = size[0] - resize_w; + int pad_h = size[1] - resize_h; + if (pad_h > 0 || pad_w > 0) { + float half_h = pad_h * 1.0 / 2; + int top = int(round(half_h - 0.1)); + int bottom = int(round(half_h + 0.1)); + float half_w = pad_w * 1.0 / 2; + int left = int(round(half_w - 0.1)); + int right = int(round(half_w + 0.1)); + Pad::Run(mat, top, bottom, left, right, color); + } +} + +void GFLRegression(const float* logits, size_t reg_num, float* offset) { + // Hint: reg_num = reg_max + 1 + FDASSERT(((nullptr != logits) && (reg_num != 0)), + "NanoDetPlus: logits is nullptr or reg_num is 0 in GFLRegression."); + // softmax + float total_exp = 0.f; + std::vector softmax_probs(reg_num); + for (size_t i = 0; i < reg_num; ++i) { + softmax_probs[i] = std::exp(logits[i]); + total_exp += softmax_probs[i]; + } + for (size_t i = 0; i < reg_num; ++i) { + softmax_probs[i] = softmax_probs[i] / total_exp; + } + // gfl regression -> offset + for (size_t i = 0; i < reg_num; ++i) { + (*offset) += static_cast(i) * softmax_probs[i]; + } +} + +NanoDetPlus::NanoDetPlus(const std::string& model_file, + const std::string& params_file, + const RuntimeOption& custom_option, + const Frontend& model_format) { + if (model_format == Frontend::ONNX) { + valid_cpu_backends = {Backend::ORT}; // 指定可用的CPU后端 + valid_gpu_backends = {Backend::ORT, 
Backend::TRT}; // 指定可用的GPU后端 + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool NanoDetPlus::Initialize() { + // parameters for preprocess + size = {320, 320}; + padding_value = {0.0f, 0.0f, 0.0f}; + keep_ratio = false; + downsample_strides = {8, 16, 32, 64}; + max_wh = 4096.0f; + reg_max = 7; + + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + // Check if the input shape is dynamic after Runtime already initialized. + is_dynamic_input_ = false; + auto shape = InputInfoOfRuntime(0).shape; + for (int i = 0; i < shape.size(); ++i) { + // if height or width is dynamic + if (i >= 2 && shape[i] <= 0) { + is_dynamic_input_ = true; + break; + } + } + return true; +} + +bool NanoDetPlus::Preprocess( + Mat* mat, FDTensor* output, + std::map>* im_info) { + // NanoDet-Plus preprocess steps + // 1. WrapAndResize + // 2. HWC->CHW + // 3. Normalize or Convert (keep BGR order) + WrapAndResize(mat, size, padding_value, keep_ratio); + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + // Compute `result = mat * alpha + beta` directly by channel + // Reference: /config/nanodet-plus-m-1.5x_320.yml#L89 + // from mean: [103.53, 116.28, 123.675], std: [57.375, 57.12, 58.395] + // x' = (x - mean) / std to x'= x * alpha + beta. + // e.g alpha[0] = 0.017429f = 1.0f / 57.375f + // e.g beta[0] = -103.53f * 0.0174291f + std::vector alpha = {0.017429f, 0.017507f, 0.017125f}; + std::vector beta = {-103.53f * 0.0174291f, -116.28f * 0.0175070f, + -123.675f * 0.0171247f}; // BGR order + Convert::Run(mat, alpha, beta); + + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); // reshape to n, h, w, c + return true; +} + +bool NanoDetPlus::Postprocess( + FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold) { + FDASSERT(infer_result.shape[0] == 1, "Only support batch =1 now."); + result->Clear(); + result->Reserve(infer_result.shape[1]); + if (infer_result.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." 
<< std::endl; + return false; + } + // generate center points with dowmsample strides + std::vector center_points; + GenerateNanoDetPlusCenterPoints(size, downsample_strides, ¢er_points); + + // infer_result shape might look like (1,2125,112) + const int num_cls_reg = infer_result.shape[2]; // e.g 112 + const int num_classes = num_cls_reg - (reg_max + 1) * 4; // e.g 80 + float* data = static_cast(infer_result.Data()); + for (size_t i = 0; i < infer_result.shape[1]; ++i) { + float* scores = data + i * num_cls_reg; + float* max_class_score = std::max_element(scores, scores + num_classes); + float confidence = (*max_class_score); + // filter boxes by conf_threshold + if (confidence <= conf_threshold) { + continue; + } + int32_t label_id = std::distance(scores, max_class_score); + // fetch i-th center point + float grid0 = static_cast(center_points.at(i).grid0); + float grid1 = static_cast(center_points.at(i).grid1); + float downsample_stride = static_cast(center_points.at(i).stride); + // apply gfl regression to get offsets (l,t,r,b) + float* logits = data + i * num_cls_reg + num_classes; // 32|44... + std::vector offsets(4); + for (size_t j = 0; j < 4; ++j) { + GFLRegression(logits + j * (reg_max + 1), reg_max + 1, &offsets[j]); + } + // convert from offsets to [x1, y1, x2, y2] + float l = offsets[0]; // left + float t = offsets[1]; // top + float r = offsets[2]; // right + float b = offsets[3]; // bottom + + float x1 = (grid0 - l) * downsample_stride; // cx - l x1 + float y1 = (grid1 - t) * downsample_stride; // cy - t y1 + float x2 = (grid0 + r) * downsample_stride; // cx + r x2 + float y2 = (grid1 + b) * downsample_stride; // cy + b y2 + + result->boxes.emplace_back( + std::array{x1 + label_id * max_wh, y1 + label_id * max_wh, + x2 + label_id * max_wh, y2 + label_id * max_wh}); + // label_id * max_wh for multi classes NMS + result->label_ids.push_back(label_id); + result->scores.push_back(confidence); + } + utils::NMS(result, nms_iou_threshold); + + // scale the boxes to the origin image shape + auto iter_out = im_info.find("output_shape"); + auto iter_ipt = im_info.find("input_shape"); + FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(), + "Cannot find input_shape or output_shape from im_info."); + float out_h = iter_out->second[0]; + float out_w = iter_out->second[1]; + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + // without keep_ratio + if (!keep_ratio) { + // x' = (x / out_w) * ipt_w = x / (out_w / ipt_w) + // y' = (y / out_h) * ipt_h = y / (out_h / ipt_h) + float r_w = out_w / ipt_w; + float r_h = out_h / ipt_h; + for (size_t i = 0; i < result->boxes.size(); ++i) { + int32_t label_id = (result->label_ids)[i]; + // clip box + result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id; + result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id; + result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id; + result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id; + result->boxes[i][0] = std::max(result->boxes[i][0] / r_w, 0.0f); + result->boxes[i][1] = std::max(result->boxes[i][1] / r_h, 0.0f); + result->boxes[i][2] = std::max(result->boxes[i][2] / r_w, 0.0f); + result->boxes[i][3] = std::max(result->boxes[i][3] / r_h, 0.0f); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + return true; + } + // 
with keep_ratio + float r = std::min(out_h / ipt_h, out_w / ipt_w); + float pad_h = (out_h - ipt_h * r) / 2; + float pad_w = (out_w - ipt_w * r) / 2; + for (size_t i = 0; i < result->boxes.size(); ++i) { + int32_t label_id = (result->label_ids)[i]; + // clip box + result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id; + result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id; + result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id; + result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id; + result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / r, 0.0f); + result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / r, 0.0f); + result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / r, 0.0f); + result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / r, 0.0f); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + return true; +} + +bool NanoDetPlus::Predict(cv::Mat* im, DetectionResult* result, + float conf_threshold, float nms_iou_threshold) { +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_START(0) +#endif + + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if (!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(0, "Preprocess") + TIMERECORD_START(1) +#endif + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." << std::endl; + return false; + } +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(1, "Inference") + TIMERECORD_START(2) +#endif + + if (!Postprocess(output_tensors[0], result, im_info, conf_threshold, + nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(2, "Postprocess") +#endif + return true; +} + +} // namespace rangilyu +} // namespace vision +} // namespace fastdeploy \ No newline at end of file diff --git a/fastdeploy/vision/rangilyu/nanodet_plus.h b/fastdeploy/vision/rangilyu/nanodet_plus.h new file mode 100644 index 0000000000..4184aa18e5 --- /dev/null +++ b/fastdeploy/vision/rangilyu/nanodet_plus.h @@ -0,0 +1,101 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
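The `keep_ratio` branch of `Postprocess` above simply inverts the letterbox preprocess when mapping boxes back to the input image. A NumPy sketch of that inverse mapping (the helper name is illustrative; `boxes` is an Nx4 array of `[x1, y1, x2, y2]` in the padded/resized image):

```python
import numpy as np

def unletterbox(boxes, ipt_hw, out_hw):
    # Undo resize-with-padding: subtract the padding, divide by the
    # scale, then clip to the original image, as in the C++ loop above.
    (ipt_h, ipt_w), (out_h, out_w) = ipt_hw, out_hw
    r = min(out_h / ipt_h, out_w / ipt_w)
    pad_w, pad_h = (out_w - ipt_w * r) / 2, (out_h - ipt_h * r) / 2
    boxes = boxes.astype(np.float32)
    boxes[:, [0, 2]] = ((boxes[:, [0, 2]] - pad_w) / r).clip(0, ipt_w - 1)
    boxes[:, [1, 3]] = ((boxes[:, [1, 3]] - pad_h) / r).clip(0, ipt_h - 1)
    return boxes
```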
+ +#pragma once + +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { + +namespace vision { + +namespace rangilyu { + +class FASTDEPLOY_DECL NanoDetPlus : public FastDeployModel { + public: + // 当model_format为ONNX时,无需指定params_file + // 当model_format为Paddle时,则需同时指定model_file & params_file + NanoDetPlus(const std::string& model_file, + const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX); + + // 定义模型的名称 + std::string ModelName() const { return "RangiLyu/nanodet"; } + + // 模型预测接口,即用户调用的接口 + // im 为用户的输入数据,目前对于CV均定义为cv::Mat + // result 为模型预测的输出结构体 + // conf_threshold 为后处理的参数 + // nms_iou_threshold 为后处理的参数 + virtual bool Predict(cv::Mat* im, DetectionResult* result, + float conf_threshold = 0.35f, + float nms_iou_threshold = 0.5f); + + // 以下为模型在预测时的一些参数,基本是前后处理所需 + // 用户在创建模型后,可根据模型的要求,以及自己的需求 + // 对参数进行修改 + // tuple of input size (width, height), e.g (320, 320) + std::vector size; + // padding value, size should be same with Channels + std::vector padding_value; + // keep aspect ratio or not when perform resize operation. + // This option is set as `false` by default in NanoDet-Plus. + bool keep_ratio; + // downsample strides for NanoDet-Plus to generate anchors, will + // take (8, 16, 32, 64) as default values. + std::vector downsample_strides; + // for offseting the boxes by classes when using NMS, default 4096. + float max_wh; + // reg_max for GFL regression, default 7 + int reg_max; + + private: + // 初始化函数,包括初始化后端,以及其它模型推理需要涉及的操作 + bool Initialize(); + + // 输入图像预处理操作 + // Mat为FastDeploy定义的数据结构 + // FDTensor为预处理后的Tensor数据,传给后端进行推理 + // im_info为预处理过程保存的数据,在后处理中需要用到 + bool Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info); + + // 后端推理结果后处理,输出给用户 + // infer_result 为后端推理后的输出Tensor + // result 为模型预测的结果 + // im_info 为预处理记录的信息,后处理用于还原box + // conf_threshold 后处理时过滤box的置信度阈值 + // nms_iou_threshold 后处理时NMS设定的iou阈值 + bool Postprocess(FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold); + + // 查看输入是否为动态维度的 不建议直接使用 不同模型的逻辑可能不一致 + bool IsDynamicInput() const { return is_dynamic_input_; } + + // whether to inference with dynamic shape (e.g ONNX export with dynamic shape + // or not.) + // RangiLyu/nanodet official 'export_onnx.py' script will export static ONNX + // by default. + // This value will auto check by fastdeploy after the internal Runtime + // initialized. + bool is_dynamic_input_; +}; + +} // namespace rangilyu +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/rangilyu/rangilyu_pybind.cc b/fastdeploy/vision/rangilyu/rangilyu_pybind.cc new file mode 100644 index 0000000000..70bde60057 --- /dev/null +++ b/fastdeploy/vision/rangilyu/rangilyu_pybind.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/pybind/main.h" + +namespace fastdeploy { +void BindRangiLyu(pybind11::module& m) { + auto rangilyu_module = + m.def_submodule("rangilyu", "https://github.com/RangiLyu/nanodet"); + pybind11::class_( + rangilyu_module, "NanoDetPlus") + .def(pybind11::init()) + .def("predict", + [](vision::rangilyu::NanoDetPlus& self, pybind11::array& data, + float conf_threshold, float nms_iou_threshold) { + auto mat = PyArrayToCvMat(data); + vision::DetectionResult res; + self.Predict(&mat, &res, conf_threshold, nms_iou_threshold); + return res; + }) + .def_readwrite("size", &vision::rangilyu::NanoDetPlus::size) + .def_readwrite("padding_value", + &vision::rangilyu::NanoDetPlus::padding_value) + .def_readwrite("keep_ratio", &vision::rangilyu::NanoDetPlus::keep_ratio) + .def_readwrite("downsample_strides", + &vision::rangilyu::NanoDetPlus::downsample_strides) + .def_readwrite("max_wh", &vision::rangilyu::NanoDetPlus::max_wh) + .def_readwrite("reg_max", &vision::rangilyu::NanoDetPlus::reg_max); +} +} // namespace fastdeploy diff --git a/fastdeploy/vision/vision_pybind.cc b/fastdeploy/vision/vision_pybind.cc index 22c4f0bc2e..42fcebff47 100644 --- a/fastdeploy/vision/vision_pybind.cc +++ b/fastdeploy/vision/vision_pybind.cc @@ -23,6 +23,7 @@ void BindPPSeg(pybind11::module& m); void BindUltralytics(pybind11::module& m); void BindMeituan(pybind11::module& m); void BindMegvii(pybind11::module& m); +void BindRangiLyu(pybind11::module& m); #ifdef ENABLE_VISION_VISUALIZE void BindVisualize(pybind11::module& m); #endif @@ -56,6 +57,7 @@ void BindVision(pybind11::module& m) { BindWongkinyiu(m); BindMeituan(m); BindMegvii(m); + BindRangiLyu(m); #ifdef ENABLE_VISION_VISUALIZE BindVisualize(m); #endif diff --git a/model_zoo/vision/nanodet_plus/README.md b/model_zoo/vision/nanodet_plus/README.md new file mode 100644 index 0000000000..164f7691fb --- /dev/null +++ b/model_zoo/vision/nanodet_plus/README.md @@ -0,0 +1,46 @@ +# NanoDetPlus部署示例 + +当前支持模型版本为:[NanoDetPlus v1.0.0-alpha-1](https://github.com/RangiLyu/nanodet/releases/tag/v1.0.0-alpha-1) + +本文档说明如何进行[NanoDetPlus](https://github.com/RangiLyu/nanodet)的快速部署推理。本目录结构如下 +``` +. 
+├── cpp # C++ 代码目录 +│   ├── CMakeLists.txt # C++ 代码编译CMakeLists文件 +│   ├── README.md # C++ 代码编译部署文档 +│   └── nanodet_plus.cc # C++ 示例代码 +├── README.md # YOLOX 部署文档 +└── nanodet_plus.py # Python示例代码 +``` + +## 安装FastDeploy + +使用如下命令安装FastDeploy,注意到此处安装的是`vision-cpu`,也可根据需求安装`vision-gpu` +``` +# 安装fastdeploy-python工具 +pip install fastdeploy-python + +# 安装vision-cpu模块 +fastdeploy install vision-cpu +``` + +## Python部署 + +执行如下代码即会自动下载NanoDetPlus模型和测试图片 +``` +python nanodet_plus.py +``` + +执行完成后会将可视化结果保存在本地`vis_result.jpg`,同时输出检测结果如下 +``` +DetectionResult: [xmin, ymin, xmax, ymax, score, label_id] +5.710144,220.634033, 807.854370, 724.089111, 0.825635, 5 +45.646439,393.694061, 229.267044, 903.998413, 0.818263, 0 +218.289322,402.268829, 342.083252, 861.766479, 0.709301, 0 +698.587036,325.627197, 809.000000, 876.990967, 0.630235, 0 +``` + +## 其它文档 + +- [C++部署](./cpp/README.md) +- [NanoDetPlus API文档](./api.md) diff --git a/model_zoo/vision/nanodet_plus/api.md b/model_zoo/vision/nanodet_plus/api.md new file mode 100644 index 0000000000..b428e39dfd --- /dev/null +++ b/model_zoo/vision/nanodet_plus/api.md @@ -0,0 +1,71 @@ +# NanoDetPlus API说明 + +## Python API + +### NanoDetPlus类 +``` +fastdeploy.vision.rangilyu.NanoDetPlus(model_file, params_file=None, runtime_option=None, model_format=fd.Frontend.ONNX) +``` +NanoDetPlus模型加载和初始化,当model_format为`fd.Frontend.ONNX`时,只需提供model_file,如`nanodet-plus-m_320.onnx`;当model_format为`fd.Frontend.PADDLE`时,则需同时提供model_file和params_file。 + +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(Frontend): 模型格式 + +#### predict函数 +> ``` +> NanoDetPlus.predict(image_data, conf_threshold=0.35, nms_iou_threshold=0.5) +> ``` +> 模型预测结口,输入图像直接输出检测结果。 +> +> **参数** +> +> > * **image_data**(np.ndarray): 输入数据,注意需为HWC,BGR格式 +> > * **conf_threshold**(float): 检测框置信度过滤阈值 +> > * **nms_iou_threshold**(float): NMS处理过程中iou阈值 + +示例代码参考[nanodet_plus.py](./nanodet_plus.py) + + +## C++ API + +### NanoDetPlus类 +``` +fastdeploy::vision::rangilyu::NanoDetPlus( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX) +``` +NanoDetPlus模型加载和初始化,当model_format为`Frontend::ONNX`时,只需提供model_file,如`nanodet-plus-m_320.onnx`;当model_format为`Frontend::PADDLE`时,则需同时提供model_file和params_file。 + +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(Frontend): 模型格式 + +#### Predict函数 +> ``` +> NanoDetPlus::Predict(cv::Mat* im, DetectionResult* result, +> float conf_threshold = 0.35, +> float nms_iou_threshold = 0.5) +> ``` +> 模型预测接口,输入图像直接输出检测结果。 +> +> **参数** +> +> > * **im**: 输入图像,注意需为HWC,BGR格式 +> > * **result**: 检测结果,包括检测框,各个框的置信度 +> > * **conf_threshold**: 检测框置信度过滤阈值 +> > * **nms_iou_threshold**: NMS处理过程中iou阈值 + +示例代码参考[cpp/nanodet_plus.cc](cpp/nanodet_plus.cc) + +## 其它API使用 + +- [模型部署RuntimeOption配置](../../../docs/api/runtime_option.md) diff --git a/model_zoo/vision/nanodet_plus/cpp/CMakeLists.txt b/model_zoo/vision/nanodet_plus/cpp/CMakeLists.txt new file mode 100644 index 0000000000..7a78ef9e4d --- /dev/null +++ b/model_zoo/vision/nanodet_plus/cpp/CMakeLists.txt @@ -0,0 +1,17 @@ +PROJECT(nanodet_plus_demo C CXX) +CMAKE_MINIMUM_REQUIRED(VERSION 3.16) + +# 在低版本ABI环境中,通过如下代码进行兼容性编译 +# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) + +# 指定下载解压后的fastdeploy库路径 +set(FASTDEPLOY_INSTALL_DIR 
${PROJECT_SOURCE_DIR}/fastdeploy-linux-x64-0.0.3/) + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# 添加FastDeploy依赖头文件 +include_directories(${FASTDEPLOY_INCS}) + +add_executable(nanodet_plus_demo ${PROJECT_SOURCE_DIR}/nanodet_plus.cc) +# 添加FastDeploy库依赖 +target_link_libraries(nanodet_plus_demo ${FASTDEPLOY_LIBS}) diff --git a/model_zoo/vision/nanodet_plus/cpp/README.md b/model_zoo/vision/nanodet_plus/cpp/README.md new file mode 100644 index 0000000000..03dc65a0ab --- /dev/null +++ b/model_zoo/vision/nanodet_plus/cpp/README.md @@ -0,0 +1,30 @@ +# 编译NanoDetPlus示例 + +当前支持模型版本为:[NanoDetPlus v1.0.0-alpha-1](https://github.com/RangiLyu/nanodet/releases/tag/v1.0.0-alpha-1) + +``` +# 下载和解压预测库 +wget https://bj.bcebos.com/paddle2onnx/fastdeploy/fastdeploy-linux-x64-0.0.3.tgz +tar xvf fastdeploy-linux-x64-0.0.3.tgz + +# 编译示例代码 +mkdir build & cd build +cmake .. +make -j + +# 下载模型和图片 +wget https://github.com/RangiLyu/nanodet/releases/download/v1.0.0-alpha-1/nanodet-plus-m_320.onnx +wget https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg + +# 执行 +./nanodet_plus_demo +``` + +执行完后可视化的结果保存在本地`vis_result.jpg`,同时会将检测框输出在终端,如下所示 +``` +DetectionResult: [xmin, ymin, xmax, ymax, score, label_id] +5.710144,220.634033, 807.854370, 724.089111, 0.825635, 5 +45.646439,393.694061, 229.267044, 903.998413, 0.818263, 0 +218.289322,402.268829, 342.083252, 861.766479, 0.709301, 0 +698.587036,325.627197, 809.000000, 876.990967, 0.630235, 0 +``` diff --git a/model_zoo/vision/nanodet_plus/cpp/nanodet_plus.cc b/model_zoo/vision/nanodet_plus/cpp/nanodet_plus.cc new file mode 100644 index 0000000000..b252bf6f8b --- /dev/null +++ b/model_zoo/vision/nanodet_plus/cpp/nanodet_plus.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" + +int main() { + namespace vis = fastdeploy::vision; + auto model = vis::rangilyu::NanoDetPlus("nanodet-plus-m_320.onnx"); + if (!model.Initialized()) { + std::cerr << "Init Failed." << std::endl; + return -1; + } + cv::Mat im = cv::imread("bus.jpg"); + cv::Mat vis_im = im.clone(); + + vis::DetectionResult res; + if (!model.Predict(&im, &res)) { + std::cerr << "Prediction Failed." 
<< std::endl; + return -1; + } + + // 输出预测框结果 + std::cout << res.Str() << std::endl; + + // 可视化预测结果 + vis::Visualize::VisDetection(&vis_im, res); + cv::imwrite("vis_result.jpg", vis_im); + return 0; +} diff --git a/model_zoo/vision/nanodet_plus/nanodet_plus.py b/model_zoo/vision/nanodet_plus/nanodet_plus.py new file mode 100644 index 0000000000..4101d20408 --- /dev/null +++ b/model_zoo/vision/nanodet_plus/nanodet_plus.py @@ -0,0 +1,23 @@ +import fastdeploy as fd +import cv2 + +# 下载模型和测试图片 +model_url = "https://github.com/RangiLyu/nanodet/releases/download/v1.0.0-alpha-1/nanodet-plus-m_320.onnx" +test_jpg_url = "https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg" +fd.download(model_url, ".", show_progress=True) +fd.download(test_jpg_url, ".", show_progress=True) + +# 加载模型 +model = fd.vision.rangilyu.NanoDetPlus("nanodet-plus-m_320.onnx") + +# 预测图片 +im = cv2.imread("bus.jpg") +result = model.predict(im, conf_threshold=0.35, nms_iou_threshold=0.5) + +# 可视化结果 +fd.vision.visualize.vis_detection(im, result) +cv2.imwrite("vis_result.jpg", im) + +# 输出预测结果 +print(result) +print(model.runtime_option) From 51ecb407d491281ac18332165ec8e343136517b7 Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 22 Jul 2022 09:53:46 +0800 Subject: [PATCH 3/9] Add Paddle Inference as backend (#33) * Add Paddle Inference as backend * Update CMakeLists.txt * Fix default backend option * Fix some log bug and backend choosing strategy * Add version of g++ --- CMakeLists.txt | 20 +++- FastDeploy.cmake.in | 50 ++++++++- external/paddle_inference.cmake | 107 +++++++++++++++++++ external/summary.cmake | 10 +- fastdeploy/backends/paddle/paddle_backend.cc | 105 ++++++++++++++++++ fastdeploy/backends/paddle/paddle_backend.h | 77 +++++++++++++ fastdeploy/backends/paddle/util.cc | 72 +++++++++++++ fastdeploy/core/config.h.in | 4 + fastdeploy/fastdeploy_model.cc | 85 ++++++--------- fastdeploy/fastdeploy_runtime.cc | 85 +++++++++++++-- fastdeploy/fastdeploy_runtime.h | 10 +- fastdeploy/vision/ppdet/ppyoloe.cc | 16 +-- model_zoo/vision/ppyoloe/cpp/ppyoloe.cc | 6 +- 13 files changed, 569 insertions(+), 78 deletions(-) create mode 100644 external/paddle_inference.cmake create mode 100644 fastdeploy/backends/paddle/paddle_backend.cc create mode 100644 fastdeploy/backends/paddle/paddle_backend.h create mode 100644 fastdeploy/backends/paddle/util.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index b6ac77f4f7..fa2d421a19 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ option(ENABLE_PADDLE_FRONTEND "if to enable PaddlePaddle frontend to support loa option(WITH_GPU "if WITH_GPU=ON, will enable onnxruntime-gpu/paddle-infernce-gpu" OFF) option(ENABLE_ORT_BACKEND "if to enable onnxruntime backend." OFF) option(ENABLE_TRT_BACKEND "if to enable tensorrt backend." OFF) +option(ENABLE_PADDLE_BACKEND "if to enable paddle backend." OFF) option(CUDA_DIRECTORY "if build tensorrt backend, need to define path of cuda library.") option(TRT_DIRECTORY "if build tensorrt backend, need to define path of tensorrt library.") option(ENABLE_VISION "if to enable vision models usage." OFF) @@ -39,7 +40,7 @@ option(ENABLE_OPENCV_CUDA "if to enable opencv with cuda, this will allow proces option(ENABLE_DEBUG "if to enable print debug information, this may reduce performance." OFF) # Whether to build fastdeply with vision/text/... examples, only for testings. 
-option(WITH_VISION_EXAMPLES "Whether to build fastdeply with vision examples" ON) +option(WITH_VISION_EXAMPLES "Whether to build fastdeply with vision examples" OFF) if(ENABLE_DEBUG) add_definitions(-DFASTDEPLOY_DEBUG) @@ -63,10 +64,11 @@ endif() add_definitions(-DFASTDEPLOY_LIB) file(GLOB_RECURSE ALL_DEPLOY_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/*.cc) file(GLOB_RECURSE DEPLOY_ORT_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/backends/ort/*.cc) +file(GLOB_RECURSE DEPLOY_PADDLE_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/backends/paddle/*.cc) file(GLOB_RECURSE DEPLOY_TRT_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/backends/tensorrt/*.cc ${PROJECT_SOURCE_DIR}/fastdeploy/backends/tensorrt/*.cpp) file(GLOB_RECURSE DEPLOY_VISION_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/vision/*.cc) file(GLOB_RECURSE DEPLOY_PYBIND_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/pybind/*.cc ${PROJECT_SOURCE_DIR}/fastdeploy/*_pybind.cc) -list(REMOVE_ITEM ALL_DEPLOY_SRCS ${DEPLOY_ORT_SRCS} ${DEPLOY_TRT_SRCS} ${DEPLOY_VISION_SRCS}) +list(REMOVE_ITEM ALL_DEPLOY_SRCS ${DEPLOY_ORT_SRCS} ${DEPLOY_PADDLE_SRCS} ${DEPLOY_TRT_SRCS} ${DEPLOY_VISION_SRCS}) set(DEPEND_LIBS "") @@ -87,6 +89,13 @@ if(ENABLE_ORT_BACKEND) list(APPEND DEPEND_LIBS external_onnxruntime) endif() +if(ENABLE_PADDLE_BACKEND) + add_definitions(-DENABLE_PADDLE_BACKEND) + list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_PADDLE_SRCS}) + include(external/paddle_inference.cmake) + list(APPEND DEPEND_LIBS external_paddle_inference external_dnnl external_omp) +endif() + if(WITH_GPU) if(APPLE) message(FATAL_ERROR "Cannot enable GPU while compling in Mac OSX.") @@ -280,3 +289,10 @@ if(BUILD_FASTDEPLOY_PYTHON) endif() endif(BUILD_FASTDEPLOY_PYTHON) + +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.4.0") + string(STRIP "${CMAKE_CXX_COMPILER_VERSION}" CMAKE_CXX_COMPILER_VERSION) + message(WARNING "[WARNING] FastDeploy require g++ version >= 5.4.0, but now your g++ version is ${CMAKE_CXX_COMPILER_VERSION}, this may cause failure! 
Use -DCMAKE_CXX_COMPILER to define path of your compiler.") + endif() +endif() diff --git a/FastDeploy.cmake.in b/FastDeploy.cmake.in index 80ceb9b79e..e8c0bb3d59 100644 --- a/FastDeploy.cmake.in +++ b/FastDeploy.cmake.in @@ -2,7 +2,9 @@ CMAKE_MINIMUM_REQUIRED (VERSION 3.16) set(WITH_GPU @WITH_GPU@) set(ENABLE_ORT_BACKEND @ENABLE_ORT_BACKEND@) -set(ENABLE_TRT_BACKEND @ENABLE_TRT_BACKEND&) +set(ENABLE_PADDLE_BACKEND @ENABLE_PADDLE_BACKEND@) +set(PADDLEINFERENCE_VERSION @PADDLEINFERENCE_VERSION@) +set(ENABLE_TRT_BACKEND @ENABLE_TRT_BACKEND@) set(ENABLE_PADDLE_FRONTEND @ENABLE_PADDLE_FRONTEND@) set(ENABLE_VISION @ENABLE_VISION@) set(ENABLE_OPENCV_CUDA @ENABLE_OPENCV_CUDA@) @@ -17,16 +19,35 @@ endif() find_library(FDLIB fastdeploy ${CMAKE_CURRENT_LIST_DIR}/lib) list(APPEND FASTDEPLOY_LIBS ${FDLIB}) + if(ENABLE_ORT_BACKEND) find_library(ORT_LIB onnxruntime ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/onnxruntime/lib) list(APPEND FASTDEPLOY_LIBS ${ORT_LIB}) endif() +if(ENABLE_PADDLE_BACKEND) + find_library(PADDLE_LIB paddle_inference ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/paddle_inference/paddle/lib) + if(WIN32) + set(DNNL_LIB "${CMAKE_CURRENT_LIST_DIR}/third_libs/install/paddle_inference/third_party/install/mkldnn/lib/mkldnn.lib") + set(IOMP_LIB "${CMAKE_CURRENT_LIST_DIR}/third_libs/install/paddle_inference/third_party/install/mklml/lib/libiomp5.lib") + elseif(APPLE) + set(DNNL_LIB "") + set(IOMP_LIB "") + else() + set(DNNL_LIB "${CMAKE_CURRENT_LIST_DIR}/third_libs/install/paddle_inference/third_party/install/mkldnn/lib/libmkldnn.so.0") + set(IOMP_LIB "${CMAKE_CURRENT_LIST_DIR}/third_libs/install/paddle_inference/third_party/install/mklml/lib/libiomp5.so") + endif() + list(APPEND FASTDEPLOY_LIBS ${PADDLE_LIB} ${DNNL_LIB} ${IOMP_LIB}) +endif() + if(WITH_GPU) if (NOT CUDA_DIRECTORY) - message(FATAL_ERROR "[FastDeploy] Please define CUDA_DIRECTORY, e.g -DCUDA_DIRECTORY=/usr/local/cuda") + set(CUDA_DIRECTORY "/usr/local/cuda") endif() find_library(CUDA_LIB cudart ${CUDA_DIRECTORY}/lib64) + if(NOT CUDA_LIB) + message(FATAL_ERROR "[FastDeploy] Cannot find library cudart in ${CUDA_DIRECTORY}, Please define CUDA_DIRECTORY, e.g -DCUDA_DIRECTORY=/path/to/cuda") + endif() list(APPEND FASTDEPLOY_LIBS ${CUDA_LIB}) if (ENABLE_TRT_BACKEND) @@ -61,3 +82,28 @@ if(ENABLE_PADDLE_FRONTEND) find_library(PADDLE2ONNX_LIB paddle2onnx ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/paddle2onnx/lib) list(APPEND FASTDEPLOY_LIBS ${PADDLE2ONNX_LIB}) endif() + +# Print compiler information +message(STATUS "") +message(STATUS "*************FastDeploy Building Summary**********") +message(STATUS " CMake version : ${CMAKE_VERSION}") +message(STATUS " CMake command : ${CMAKE_COMMAND}") +message(STATUS " System : ${CMAKE_SYSTEM_NAME}") +message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") +message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}") +message(STATUS " WITH_GPU : ${WITH_GPU}") +message(STATUS " ENABLE_ORT_BACKEND : ${ENABLE_ORT_BACKEND}") +message(STATUS " ENABLE_PADDLE_BACKEND : ${ENABLE_PADDLE_BACKEND}") +if(ENABLE_PADDLE_BACKEND) + message(STATUS " Paddle Inference version : ${PADDLEINFERENCE_VERSION}") +endif() +message(STATUS " ENABLE_TRT_BACKEND : ${ENABLE_TRT_BACKEND}") +message(STATUS " ENABLE_VISION : ${ENABLE_VISION}") + +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.4.0") + string(STRIP "${CMAKE_CXX_COMPILER_VERSION}" CMAKE_CXX_COMPILER_VERSION) + message(WARNING "[WARNING] FastDeploy require 
g++ version >= 5.4.0, but now your g++ version is ${CMAKE_CXX_COMPILER_VERSION}, this may cause failure! Use -DCMAKE_CXX_COMPILER to define path of your compiler.") + endif() +endif() diff --git a/external/paddle_inference.cmake b/external/paddle_inference.cmake new file mode 100644 index 0000000000..41aa740f62 --- /dev/null +++ b/external/paddle_inference.cmake @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +include(ExternalProject) + +set(PADDLEINFERENCE_PROJECT "extern_paddle_inference") +set(PADDLEINFERENCE_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle_inference) +set(PADDLEINFERENCE_SOURCE_DIR + ${THIRD_PARTY_PATH}/paddle_inference/src/${PADDLEINFERENCE_PROJECT}) +set(PADDLEINFERENCE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle_inference) +set(PADDLEINFERENCE_INC_DIR + "${PADDLEINFERENCE_INSTALL_DIR}/paddle/include" + CACHE PATH "paddle_inference include directory." FORCE) +set(PADDLEINFERENCE_LIB_DIR + "${PADDLEINFERENCE_INSTALL_DIR}/paddle/lib/" + CACHE PATH "paddle_inference lib directory." FORCE) +set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" + "${PADDLEINFERENCE_LIB_DIR}") + +include_directories(${PADDLEINFERENCE_INC_DIR}) +if(WIN32) + set(PADDLEINFERENCE_COMPILE_LIB + "${PADDLEINFERENCE_INSTALL_DIR}/paddle/lib/paddle_inference.lib" + CACHE FILEPATH "paddle_inference compile library." FORCE) + set(DNNL_LIB "") + set(OMP_LIB "") +elseif(APPLE) + set(PADDLEINFERENCE_COMPILE_LIB + "${PADDLEINFERENCE_INSTALL_DIR}/paddle/lib/libpaddle_inference.dylib" + CACHE FILEPATH "paddle_inference compile library." FORCE) + set(DNNL_LIB "") + set(OMP_LIB "") +else() + set(PADDLEINFERENCE_COMPILE_LIB + "${PADDLEINFERENCE_INSTALL_DIR}/paddle/lib/libpaddle_inference.so" + CACHE FILEPATH "paddle_inference compile library." 
FORCE) + set(DNNL_LIB "${PADDLEINFERENCE_INSTALL_DIR}/third_party/install/mkldnn/lib/libdnnl.so.2") + set(OMP_LIB "${PADDLEINFERENCE_INSTALL_DIR}/third_party/install/mklml/lib/libiomp5.so") +endif(WIN32) + +set(PADDLEINFERENCE_URL_BASE "https://bj.bcebos.com/paddle2onnx/libs/") +set(PADDLEINFERENCE_VERSION "2.3.1") +if(WIN32) + message(FATAL_ERROR "Paddle Backend doesn't support Windows now.") + set(PADDLEINFERENCE_FILE "paddle_inference-win-x64-${PADDLEINFERENCE_VERSION}.zip") +elseif(APPLE) + message(FATAL_ERROR "Paddle Backend doesn't support Mac OSX now.") + if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64") + set(PADDLEINFERENCE_FILE "paddle_inference-osx-arm64-${PADDLEINFERENCE_VERSION}.tgz") + else() + set(PADDLEINFERENCE_FILE "paddle_inference-osx-x86_64-${PADDLEINFERENCE_VERSION}.tgz") + endif() +else() + if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + message(FATAL_ERROR "Paddle Backend doesn't support linux aarch64 now.") + set(PADDLEINFERENCE_FILE "paddle_inference-linux-aarch64-${PADDLEINFERENCE_VERSION}.tgz") + else() + set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-${PADDLEINFERENCE_VERSION}.tgz") + if(WITH_GPU) + set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-gpu-${PADDLEINFERENCE_VERSION}.tgz") + endif() + endif() +endif() +set(PADDLEINFERENCE_URL "${PADDLEINFERENCE_URL_BASE}${PADDLEINFERENCE_FILE}") + +ExternalProject_Add( + ${PADDLEINFERENCE_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${PADDLEINFERENCE_URL} + PREFIX ${PADDLEINFERENCE_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E remove_directory ${PADDLEINFERENCE_INSTALL_DIR} && + ${CMAKE_COMMAND} -E make_directory ${PADDLEINFERENCE_INSTALL_DIR} && + ${CMAKE_COMMAND} -E rename ${PADDLEINFERENCE_SOURCE_DIR}/paddle/ + ${PADDLEINFERENCE_INSTALL_DIR}/paddle && ${CMAKE_COMMAND} -E rename + ${PADDLEINFERENCE_SOURCE_DIR}/third_party ${PADDLEINFERENCE_INSTALL_DIR}/third_party && + ${CMAKE_COMMAND} -E rename ${PADDLEINFERENCE_SOURCE_DIR}/version.txt ${PADDLEINFERENCE_INSTALL_DIR}/version.txt + BUILD_BYPRODUCTS ${PADDLEINFERENCE_COMPILE_LIB}) + +add_library(external_paddle_inference STATIC IMPORTED GLOBAL) +set_property(TARGET external_paddle_inference PROPERTY IMPORTED_LOCATION + ${PADDLEINFERENCE_COMPILE_LIB}) +add_dependencies(external_paddle_inference ${PADDLEINFERENCE_PROJECT}) + +add_library(external_dnnl STATIC IMPORTED GLOBAL) +set_property(TARGET external_dnnl PROPERTY IMPORTED_LOCATION + ${DNNL_LIB}) +add_dependencies(external_dnnl ${PADDLEINFERENCE_PROJECT}) + +add_library(external_omp STATIC IMPORTED GLOBAL) +set_property(TARGET external_omp PROPERTY IMPORTED_LOCATION + ${OMP_LIB}) +add_dependencies(external_omp ${PADDLEINFERENCE_PROJECT}) diff --git a/external/summary.cmake b/external/summary.cmake index 3c2393eda6..bd5e793902 100644 --- a/external/summary.cmake +++ b/external/summary.cmake @@ -30,11 +30,17 @@ function(fastdeploy_summary) message(STATUS "") message(STATUS " FastDeploy version : ${FASTDEPLOY_VERSION}") message(STATUS " Paddle2ONNX version : ${PADDLE2ONNX_VERSION}") - message(STATUS " ONNXRuntime version : ${ONNXRUNTIME_VERSION}") message(STATUS " ENABLE_ORT_BACKEND : ${ENABLE_ORT_BACKEND}") + message(STATUS " ENABLE_PADDLE_BACKEND : ${ENABLE_PADDLE_BACKEND}") + message(STATUS " ENABLE_TRT_BACKEND : ${ENABLE_TRT_BACKEND}") + if(ENABLE_ORT_BACKEND) + message(STATUS " ONNXRuntime version : ${ONNXRUNTIME_VERSION}") + endif() + if(ENABLE_PADDLE_BACKEND) + message(STATUS " Paddle Inference version : 
${PADDLEINFERENCE_VERSION}")
+  endif()
   if(WITH_GPU)
     message(STATUS "  WITH_GPU : ${WITH_GPU}")
-    message(STATUS "  ENABLE_TRT_BACKEND : ${ENABLE_TRT_BACKEND}")
     message(STATUS "  CUDA_DIRECTORY : ${CUDA_DIRECTORY}")
     message(STATUS "  TRT_DRECTORY : ${TRT_DIRECTORY}")
   endif()
diff --git a/fastdeploy/backends/paddle/paddle_backend.cc b/fastdeploy/backends/paddle/paddle_backend.cc
new file mode 100644
index 0000000000..2fae38937d
--- /dev/null
+++ b/fastdeploy/backends/paddle/paddle_backend.cc
@@ -0,0 +1,105 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/backends/paddle/paddle_backend.h"
+
+namespace fastdeploy {
+
+void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
+  if (option.use_gpu) {
+    config_.EnableUseGpu(option.gpu_mem_init_size, option.gpu_id);
+  } else {
+    config_.DisableGpu();
+    if (option.enable_mkldnn) {
+      config_.EnableMKLDNN();
+      config_.SetMkldnnCacheCapacity(option.mkldnn_cache_size);
+    }
+  }
+  config_.SetCpuMathLibraryNumThreads(option.cpu_thread_num);
+}
+
+bool PaddleBackend::InitFromPaddle(const std::string& model_file,
+                                   const std::string& params_file,
+                                   const PaddleBackendOption& option) {
+  if (initialized_) {
+    FDERROR << "PaddleBackend is already initialized, cannot initialize again."
+ << std::endl; + return false; + } + config_.SetModel(model_file, params_file); + BuildOption(option); + predictor_ = paddle_infer::CreatePredictor(config_); + std::vector input_names = predictor_->GetInputNames(); + std::vector output_names = predictor_->GetOutputNames(); + for (size_t i = 0; i < input_names.size(); ++i) { + auto handle = predictor_->GetInputHandle(input_names[i]); + TensorInfo info; + auto shape = handle->shape(); + info.shape.assign(shape.begin(), shape.end()); + info.dtype = PaddleDataTypeToFD(handle->type()); + info.name = input_names[i]; + inputs_desc_.emplace_back(info); + } + for (size_t i = 0; i < output_names.size(); ++i) { + auto handle = predictor_->GetOutputHandle(output_names[i]); + TensorInfo info; + auto shape = handle->shape(); + info.shape.assign(shape.begin(), shape.end()); + info.dtype = PaddleDataTypeToFD(handle->type()); + info.name = output_names[i]; + outputs_desc_.emplace_back(info); + } + initialized_ = true; + return true; +} + +TensorInfo PaddleBackend::GetInputInfo(int index) { + FDASSERT(index < NumInputs(), "The index:" + std::to_string(index) + + " should less than the number of inputs:" + + std::to_string(NumInputs()) + "."); + return inputs_desc_[index]; +} + +TensorInfo PaddleBackend::GetOutputInfo(int index) { + FDASSERT(index < NumOutputs(), + "The index:" + std::to_string(index) + + " should less than the number of outputs:" + + std::to_string(NumOutputs()) + "."); + return outputs_desc_[index]; +} + +bool PaddleBackend::Infer(std::vector& inputs, + std::vector* outputs) { + if (inputs.size() != inputs_desc_.size()) { + FDERROR << "[PaddleBackend] Size of inputs(" << inputs.size() + << ") should keep same with the inputs of this model(" + << inputs_desc_.size() << ")." << std::endl; + return false; + } + + for (size_t i = 0; i < inputs.size(); ++i) { + auto handle = predictor_->GetInputHandle(inputs[i].name); + ShareTensorFromCpu(handle.get(), inputs[i]); + } + + predictor_->Run(); + outputs->resize(outputs_desc_.size()); + for (size_t i = 0; i < outputs_desc_.size(); ++i) { + auto handle = predictor_->GetOutputHandle(outputs_desc_[i].name); + CopyTensorToCpu(handle, &((*outputs)[i])); + } + return true; +} + +} // namespace fastdeploy diff --git a/fastdeploy/backends/paddle/paddle_backend.h b/fastdeploy/backends/paddle/paddle_backend.h new file mode 100644 index 0000000000..146dcfd877 --- /dev/null +++ b/fastdeploy/backends/paddle/paddle_backend.h @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
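[Editor's note] Before the header body below, a minimal usage sketch may help. It is assembled only from the calls this patch defines (`InitFromPaddle`, `NumInputs`, `Infer`); the model paths, the default-constructed `FDTensor`s, and the input-filling step are illustrative placeholders, not code from the patch:
```
#include <vector>
#include "fastdeploy/backends/paddle/paddle_backend.h"

int main() {
  fastdeploy::PaddleBackendOption option;
  option.use_gpu = false;     // CPU path; MKLDNN stays enabled by default
  option.cpu_thread_num = 4;

  fastdeploy::PaddleBackend backend;
  // Paths are illustrative placeholders.
  if (!backend.InitFromPaddle("model.pdmodel", "model.pdiparams", option)) {
    return -1;
  }

  // One FDTensor per input; fill shape/data according to GetInputInfo(i).
  std::vector<fastdeploy::FDTensor> inputs(backend.NumInputs());
  std::vector<fastdeploy::FDTensor> outputs;
  return backend.Infer(inputs, &outputs) ? 0 : -1;
}
```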
+
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "fastdeploy/backends/backend.h"
+#include "paddle_inference_api.h"  // NOLINT
+
+namespace fastdeploy {
+
+struct PaddleBackendOption {
+#ifdef WITH_GPU
+  bool use_gpu = true;
+#else
+  bool use_gpu = false;
+#endif
+  bool enable_mkldnn = true;
+
+  int mkldnn_cache_size = 1;
+  int cpu_thread_num = 8;
+  // initial memory size (MB) for the GPU memory pool
+  int gpu_mem_init_size = 100;
+  // GPU device id
+  int gpu_id = 0;
+};
+
+// Share memory buffer with paddle_infer::Tensor from fastdeploy::FDTensor
+void ShareTensorFromCpu(paddle_infer::Tensor* tensor, FDTensor& fd_tensor);
+
+// Copy memory data from paddle_infer::Tensor to fastdeploy::FDTensor
+void CopyTensorToCpu(std::unique_ptr<paddle_infer::Tensor>& tensor,
+                     FDTensor* fd_tensor);
+
+// Convert data type from paddle inference to fastdeploy
+FDDataType PaddleDataTypeToFD(const paddle_infer::DataType& dtype);
+
+class PaddleBackend : public BaseBackend {
+ public:
+  PaddleBackend() {}
+  void BuildOption(const PaddleBackendOption& option);
+
+  bool InitFromPaddle(
+      const std::string& model_file, const std::string& params_file,
+      const PaddleBackendOption& option = PaddleBackendOption());
+
+  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs);
+
+  int NumInputs() const { return inputs_desc_.size(); }
+
+  int NumOutputs() const { return outputs_desc_.size(); }
+
+  TensorInfo GetInputInfo(int index);
+  TensorInfo GetOutputInfo(int index);
+
+ private:
+  paddle_infer::Config config_;
+  std::shared_ptr<paddle_infer::Predictor> predictor_;
+  std::vector<TensorInfo> inputs_desc_;
+  std::vector<TensorInfo> outputs_desc_;
+};
+}  // namespace fastdeploy
diff --git a/fastdeploy/backends/paddle/util.cc b/fastdeploy/backends/paddle/util.cc
new file mode 100644
index 0000000000..2469596aed
--- /dev/null
+++ b/fastdeploy/backends/paddle/util.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
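[Editor's note] The two free helpers declared in the header above do the tensor bridging for `Infer()` and are defined in this file. A condensed sketch of the intended call pattern, simplified from the definitions that follow (`predictor`, the tensor names, and the FDTensors are placeholders):
```
#include "fastdeploy/backends/paddle/paddle_backend.h"

// Sketch: bridge one named input/output pair through a live predictor.
void RunOnce(paddle_infer::Predictor* predictor,
             fastdeploy::FDTensor& fd_input, fastdeploy::FDTensor& fd_output,
             const std::string& in_name, const std::string& out_name) {
  // Input: share the FDTensor's CPU buffer with Paddle Inference (no copy).
  auto in_handle = predictor->GetInputHandle(in_name);
  fastdeploy::ShareTensorFromCpu(in_handle.get(), fd_input);

  predictor->Run();

  // Output: allocate a matching FDTensor and copy the data back out.
  auto out_handle = predictor->GetOutputHandle(out_name);
  fastdeploy::CopyTensorToCpu(out_handle, &fd_output);
}
```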
+ +#include "fastdeploy/backends/paddle/paddle_backend.h" + +namespace fastdeploy { +void ShareTensorFromCpu(paddle_infer::Tensor* tensor, FDTensor& fd_tensor) { + std::vector shape(fd_tensor.shape.begin(), fd_tensor.shape.end()); + if (fd_tensor.dtype == FDDataType::FP32) { + tensor->ShareExternalData(static_cast(fd_tensor.Data()), + shape, paddle_infer::PlaceType::kCPU); + return; + } else if (fd_tensor.dtype == FDDataType::INT32) { + tensor->ShareExternalData(static_cast(fd_tensor.Data()), + shape, paddle_infer::PlaceType::kCPU); + return; + } else if (fd_tensor.dtype == FDDataType::INT64) { + tensor->ShareExternalData(static_cast(fd_tensor.Data()), + shape, paddle_infer::PlaceType::kCPU); + return; + } + FDASSERT(false, "Unexpected data type(" + Str(fd_tensor.dtype) + + ") while infer with PaddleBackend."); +} + +void CopyTensorToCpu(std::unique_ptr& tensor, + FDTensor* fd_tensor) { + auto fd_dtype = PaddleDataTypeToFD(tensor->type()); + fd_tensor->Allocate(tensor->shape(), fd_dtype, tensor->name()); + if (fd_tensor->dtype == FDDataType::FP32) { + tensor->CopyToCpu(static_cast(fd_tensor->MutableData())); + return; + } else if (fd_tensor->dtype == FDDataType::INT32) { + tensor->CopyToCpu(static_cast(fd_tensor->MutableData())); + return; + } else if (fd_tensor->dtype == FDDataType::INT64) { + tensor->CopyToCpu(static_cast(fd_tensor->MutableData())); + return; + } + FDASSERT(false, "Unexpected data type(" + Str(fd_tensor->dtype) + + ") while infer with PaddleBackend."); +} + +FDDataType PaddleDataTypeToFD(const paddle_infer::DataType& dtype) { + auto fd_dtype = FDDataType::FP32; + if (dtype == paddle_infer::FLOAT32) { + fd_dtype = FDDataType::FP32; + } else if (dtype == paddle_infer::INT64) { + fd_dtype = FDDataType::INT64; + } else if (dtype == paddle_infer::INT32) { + fd_dtype = FDDataType::INT32; + } else if (dtype == paddle_infer::UINT8) { + fd_dtype = FDDataType::UINT8; + } else { + FDASSERT(false, "Unexpected data type:" + std::to_string(int(dtype)) + + " while call CopyTensorToCpu in PaddleBackend."); + } + return fd_dtype; +} + +} // namespace fastdeploy diff --git a/fastdeploy/core/config.h.in b/fastdeploy/core/config.h.in index 48854e1ffe..7713925867 100644 --- a/fastdeploy/core/config.h.in +++ b/fastdeploy/core/config.h.in @@ -29,6 +29,10 @@ #cmakedefine ENABLE_ORT_BACKEND #endif +#ifndef ENABLE_PADDLE_BACKEND +#cmakedefine ENABLE_PADDLE_BACKEND +#endif + #ifndef WITH_GPU #cmakedefine WITH_GPU #endif diff --git a/fastdeploy/fastdeploy_model.cc b/fastdeploy/fastdeploy_model.cc index f0a6fac711..97a5d9bc45 100644 --- a/fastdeploy/fastdeploy_model.cc +++ b/fastdeploy/fastdeploy_model.cc @@ -36,12 +36,21 @@ bool FastDeployModel::InitRuntime() { } else if (runtime_option.backend == Backend::TRT) { if (!IsBackendAvailable(Backend::TRT)) { FDERROR - << "Backend:TRT is not complied with current FastDeploy library." + << "Backend::TRT is not complied with current FastDeploy library." << std::endl; return false; } + } else if (runtime_option.backend == Backend::PDINFER) { + if (!IsBackendAvailable(Backend::PDINFER)) { + FDERROR << "Backend::PDINFER is not compiled with current FastDeploy " + "library." + << std::endl; + return false; + } } else { - FDERROR << "Only support Backend::ORT / Backend::TRT now." << std::endl; + FDERROR + << "Only support Backend::ORT / Backend::TRT / Backend::PDINFER now." 
+ << std::endl; return false; } runtime_ = new Runtime(); @@ -74,29 +83,19 @@ bool FastDeployModel::CreateCpuBackend() { return false; } - for (auto& b : valid_cpu_backends) { - if (b == Backend::ORT) { - if (!IsBackendAvailable(Backend::ORT)) { - FDERROR << "OrtBackend is not complied with current FastDeploy library." - << std::endl; - continue; - } - runtime_option.backend = b; - runtime_ = new Runtime(); - if (!runtime_->Init(runtime_option)) { - return false; - } - runtime_initialized_ = true; - return true; - } else { - FDERROR << "Only Backend::ORT as cpu backend is supported now." - << std::endl; + for (size_t i = 0; i < valid_cpu_backends.size(); ++i) { + if (!IsBackendAvailable(valid_cpu_backends[i])) { + continue; + } + runtime_option.backend = valid_cpu_backends[i]; + runtime_ = new Runtime(); + if (!runtime_->Init(runtime_option)) { return false; } + runtime_initialized_ = true; + return true; } - - FDERROR << "Cannot find an available cpu backend to load this model." - << std::endl; + FDERROR << "Found no valid backend for model: " << ModelName() << std::endl; return false; } @@ -107,40 +106,18 @@ bool FastDeployModel::CreateGpuBackend() { return false; } - for (auto& b : valid_gpu_backends) { - if (b == Backend::ORT) { - if (!IsBackendAvailable(Backend::ORT)) { - FDERROR << "OrtBackend is not complied with current FastDeploy library." - << std::endl; - continue; - } - runtime_option.backend = b; - runtime_ = new Runtime(); - if (!runtime_->Init(runtime_option)) { - return false; - } - runtime_initialized_ = true; - return true; - } else if (b == Backend::TRT) { - if (!IsBackendAvailable(Backend::TRT)) { - FDERROR << "TrtBackend is not complied with current FastDeploy library." - << std::endl; - continue; - } - runtime_option.backend = b; - runtime_ = new Runtime(); - if (!runtime_->Init(runtime_option)) { - return false; - } - return true; - } else { - FDERROR << "Only Backend::ORT / Backend::TRT as gpu backends are " - "supported now." - << std::endl; + for (size_t i = 0; i < valid_gpu_backends.size(); ++i) { + if (!IsBackendAvailable(valid_gpu_backends[i])) { + continue; + } + runtime_option.backend = valid_gpu_backends[i]; + runtime_ = new Runtime(); + if (!runtime_->Init(runtime_option)) { return false; } + runtime_initialized_ = true; + return true; } - FDERROR << "Cannot find an available gpu backend to load this model." 
<< std::endl;

   return false;
 }
@@ -164,4 +141,4 @@ void FastDeployModel::EnableDebug() {

 bool FastDeployModel::DebugEnabled() { return debug_; }

-} // namespace fastdeploy
+}  // namespace fastdeploy
diff --git a/fastdeploy/fastdeploy_runtime.cc b/fastdeploy/fastdeploy_runtime.cc
index 28f363d850..6ee9fb3a85 100644
--- a/fastdeploy/fastdeploy_runtime.cc
+++ b/fastdeploy/fastdeploy_runtime.cc
@@ -22,6 +22,10 @@
 #include "fastdeploy/backends/tensorrt/trt_backend.h"
 #endif

+#ifdef ENABLE_PADDLE_BACKEND
+#include "fastdeploy/backends/paddle/paddle_backend.h"
+#endif
+
 namespace fastdeploy {

 std::vector<Backend> GetAvailableBackends() {
@@ -31,6 +35,9 @@ std::vector<Backend> GetAvailableBackends() {
 #endif
 #ifdef ENABLE_TRT_BACKEND
   backends.push_back(Backend::TRT);
+#endif
+#ifdef ENABLE_PADDLE_BACKEND
+  backends.push_back(Backend::PDINFER);
 #endif
   return backends;
 }
@@ -45,6 +52,26 @@ bool IsBackendAvailable(const Backend& backend) {
   return false;
 }

+std::string Str(const Backend& b) {
+  if (b == Backend::ORT) {
+    return "Backend::ORT";
+  } else if (b == Backend::TRT) {
+    return "Backend::TRT";
+  } else if (b == Backend::PDINFER) {
+    return "Backend::PDINFER";
+  }
+  return "UNKNOWN-Backend";
+}
+
+std::string Str(const Frontend& f) {
+  if (f == Frontend::PADDLE) {
+    return "Frontend::PADDLE";
+  } else if (f == Frontend::ONNX) {
+    return "Frontend::ONNX";
+  }
+  return "UNKNOWN-Frontend";
+}
+
 bool ModelFormatCheck(const std::string& model_file,
                       const Frontend& model_format) {
   if (model_format == Frontend::PADDLE) {
@@ -74,14 +101,33 @@ bool ModelFormatCheck(const std::string& model_file,

 bool Runtime::Init(const RuntimeOption& _option) {
   option = _option;
+  if (option.backend == Backend::UNKNOWN) {
+    if (IsBackendAvailable(Backend::ORT)) {
+      option.backend = Backend::ORT;
+    } else if (IsBackendAvailable(Backend::PDINFER)) {
+      option.backend = Backend::PDINFER;
+    } else {
+      FDERROR << "Please define backend in RuntimeOption, currently it's "
+                 "Backend::UNKNOWN."
+              << std::endl;
+      return false;
+    }
+  }
   if (option.backend == Backend::ORT) {
-    FDASSERT(option.device == Device::CPU || option.device == Device::GPU, "Backend::TRT only supports Device::CPU/Device::GPU.");
+    FDASSERT(option.device == Device::CPU || option.device == Device::GPU,
+             "Backend::ORT only supports Device::CPU/Device::GPU.");
     CreateOrtBackend();
   } else if (option.backend == Backend::TRT) {
-    FDASSERT(option.device == Device::GPU, "Backend::TRT only supports Device::GPU.");
+    FDASSERT(option.device == Device::GPU,
+             "Backend::TRT only supports Device::GPU.");
     CreateTrtBackend();
+  } else if (option.backend == Backend::PDINFER) {
+    FDASSERT(option.device == Device::CPU || option.device == Device::GPU,
+             "Backend::PDINFER only supports Device::CPU/Device::GPU.");
+    CreatePaddleBackend();
   } else {
-    FDERROR << "Runtime only support Backend::ORT/Backend::TRT as backend now."
+    FDERROR << "Runtime only supports "
+               "Backend::ORT/Backend::TRT/Backend::PDINFER as backend now."
             << std::endl;
     return false;
   }
@@ -101,6 +147,27 @@ bool Runtime::Infer(std::vector<FDTensor>& input_tensors,
   return backend_->Infer(input_tensors, output_tensors);
 }

+void Runtime::CreatePaddleBackend() {
+#ifdef ENABLE_PADDLE_BACKEND
+  auto pd_option = PaddleBackendOption();
+  pd_option.enable_mkldnn = option.pd_enable_mkldnn;
+  pd_option.mkldnn_cache_size = option.pd_mkldnn_cache_size;
+  pd_option.use_gpu = (option.device == Device::GPU) ? true : false;
+  pd_option.gpu_id = option.device_id;
+  FDASSERT(option.model_format == Frontend::PADDLE,
+           "PaddleBackend only supports model format of Frontend::PADDLE.");
+  backend_ = new PaddleBackend();
+  auto casted_backend = dynamic_cast<PaddleBackend*>(backend_);
+  FDASSERT(casted_backend->InitFromPaddle(option.model_file, option.params_file,
+                                          pd_option),
+           "Load model from Paddle failed while initializing PaddleBackend.");
+#else
+  FDASSERT(false,
+           "PaddleBackend is not available, please compile with "
+           "ENABLE_PADDLE_BACKEND=ON.");
+#endif
+}
+
 void Runtime::CreateOrtBackend() {
 #ifdef ENABLE_ORT_BACKEND
   auto ort_option = OrtBackendOption();
@@ -125,8 +192,9 @@ void Runtime::CreateOrtBackend() {
              "Load model from Paddle failed while initliazing OrtBackend.");
   }
 #else
-  FDASSERT(false, "OrtBackend is not available, please compiled with "
-                  "ENABLE_ORT_BACKEND=ON.");
+  FDASSERT(false,
+           "OrtBackend is not available, please compile with "
+           "ENABLE_ORT_BACKEND=ON.");
 #endif
 }

@@ -158,8 +226,9 @@ void Runtime::CreateTrtBackend() {
              "Load model from Paddle failed while initliazing TrtBackend.");
   }
 #else
-  FDASSERT(false, "TrtBackend is not available, please compiled with "
-                  "ENABLE_TRT_BACKEND=ON.");
+  FDASSERT(false,
+           "TrtBackend is not available, please compile with "
+           "ENABLE_TRT_BACKEND=ON.");
 #endif
 }
-} // namespace fastdeploy
+}  // namespace fastdeploy
diff --git a/fastdeploy/fastdeploy_runtime.h b/fastdeploy/fastdeploy_runtime.h
index 877875a134..eb88746321 100644
--- a/fastdeploy/fastdeploy_runtime.h
+++ b/fastdeploy/fastdeploy_runtime.h
@@ -23,6 +23,8 @@ namespace fastdeploy {
 enum FASTDEPLOY_DECL Backend { UNKNOWN, ORT, TRT, PDINFER };
 enum FASTDEPLOY_DECL Frontend { PADDLE, ONNX };

+FASTDEPLOY_DECL std::string Str(const Backend& b);
+FASTDEPLOY_DECL std::string Str(const Frontend& f);
 FASTDEPLOY_DECL std::vector<Backend> GetAvailableBackends();

 FASTDEPLOY_DECL bool IsBackendAvailable(const Backend& backend);
@@ -31,7 +33,7 @@ bool ModelFormatCheck(const std::string& model_file,
                       const Frontend& model_format);

 struct FASTDEPLOY_DECL RuntimeOption {
-  Backend backend = Backend::ORT;
+  Backend backend = Backend::UNKNOWN;

   // for cpu inference and preprocess
   int cpu_thread_num = 8;
@@ -52,6 +54,10 @@ struct FASTDEPLOY_DECL RuntimeOption {
   // 0: ORT_SEQUENTIAL 1: ORT_PARALLEL
   int ort_execution_mode = -1;

+  // ======Only for Paddle Backend=====
+  bool pd_enable_mkldnn = true;
+  int pd_mkldnn_cache_size = 1;
+
   // ======Only for Trt Backend=======
   std::map<std::string, std::vector<int32_t>> trt_fixed_shape;
   std::map<std::string, std::vector<int32_t>> trt_max_shape;
@@ -79,6 +85,8 @@ struct FASTDEPLOY_DECL Runtime {

   void CreateOrtBackend();

+  void CreatePaddleBackend();
+
   void CreateTrtBackend();

   int NumInputs() { return backend_->NumInputs(); }
diff --git a/fastdeploy/vision/ppdet/ppyoloe.cc b/fastdeploy/vision/ppdet/ppyoloe.cc
index c215ecb0ca..ed8d1e46f8 100644
--- a/fastdeploy/vision/ppdet/ppyoloe.cc
+++ b/fastdeploy/vision/ppdet/ppyoloe.cc
@@ -11,8 +11,8 @@ PPYOLOE::PPYOLOE(const std::string& model_file, const std::string& params_file,
                  const RuntimeOption& custom_option, const Frontend& model_format) {
   config_file_ = config_file;
-  valid_cpu_backends = {Backend::ORT, Backend::PDINFER};
-  valid_gpu_backends = {Backend::ORT, Backend::PDINFER};
+  valid_cpu_backends = {Backend::PDINFER, Backend::ORT};
+  valid_gpu_backends = {Backend::PDINFER, Backend::ORT};
   runtime_option = custom_option;
   runtime_option.model_format = model_format;
   runtime_option.model_file = model_file;
@@ -22,12 +22,12 @@ bool
PPYOLOE::Initialize() { if (!BuildPreprocessPipelineFromConfig()) { - std::cout << "Failed to build preprocess pipeline from configuration file." + FDERROR << "Failed to build preprocess pipeline from configuration file." << std::endl; return false; } if (!InitRuntime()) { - std::cout << "Failed to initialize fastdeploy backend." << std::endl; + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; return false; } return true; @@ -39,13 +39,13 @@ bool PPYOLOE::BuildPreprocessPipelineFromConfig() { try { cfg = YAML::LoadFile(config_file_); } catch (YAML::BadFile& e) { - std::cout << "Failed to load yaml file " << config_file_ + FDERROR << "Failed to load yaml file " << config_file_ << ", maybe you should check this file." << std::endl; return false; } if (cfg["arch"].as() != "YOLO") { - std::cout << "Require the arch of model is YOLO, but arch defined in " + FDERROR << "Require the arch of model is YOLO, but arch defined in " "config file is " << cfg["arch"].as() << "." << std::endl; return false; @@ -76,7 +76,7 @@ bool PPYOLOE::BuildPreprocessPipelineFromConfig() { } else if (op_name == "Permute") { processors_.push_back(std::make_shared()); } else { - std::cout << "Unexcepted preprocess operator: " << op_name << "." + FDERROR << "Unexcepted preprocess operator: " << op_name << "." << std::endl; return false; } @@ -89,7 +89,7 @@ bool PPYOLOE::Preprocess(Mat* mat, std::vector* outputs) { int origin_h = mat->Height(); for (size_t i = 0; i < processors_.size(); ++i) { if (!(*(processors_[i].get()))(mat)) { - std::cout << "Failed to process image data in " << processors_[i]->Name() + FDERROR << "Failed to process image data in " << processors_[i]->Name() << "." << std::endl; return false; } diff --git a/model_zoo/vision/ppyoloe/cpp/ppyoloe.cc b/model_zoo/vision/ppyoloe/cpp/ppyoloe.cc index e63f29e62a..30765f075d 100644 --- a/model_zoo/vision/ppyoloe/cpp/ppyoloe.cc +++ b/model_zoo/vision/ppyoloe/cpp/ppyoloe.cc @@ -23,7 +23,11 @@ int main() { std::string img_path = "000000014439_640x640.jpg"; std::string vis_path = "vis.jpeg"; - auto model = vis::ppdet::PPYOLOE(model_file, params_file, config_file); + auto option = fastdeploy::RuntimeOption(); + option.device = fastdeploy::Device::CPU; + option.backend = fastdeploy::Backend::PDINFER; + auto model = + vis::ppdet::PPYOLOE(model_file, params_file, config_file, option); if (!model.Initialized()) { std::cerr << "Init Failed." 
<< std::endl; return -1; From 4b681581b1b43107c508f561e80b5be7efeeefe1 Mon Sep 17 00:00:00 2001 From: Jason Date: Sat, 23 Jul 2022 22:21:36 +0800 Subject: [PATCH 4/9] Add custom operator for onnxruntime and fix paddle backend (#35) Add custom operator for onnxruntime ans fix paddle backend --- ThirdPartyNotices.txt | 209 ++++++++++++++ external/paddle_inference.cmake | 7 +- external/utils.cmake | 13 + fastdeploy/backends/ort/ops/multiclass_nms.cc | 260 ++++++++++++++++++ fastdeploy/backends/ort/ops/multiclass_nms.h | 76 +++++ fastdeploy/backends/ort/ort_backend.cc | 21 +- fastdeploy/backends/ort/ort_backend.h | 9 +- fastdeploy/backends/paddle/util.cc | 1 + fastdeploy/fastdeploy_model.cc | 2 +- fastdeploy/fastdeploy_runtime.cc | 29 +- fastdeploy/fastdeploy_runtime.h | 9 +- setup.py | 90 +++--- 12 files changed, 666 insertions(+), 60 deletions(-) create mode 100644 fastdeploy/backends/ort/ops/multiclass_nms.cc create mode 100644 fastdeploy/backends/ort/ops/multiclass_nms.h diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 5842b9a717..fa9df0fbab 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -732,3 +732,212 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +--------- +7. https://github.com/oneapi-src/oneDNN/ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + ============================================================================ + + Copyright 2016-2021 Intel Corporation + Copyright 2018 YANDEX LLC + Copyright 2019-2021 FUJITSU LIMITED + Copyright 2020 Arm Limited and affiliates + Copyright 2020 Codeplay Software Limited + Copyright 2021 Alanna Tempest + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + This distribution includes third party software ("third party programs"). + This third party software, even if included with the distribution of + the Intel software, may be governed by separate license terms, including + without limitation, third party license terms, other Intel software license + terms, and open source software license terms. These separate license terms + govern your use of the third party programs as set forth in the + "THIRD-PARTY-PROGRAMS" file. 
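[Editor's note] Patch 4's headline change is an ONNX Runtime custom operator for Paddle's multiclass NMS. For orientation, this is the general shape of registering such an op with a session, a hedged sketch using the public onnxruntime C++ API; the domain name "Paddle" and the session setup are assumptions here, since the actual wiring lives in ort_backend.cc and is only partially shown in this diff:
```
#include "fastdeploy/backends/ort/ops/multiclass_nms.h"
#include "onnxruntime_cxx_api.h"  // NOLINT

// Hypothetical registration sketch, not the patch's exact wiring.
void CreateSessionWithCustomOps(Ort::Env& env, const char* model_path) {
  static fastdeploy::MultiClassNmsOp multiclass_nms_op;  // must outlive session
  Ort::CustomOpDomain domain("Paddle");  // assumed exporter domain name
  domain.Add(&multiclass_nms_op);

  Ort::SessionOptions session_options;
  session_options.Add(domain);  // register before constructing the Session
  Ort::Session session(env, model_path, session_options);
  // ... run inference with `session` ...
}
```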
diff --git a/external/paddle_inference.cmake b/external/paddle_inference.cmake index 41aa740f62..8894209f4a 100644 --- a/external/paddle_inference.cmake +++ b/external/paddle_inference.cmake @@ -83,12 +83,7 @@ ExternalProject_Add( BUILD_COMMAND "" UPDATE_COMMAND "" INSTALL_COMMAND - ${CMAKE_COMMAND} -E remove_directory ${PADDLEINFERENCE_INSTALL_DIR} && - ${CMAKE_COMMAND} -E make_directory ${PADDLEINFERENCE_INSTALL_DIR} && - ${CMAKE_COMMAND} -E rename ${PADDLEINFERENCE_SOURCE_DIR}/paddle/ - ${PADDLEINFERENCE_INSTALL_DIR}/paddle && ${CMAKE_COMMAND} -E rename - ${PADDLEINFERENCE_SOURCE_DIR}/third_party ${PADDLEINFERENCE_INSTALL_DIR}/third_party && - ${CMAKE_COMMAND} -E rename ${PADDLEINFERENCE_SOURCE_DIR}/version.txt ${PADDLEINFERENCE_INSTALL_DIR}/version.txt + ${CMAKE_COMMAND} -E copy_directory ${PADDLEINFERENCE_SOURCE_DIR} ${PADDLEINFERENCE_INSTALL_DIR} BUILD_BYPRODUCTS ${PADDLEINFERENCE_COMPILE_LIB}) add_library(external_paddle_inference STATIC IMPORTED GLOBAL) diff --git a/external/utils.cmake b/external/utils.cmake index 3e6d70b42d..f5d6972042 100644 --- a/external/utils.cmake +++ b/external/utils.cmake @@ -13,3 +13,16 @@ function(redefine_file_macro targetname) ) endforeach() endfunction() + +function(download_and_decompress url filename decompress_dir) + if(NOT EXISTS ${filename}) + message("Downloading file from ${url} ...") + file(DOWNLOAD ${url} "${filename}.tmp" SHOW_PROGRESS) + file(RENAME "${filename}.tmp" ${filename}) + endif() + if(NOT EXISTS ${decompress_dir}) + file(MAKE_DIRECTORY ${decompress_dir}) + endif() + message("Decompress file ${filename} ...") + execute_process(COMMAND ${CMAKE_COMMAND} -E tar -xf ${filename} WORKING_DIRECTORY ${decompress_dir}) +endfunction() diff --git a/fastdeploy/backends/ort/ops/multiclass_nms.cc b/fastdeploy/backends/ort/ops/multiclass_nms.cc new file mode 100644 index 0000000000..8c00dc7bed --- /dev/null +++ b/fastdeploy/backends/ort/ops/multiclass_nms.cc @@ -0,0 +1,260 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
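[Editor's note] The kernel that follows implements the standard per-class greedy NMS: sort candidate boxes by score, then keep a box only if its IoU with every already-kept box stays at or below the (adaptively decayed) threshold. A tiny worked check of the `JaccardOverlap` helper defined below; the numbers are illustrative, and since the helper is internal to this translation unit the check is conceptual rather than linkable as-is:
```
#include <cassert>
#include <cmath>

void IouSanityCheck() {
  // Two axis-aligned boxes, [xmin, ymin, xmax, ymax].
  const float a[4] = {0.f, 0.f, 2.f, 2.f};
  const float b[4] = {1.f, 1.f, 3.f, 3.f};
  // With normalized == true: intersection is 1 * 1 = 1, each area is 4,
  // so IoU = 1 / (4 + 4 - 1) = 1/7, roughly 0.143.
  float iou = fastdeploy::JaccardOverlap(a, b, true);
  assert(std::fabs(iou - 1.0f / 7.0f) < 1e-6f);
}
```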
+ +#include "fastdeploy/backends/ort/ops/multiclass_nms.h" +#include +#include "fastdeploy/core/fd_tensor.h" +#include "fastdeploy/utils/utils.h" + +namespace fastdeploy { + +struct OrtTensorDimensions : std::vector { + OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value) { + OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value); + std::vector::operator=(ort.GetTensorShape(info)); + ort.ReleaseTensorTypeAndShapeInfo(info); + } +}; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +void GetMaxScoreIndex(const float* scores, const int& score_size, + const float& threshold, const int& top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < score_size; ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +float BBoxArea(const float* box, const bool& normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return 0.f; + } else { + const float w = box[2] - box[0]; + const float h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +float JaccardOverlap(const float* box1, const float* box2, + const bool& normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return 0.f; + } else { + const float inter_xmin = std::max(box1[0], box2[0]); + const float inter_ymin = std::max(box1[1], box2[1]); + const float inter_xmax = std::min(box1[2], box2[2]); + const float inter_ymax = std::min(box1[3], box2[3]); + float norm = normalized ? 
0.0f : 1.0f; + float inter_w = inter_xmax - inter_xmin + norm; + float inter_h = inter_ymax - inter_ymin + norm; + const float inter_area = inter_w * inter_h; + const float bbox1_area = BBoxArea(box1, normalized); + const float bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +void MultiClassNmsKernel::FastNMS(const float* boxes, const float* scores, + const int& num_boxes, + std::vector* keep_indices) { + std::vector> sorted_indices; + GetMaxScoreIndex(scores, num_boxes, score_threshold, nms_top_k, + &sorted_indices); + + float adaptive_threshold = nms_threshold; + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < keep_indices->size(); ++k) { + if (!keep) { + break; + } + const int kept_idx = (*keep_indices)[k]; + float overlap = + JaccardOverlap(boxes + idx * 4, boxes + kept_idx * 4, normalized); + keep = overlap <= adaptive_threshold; + } + if (keep) { + keep_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && nms_eta<1.0 & adaptive_threshold> 0.5) { + adaptive_threshold *= nms_eta; + } + } +} + +int MultiClassNmsKernel::NMSForEachSample( + const float* boxes, const float* scores, int num_boxes, int num_classes, + std::map>* keep_indices) { + for (int i = 0; i < num_classes; ++i) { + if (i == background_label) { + continue; + } + const float* score_for_class_i = scores + i * num_boxes; + FastNMS(boxes, score_for_class_i, num_boxes, &((*keep_indices)[i])); + } + int num_det = 0; + for (auto iter = keep_indices->begin(); iter != keep_indices->end(); ++iter) { + num_det += iter->second.size(); + } + + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : *keep_indices) { + int label = it.first; + const float* current_score = scores + label * num_boxes; + auto& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + score_index_pairs.push_back( + std::make_pair(current_score[idx], std::make_pair(label, idx))); + } + } + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(*keep_indices); + num_det = keep_top_k; + } + return num_det; +} + +void MultiClassNmsKernel::Compute(OrtKernelContext* context) { + const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0); + const OrtValue* scores = ort_.KernelContext_GetInput(context, 1); + const float* boxes_data = + reinterpret_cast(ort_.GetTensorData(boxes)); + const float* scores_data = + reinterpret_cast(ort_.GetTensorData(scores)); + OrtTensorDimensions boxes_dim(ort_, boxes); + OrtTensorDimensions scores_dim(ort_, scores); + int score_size = scores_dim.size(); + + int64_t batch_size = scores_dim[0]; + int64_t box_dim = boxes_dim[2]; + int64_t out_dim = box_dim + 2; + + int num_nmsed_out = 0; + FDASSERT(score_size == 3, "Require rank of input scores be 3, but now it's " + + std::to_string(score_size) + "."); + FDASSERT(boxes_dim[2] == 4, + "Require the 3-dimension of input boxes be 4, but now it's " + + std::to_string(boxes_dim[2]) + "."); + std::vector out_num_rois_dims = {batch_size}; + OrtValue* out_num_rois = 
+      context, 2, out_num_rois_dims.data(), out_num_rois_dims.size());
+  int32_t* out_num_rois_data = ort_.GetTensorMutableData<int32_t>(out_num_rois);
+
+  std::vector<std::map<int, std::vector<int>>> all_indices;
+  for (size_t i = 0; i < batch_size; ++i) {
+    std::map<int, std::vector<int>> indices;  // indices kept for each class
+    const float* current_boxes_ptr =
+        boxes_data + i * boxes_dim[1] * boxes_dim[2];
+    const float* current_scores_ptr =
+        scores_data + i * scores_dim[1] * scores_dim[2];
+    int num = NMSForEachSample(current_boxes_ptr, current_scores_ptr,
+                               boxes_dim[1], scores_dim[1], &indices);
+    num_nmsed_out += num;
+    out_num_rois_data[i] = num;
+    all_indices.emplace_back(indices);
+  }
+  std::vector<int64_t> out_box_dims = {num_nmsed_out, 6};
+  std::vector<int64_t> out_index_dims = {num_nmsed_out, 1};
+  OrtValue* out_box = ort_.KernelContext_GetOutput(
+      context, 0, out_box_dims.data(), out_box_dims.size());
+  OrtValue* out_index = ort_.KernelContext_GetOutput(
+      context, 1, out_index_dims.data(), out_index_dims.size());
+  if (num_nmsed_out == 0) {
+    int32_t* out_num_rois_data =
+        ort_.GetTensorMutableData<int32_t>(out_num_rois);
+    for (size_t i = 0; i < batch_size; ++i) {
+      out_num_rois_data[i] = 0;
+    }
+    return;
+  }
+  float* out_box_data = ort_.GetTensorMutableData<float>(out_box);
+  int32_t* out_index_data = ort_.GetTensorMutableData<int32_t>(out_index);
+
+  int count = 0;
+  for (size_t i = 0; i < batch_size; ++i) {
+    const float* current_boxes_ptr =
+        boxes_data + i * boxes_dim[1] * boxes_dim[2];
+    const float* current_scores_ptr =
+        scores_data + i * scores_dim[1] * scores_dim[2];
+    for (const auto& it : all_indices[i]) {
+      int label = it.first;
+      const auto& indices = it.second;
+      const float* current_scores_class_ptr =
+          current_scores_ptr + label * scores_dim[2];
+      for (size_t j = 0; j < indices.size(); ++j) {
+        int start = count * 6;
+        out_box_data[start] = label;
+        out_box_data[start + 1] = current_scores_class_ptr[indices[j]];
+        out_box_data[start + 2] = current_boxes_ptr[indices[j] * 4];
+        out_box_data[start + 3] = current_boxes_ptr[indices[j] * 4 + 1];
+        out_box_data[start + 4] = current_boxes_ptr[indices[j] * 4 + 2];
+        out_box_data[start + 5] = current_boxes_ptr[indices[j] * 4 + 3];
+        out_index_data[count] = i * boxes_dim[1] + indices[j];
+        count += 1;
+      }
+    }
+  }
+}
+
+void MultiClassNmsKernel::GetAttribute(const OrtKernelInfo* info) {
+  background_label =
+      ort_.KernelInfoGetAttribute<int64_t>(info, "background_label");
+  keep_top_k = ort_.KernelInfoGetAttribute<int64_t>(info, "keep_top_k");
+  nms_eta = ort_.KernelInfoGetAttribute<float>(info, "nms_eta");
+  nms_threshold = ort_.KernelInfoGetAttribute<float>(info, "nms_threshold");
+  nms_top_k = ort_.KernelInfoGetAttribute<int64_t>(info, "nms_top_k");
+  normalized = ort_.KernelInfoGetAttribute<int64_t>(info, "normalized");
+  score_threshold = ort_.KernelInfoGetAttribute<float>(info, "score_threshold");
+  std::cout << background_label << " " << keep_top_k << " " << nms_eta << " "
+            << nms_threshold << " " << nms_top_k << " " << normalized << " "
+            << score_threshold << " " << std::endl;
+}
+}  // namespace fastdeploy
diff --git a/fastdeploy/backends/ort/ops/multiclass_nms.h b/fastdeploy/backends/ort/ops/multiclass_nms.h
new file mode 100644
index 0000000000..78f9a22557
--- /dev/null
+++ b/fastdeploy/backends/ort/ops/multiclass_nms.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include "onnxruntime_cxx_api.h"  // NOLINT
+
+namespace fastdeploy {
+
+struct MultiClassNmsKernel {
+ protected:
+  int64_t background_label = -1;
+  int64_t keep_top_k = -1;
+  float nms_eta;
+  float nms_threshold = 0.7;
+  int64_t nms_top_k;
+  bool normalized;
+  float score_threshold;
+  Ort::CustomOpApi ort_;
+
+ public:
+  MultiClassNmsKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
+      : ort_(ort) {
+    GetAttribute(info);
+  }
+
+  void GetAttribute(const OrtKernelInfo* info);
+
+  void Compute(OrtKernelContext* context);
+  void FastNMS(const float* boxes, const float* scores, const int& num_boxes,
+               std::vector<int>* keep_indices);
+  int NMSForEachSample(const float* boxes, const float* scores, int num_boxes,
+                       int num_classes,
+                       std::map<int, std::vector<int>>* keep_indices);
+};
+
+struct MultiClassNmsOp
+    : Ort::CustomOpBase<MultiClassNmsOp, MultiClassNmsKernel> {
+  void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
+    return new MultiClassNmsKernel(api, info);
+  }
+
+  const char* GetName() const { return "MultiClassNMS"; }
+
+  size_t GetInputTypeCount() const { return 2; }
+
+  ONNXTensorElementDataType GetInputType(size_t index) const {
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  }
+
+  size_t GetOutputTypeCount() const { return 3; }
+
+  ONNXTensorElementDataType GetOutputType(size_t index) const {
+    if (index == 0) {
+      return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+    }
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32;
+  }
+
+  const char* GetExecutionProviderType() const {
+    return "CPUExecutionProvider";
+  }
+};
+
+}  // namespace fastdeploy
diff --git a/fastdeploy/backends/ort/ort_backend.cc b/fastdeploy/backends/ort/ort_backend.cc
index 7060b758c1..909b5f2875 100644
--- a/fastdeploy/backends/ort/ort_backend.cc
+++ b/fastdeploy/backends/ort/ort_backend.cc
@@ -13,15 +13,19 @@
 // limitations under the License.
#include "fastdeploy/backends/ort/ort_backend.h" +#include +#include "fastdeploy/backends/ort/ops/multiclass_nms.h" #include "fastdeploy/backends/ort/utils.h" #include "fastdeploy/utils/utils.h" -#include #ifdef ENABLE_PADDLE_FRONTEND #include "paddle2onnx/converter.h" #endif namespace fastdeploy { +std::vector OrtBackend::custom_operators_ = + std::vector(); + ONNXTensorElementDataType GetOrtDtype(FDDataType fd_dtype) { if (fd_dtype == FDDataType::FP32) { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; @@ -131,7 +135,9 @@ bool OrtBackend::InitFromOnnx(const std::string& model_file, << std::endl; return false; } + BuildOption(option); + InitCustomOperators(); if (from_memory_buffer) { session_ = {env_, model_file.data(), model_file.size(), session_options_}; } else { @@ -275,4 +281,15 @@ TensorInfo OrtBackend::GetOutputInfo(int index) { return info; } -} // namespace fastdeploy +void OrtBackend::InitCustomOperators() { + if (custom_operators_.size() == 0) { + MultiClassNmsOp* custom_op = new MultiClassNmsOp{}; + custom_operators_.push_back(custom_op); + } + for (size_t i = 0; i < custom_operators_.size(); ++i) { + custom_op_domain_.Add(custom_operators_[i]); + } + session_options_.Add(custom_op_domain_); +} + +} // namespace fastdeploy diff --git a/fastdeploy/backends/ort/ort_backend.h b/fastdeploy/backends/ort/ort_backend.h index 3200c29352..8556763e0b 100644 --- a/fastdeploy/backends/ort/ort_backend.h +++ b/fastdeploy/backends/ort/ort_backend.h @@ -20,7 +20,7 @@ #include #include "fastdeploy/backends/backend.h" -#include "onnxruntime_cxx_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT namespace fastdeploy { @@ -68,6 +68,8 @@ class OrtBackend : public BaseBackend { TensorInfo GetInputInfo(int index); TensorInfo GetOutputInfo(int index); + static std::vector custom_operators_; + void InitCustomOperators(); private: Ort::Env env_; @@ -76,9 +78,8 @@ class OrtBackend : public BaseBackend { std::shared_ptr binding_; std::vector inputs_desc_; std::vector outputs_desc_; - + Ort::CustomOpDomain custom_op_domain_ = Ort::CustomOpDomain("Paddle"); OrtBackendOption option_; - void CopyToCpu(const Ort::Value& value, FDTensor* tensor); }; -} // namespace fastdeploy +} // namespace fastdeploy diff --git a/fastdeploy/backends/paddle/util.cc b/fastdeploy/backends/paddle/util.cc index 2469596aed..b2df989d4a 100644 --- a/fastdeploy/backends/paddle/util.cc +++ b/fastdeploy/backends/paddle/util.cc @@ -17,6 +17,7 @@ namespace fastdeploy { void ShareTensorFromCpu(paddle_infer::Tensor* tensor, FDTensor& fd_tensor) { std::vector shape(fd_tensor.shape.begin(), fd_tensor.shape.end()); + tensor->Reshape(shape); if (fd_tensor.dtype == FDDataType::FP32) { tensor->ShareExternalData(static_cast(fd_tensor.Data()), shape, paddle_infer::PlaceType::kCPU); diff --git a/fastdeploy/fastdeploy_model.cc b/fastdeploy/fastdeploy_model.cc index 97a5d9bc45..c61eea7cb6 100644 --- a/fastdeploy/fastdeploy_model.cc +++ b/fastdeploy/fastdeploy_model.cc @@ -18,7 +18,7 @@ namespace fastdeploy { bool FastDeployModel::InitRuntime() { FDASSERT( - ModelFormatCheck(runtime_option.model_file, runtime_option.model_format), + CheckModelFormat(runtime_option.model_file, runtime_option.model_format), "ModelFormatCheck Failed."); if (runtime_initialized_) { FDERROR << "The model is already initialized, cannot be initliazed again." 
diff --git a/fastdeploy/fastdeploy_runtime.cc b/fastdeploy/fastdeploy_runtime.cc
index 6ee9fb3a85..e353c64167 100644
--- a/fastdeploy/fastdeploy_runtime.cc
+++ b/fastdeploy/fastdeploy_runtime.cc
@@ -72,7 +72,7 @@ std::string Str(const Frontend& f) {
   return "UNKNOWN-Frontend";
 }
 
-bool ModelFormatCheck(const std::string& model_file,
+bool CheckModelFormat(const std::string& model_file,
                       const Frontend& model_format) {
   if (model_format == Frontend::PADDLE) {
     if (model_file.size() < 8 ||
@@ -99,8 +99,28 @@ bool ModelFormatCheck(const std::string& model_file,
   return true;
 }
 
+Frontend GuessModelFormat(const std::string& model_file) {
+  if (model_file.size() > 8 &&
+      model_file.substr(model_file.size() - 8, 8) == ".pdmodel") {
+    FDLogger() << "Model Format: PaddlePaddle." << std::endl;
+    return Frontend::PADDLE;
+  } else if (model_file.size() > 5 &&
+             model_file.substr(model_file.size() - 5, 5) == ".onnx") {
+    FDLogger() << "Model Format: ONNX." << std::endl;
+    return Frontend::ONNX;
+  }
+
+  FDERROR << "Cannot guess which model format you are using, please set "
+             "RuntimeOption::model_format manually."
+          << std::endl;
+  return Frontend::PADDLE;
+}
+
 bool Runtime::Init(const RuntimeOption& _option) {
   option = _option;
+  if (option.model_format == Frontend::AUTOREC) {
+    option.model_format = GuessModelFormat(_option.model_file);
+  }
   if (option.backend == Backend::UNKNOWN) {
     if (IsBackendAvailable(Backend::ORT)) {
       option.backend = Backend::ORT;
@@ -124,6 +144,9 @@ bool Runtime::Init(const RuntimeOption& _option) {
   } else if (option.backend == Backend::PDINFER) {
     FDASSERT(option.device == Device::CPU || option.device == Device::GPU,
              "Backend::TRT only supports Device::CPU/Device::GPU.");
+    FDASSERT(
+        option.model_format == Frontend::PADDLE,
+        "Backend::PDINFER only supports model format of Frontend::PADDLE.");
     CreatePaddleBackend();
   } else {
     FDERROR << "Runtime only support "
@@ -163,8 +186,8 @@ void Runtime::CreatePaddleBackend() {
            "Load model from Paddle failed while initliazing PaddleBackend.");
 #else
   FDASSERT(false,
-           "OrtBackend is not available, please compiled with "
-           "ENABLE_ORT_BACKEND=ON.");
+           "PaddleBackend is not available, please compile with "
+           "ENABLE_PADDLE_BACKEND=ON.");
 #endif
 }
 
diff --git a/fastdeploy/fastdeploy_runtime.h b/fastdeploy/fastdeploy_runtime.h
index eb88746321..7ec08e9d9d 100644
--- a/fastdeploy/fastdeploy_runtime.h
+++ b/fastdeploy/fastdeploy_runtime.h
@@ -21,7 +21,9 @@ namespace fastdeploy {
 
 enum FASTDEPLOY_DECL Backend { UNKNOWN, ORT, TRT, PDINFER };
-enum FASTDEPLOY_DECL Frontend { PADDLE, ONNX };
+// AUTOREC will infer the model format (Frontend)
+// from the name of the model file
+enum FASTDEPLOY_DECL Frontend { AUTOREC, PADDLE, ONNX };
 
 FASTDEPLOY_DECL std::string Str(const Backend& b);
 FASTDEPLOY_DECL std::string Str(const Frontend& f);
@@ -29,8 +31,9 @@ FASTDEPLOY_DECL std::vector<Backend> GetAvailableBackends();
 
 FASTDEPLOY_DECL bool IsBackendAvailable(const Backend& backend);
 
-bool ModelFormatCheck(const std::string& model_file,
+bool CheckModelFormat(const std::string& model_file,
                       const Frontend& model_format);
+Frontend GuessModelFormat(const std::string& model_file);
 
 struct FASTDEPLOY_DECL RuntimeOption {
   Backend backend = Backend::UNKNOWN;
@@ -71,7 +74,7 @@ struct FASTDEPLOY_DECL RuntimeOption {
   std::string model_file = "";   // Path of model file
   std::string params_file = "";  // Path of parameters file, can be empty
-  Frontend model_format = Frontend::PADDLE;   // format of input model
+  Frontend model_format = Frontend::AUTOREC;  // format of input model
 };
 
 struct FASTDEPLOY_DECL Runtime {
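
With AUTOREC as the new default, callers no longer have to set the format explicitly. A short usage sketch of the behavior added above (the file names are placeholders, not part of this patch):

    // GuessModelFormat() resolves Frontend::AUTOREC from the file suffix:
    // ".pdmodel" selects Frontend::PADDLE, ".onnx" selects Frontend::ONNX.
    fastdeploy::RuntimeOption option;
    option.model_file = "model.pdmodel";      // placeholder path
    option.params_file = "model.pdiparams";   // placeholder path
    // option.model_format stays Frontend::AUTOREC here; Runtime::Init() will
    // rewrite it to Frontend::PADDLE before choosing a backend.
    fastdeploy::Runtime runtime;
    runtime.Init(option);
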
diff --git a/setup.py b/setup.py
index e76f057b1c..19c47ed9cc 100644
--- a/setup.py
+++ b/setup.py
@@ -126,6 +126,15 @@ def finalize_options(self):
         pass
 
 
+def GetAllFiles(dirname):
+    files = list()
+    for root, dirs, filenames in os.walk(dirname):
+        for f in filenames:
+            fullname = os.path.join(root, f)
+            files.append(fullname)
+    return files
+
+
 class create_version(ONNXCommand):
     def run(self):
         with open(os.path.join(SRC_DIR, 'version.py'), 'w') as f:
@@ -326,50 +335,49 @@ def run(self):
         shutil.copy("LICENSE", "fastdeploy")
 
         depend_libs = list()
-        if platform.system().lower() == "linux":
-            for f in os.listdir(".setuptools-cmake-build"):
-                full_name = os.path.join(".setuptools-cmake-build", f)
-                if not os.path.isfile(full_name):
-                    continue
-                if not full_name.count("fastdeploy_main.cpython-"):
-                    continue
-                if not full_name.endswith(".so"):
-                    continue
-                # modify the search path of libraries
-                command = "patchelf --set-rpath '$ORIGIN/libs/' {}".format(
-                    full_name)
-                # The sw_64 not suppot patchelf, so we just disable that.
-                if platform.machine() != 'sw_64' and platform.machine(
-                ) != 'mips64':
-                    assert os.system(
-                        command
-                    ) == 0, "patch fastdeploy_main.cpython-36m-x86_64-linux-gnu.so failed, the command: {}".format(
-                        command)
-
+        # copy fastdeploy library
+        pybind_so_file = None
         for f in os.listdir(".setuptools-cmake-build"):
             if not os.path.isfile(os.path.join(".setuptools-cmake-build", f)):
                 continue
-            if f.count("libfastdeploy") > 0:
+            if f.count("fastdeploy") > 0:
                 shutil.copy(
                     os.path.join(".setuptools-cmake-build", f),
                     "fastdeploy/libs")
-        for dirname in os.listdir(".setuptools-cmake-build/third_libs/install"):
-            for lib in os.listdir(
-                    os.path.join(".setuptools-cmake-build/third_libs/install",
-                                 dirname, "lib")):
-                if lib.count(".so") == 0 and lib.count(
-                        ".dylib") == 0 and lib.count(".a") == 0:
-                    continue
-                if not os.path.isfile(
-                        os.path.join(".setuptools-cmake-build/third_libs/install",
-                                     dirname, "lib", lib)):
-                    continue
-                shutil.copy(
-                    os.path.join(".setuptools-cmake-build/third_libs/install",
-                                 dirname, "lib", lib), "fastdeploy/libs")
+            if f.count("fastdeploy_main.cpython-"):
+                pybind_so_file = f
 
-        all_libs = os.listdir("fastdeploy/libs")
-        for lib in all_libs:
-            package_data[PACKAGE_NAME].append(os.path.join("libs", lib))
+        if not os.path.exists(".setuptools-cmake-build/third_libs/install"):
+            raise Exception(
+                "Cannot find directory third_libs/install in .setuptools-cmake-build."
+            )
+
+        if os.path.exists("fastdeploy/libs/third_libs"):
+            shutil.rmtree("fastdeploy/libs/third_libs")
+        shutil.copytree(
+            ".setuptools-cmake-build/third_libs/install",
+            "fastdeploy/libs/third_libs",
+            symlinks=True)
+
+        all_files = GetAllFiles("fastdeploy/libs")
+        for f in all_files:
+            package_data[PACKAGE_NAME].append(os.path.relpath(f, "fastdeploy"))
+
+        if platform.system().lower() == "linux":
+            rpaths = ["${ORIGIN}"]
+            for root, dirs, files in os.walk("fastdeploy/libs/third_libs"):
+                for d in dirs:
+                    if d == "lib":
+                        path = os.path.relpath(
+                            os.path.join(root, d), "fastdeploy/libs")
+                        rpaths.append("${ORIGIN}/" + format(path))
+            rpaths = ":".join(rpaths)
+            command = "patchelf --set-rpath '{}' ".format(rpaths) + os.path.join(
+                "fastdeploy/libs", pybind_so_file)
+            # The sw_64 does not support patchelf, so we just disable that.
+            if platform.machine() != 'sw_64' and platform.machine() != 'mips64':
+                assert os.system(
+                    command) == 0, "patchelf {} failed, the command: {}".format(
+                        command, pybind_so_file)
 
 setuptools.setup(
     name=PACKAGE_NAME,
@@ -382,9 +390,9 @@ def run(self):
     include_package_data=True,
     setup_requires=setup_requires,
     extras_require=extras_require,
-    author='paddle-infer',
-    author_email='paddle-infer@baidu.com',
-    url='https://github.com/PaddlePaddle/Paddle2ONNX.git',
+    author='fastdeploy',
+    author_email='fastdeploy@baidu.com',
+    url='https://github.com/PaddlePaddle/FastDeploy.git',
     install_requires=REQUIRED_PACKAGES,
     classifiers=[
         "Programming Language :: Python :: 3",

From 279c993483e8ff924e10688ce85de663aaa97739 Mon Sep 17 00:00:00 2001
From: Jason
Date: Mon, 25 Jul 2022 08:59:53 +0800
Subject: [PATCH 5/9] Polish cmake files and runtime apis (#36)

* Add custom operator for onnxruntime and fix paddle backend
* Polish cmake files and runtime apis
* Remove copy libraries
* fix some issue
* fix bug
* fix bug
---
 CMakeLists.txt                              | 111 ++++++++++---------
 FastDeploy.cmake.in                         |  21 ++--
 copy_directory.py                           |  32 ++++++
 fastdeploy/backends/ort/ort_backend.cc      |   8 +-
 fastdeploy/backends/tensorrt/trt_backend.cc | 111 ++++++++++-------
 fastdeploy/backends/tensorrt/trt_backend.h  |   1 -
 fastdeploy/fastdeploy_model.cc              |   6 +-
 fastdeploy/fastdeploy_runtime.cc            | 114 ++++++++++++++++++--
 fastdeploy/fastdeploy_runtime.h             |  53 ++++++++-
 fastdeploy/fastdeploy_runtime.py            |  24 +++--
 fastdeploy/pybind/fastdeploy_runtime.cc     |  83 ++++++++------
 fastdeploy/utils/utils.cc                   |   2 +-
 fastdeploy/utils/utils.h                    |  14 ++-
 fastdeploy/vision/common/processors/cast.cc |  14 ++-
 model_zoo/vision/ppyoloe/cpp/CMakeLists.txt |   2 +-
 model_zoo/vision/ppyoloe/cpp/ppyoloe.cc     |   6 +-
 model_zoo/vision/yolov5/cpp/CMakeLists.txt  |   3 +-
 model_zoo/vision/yolox/cpp/CMakeLists.txt   |   2 +-
 model_zoo/vision/yolox/yolox.py             |   3 +-
 setup.py                                    |  29 ++---
 20 files changed, 446 insertions(+), 193 deletions(-)
 create mode 100644 copy_directory.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fa2d421a19..141c2d1c57 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,8 +15,20 @@
 PROJECT(fastdeploy C CXX)
 CMAKE_MINIMUM_REQUIRED (VERSION 3.16)
 
+option(CSRCS_DIR_NAME "Name of source code directory")
+option(LIBRARY_NAME "Name of the built library")
+option(PY_LIBRARY_NAME "Name of the built python library")
+if(NOT CSRCS_DIR_NAME)
+  set(CSRCS_DIR_NAME "./")
+endif()
+if(NOT LIBRARY_NAME)
+  set(LIBRARY_NAME "fastdeploy")
+endif()
+if(NOT PY_LIBRARY_NAME)
+  set(PY_LIBRARY_NAME "fastdeploy_main")
+endif()
 include(ExternalProject)
-add_subdirectory(fastdeploy)
+add_subdirectory(${CSRCS_DIR_NAME}/fastdeploy)
 include(external/utils.cmake)
 
 # Set C++11 as standard for the whole project
@@ -51,7 +63,8 @@ endif()
 
 option(BUILD_FASTDEPLOY_PYTHON "if build python lib for fastdeploy."
OFF) -include_directories(${PROJECT_SOURCE_DIR}) +set(HEAD_DIR "${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}") +include_directories(${HEAD_DIR}) include_directories(${CMAKE_CURRENT_BINARY_DIR}) if (WITH_VISION_EXAMPLES AND EXISTS ${PROJECT_SOURCE_DIR}/examples) @@ -62,12 +75,12 @@ if (WITH_VISION_EXAMPLES AND EXISTS ${PROJECT_SOURCE_DIR}/examples) endif() add_definitions(-DFASTDEPLOY_LIB) -file(GLOB_RECURSE ALL_DEPLOY_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/*.cc) -file(GLOB_RECURSE DEPLOY_ORT_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/backends/ort/*.cc) -file(GLOB_RECURSE DEPLOY_PADDLE_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/backends/paddle/*.cc) -file(GLOB_RECURSE DEPLOY_TRT_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/backends/tensorrt/*.cc ${PROJECT_SOURCE_DIR}/fastdeploy/backends/tensorrt/*.cpp) -file(GLOB_RECURSE DEPLOY_VISION_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/vision/*.cc) -file(GLOB_RECURSE DEPLOY_PYBIND_SRCS ${PROJECT_SOURCE_DIR}/fastdeploy/pybind/*.cc ${PROJECT_SOURCE_DIR}/fastdeploy/*_pybind.cc) +file(GLOB_RECURSE ALL_DEPLOY_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/*.cc) +file(GLOB_RECURSE DEPLOY_ORT_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/ort/*.cc) +file(GLOB_RECURSE DEPLOY_PADDLE_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/paddle/*.cc) +file(GLOB_RECURSE DEPLOY_TRT_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/tensorrt/*.cc ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/tensorrt/*.cpp) +file(GLOB_RECURSE DEPLOY_VISION_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/vision/*.cc) +file(GLOB_RECURSE DEPLOY_PYBIND_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/pybind/*.cc ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/*_pybind.cc) list(REMOVE_ITEM ALL_DEPLOY_SRCS ${DEPLOY_ORT_SRCS} ${DEPLOY_PADDLE_SRCS} ${DEPLOY_TRT_SRCS} ${DEPLOY_VISION_SRCS}) set(DEPEND_LIBS "") @@ -117,7 +130,7 @@ if(ENABLE_TRT_BACKEND) endif() add_definitions(-DENABLE_TRT_BACKEND) include_directories(${TRT_DIRECTORY}/include) - include_directories(${PROJECT_SOURCE_DIR}/fastdeploy/backends/tensorrt/common) + include_directories(${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/tensorrt/common) list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_TRT_SRCS}) find_library(TRT_INFER_LIB nvinfer ${TRT_DIRECTORY}/lib) find_library(TRT_ONNX_LIB nvonnxparser ${TRT_DIRECTORY}/lib) @@ -125,12 +138,16 @@ if(ENABLE_TRT_BACKEND) find_library(TRT_PLUGIN_LIB nvinfer_plugin ${TRT_DIRECTORY}/lib) list(APPEND DEPEND_LIBS ${TRT_INFER_LIB} ${TRT_ONNX_LIB} ${TRT_CAFFE_LIB} ${TRT_PLUGIN_LIB}) - # copy tensorrt libraries to third lib -# if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt") -# file(REMOVE_RECURSE "${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt/lib") -# endif() -# file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt/lib") -# file(COPY ${TRT_INFER_LIB} ${TRT_ONNX_LIB} ${TRT_CAFFE_LIB} ${TRT_PLUGIN_LIB} DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt/lib" FOLLOW_SYMLINK_CHAIN) + if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt") + file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt") + endif() + if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt/lib") + file(REMOVE_RECURSE "${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt/lib") + endif() + find_package(Python COMPONENTS Interpreter Development REQUIRED) + message(STATUS "Copying ${TRT_DIRECTORY}/lib to 
${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt/lib ...") + execute_process(COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/copy_directory.py ${TRT_DIRECTORY}/lib ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install/tensorrt/lib) + endif() if(ENABLE_VISION) @@ -157,37 +174,37 @@ else() endif() endif() -configure_file(${PROJECT_SOURCE_DIR}/fastdeploy/core/config.h.in ${PROJECT_SOURCE_DIR}/fastdeploy/core/config.h) +configure_file(${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/core/config.h.in ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/core/config.h) configure_file(${PROJECT_SOURCE_DIR}/FastDeploy.cmake.in ${PROJECT_SOURCE_DIR}/FastDeploy.cmake @ONLY) list(REMOVE_ITEM ALL_DEPLOY_SRCS ${DEPLOY_PYBIND_SRCS}) -add_library(fastdeploy SHARED ${ALL_DEPLOY_SRCS}) -redefine_file_macro(fastdeploy) -set_target_properties(fastdeploy PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") +add_library(${LIBRARY_NAME} SHARED ${ALL_DEPLOY_SRCS}) +redefine_file_macro(${LIBRARY_NAME}) +set_target_properties(${LIBRARY_NAME} PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") if(NOT APPLE) - set_target_properties(fastdeploy PROPERTIES LINK_FLAGS "-Wl,--start-group,--exclude-libs,ALL") + set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS "-Wl,--start-group,--exclude-libs,ALL") endif() -set_target_properties(fastdeploy PROPERTIES LINK_FLAGS_RELEASE -s) +set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS_RELEASE -s) file(READ "${PROJECT_SOURCE_DIR}/VERSION_NUMBER" FASTDEPLOY_VERSION) string(STRIP "${FASTDEPLOY_VERSION}" FASTDEPLOY_VERSION) if (APPLE) # set_target_properties(fastdeploy PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") - set_target_properties(fastdeploy PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") + set_target_properties(${LIBRARY_NAME} PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") elseif(MSVC) else() - set_target_properties(fastdeploy PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") - set_target_properties(fastdeploy PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL") - set_target_properties(fastdeploy PROPERTIES LINK_FLAGS_RELEASE -s) + set_target_properties(${LIBRARY_NAME} PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") + set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL") + set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS_RELEASE -s) endif() find_package(OpenMP) if(OpenMP_CXX_FOUND) list(APPEND DEPEND_LIBS OpenMP::OpenMP_CXX) endif() -set_target_properties(fastdeploy PROPERTIES VERSION ${FASTDEPLOY_VERSION}) -target_link_libraries(fastdeploy ${DEPEND_LIBS}) +set_target_properties(${LIBRARY_NAME} PROPERTIES VERSION ${FASTDEPLOY_VERSION}) +target_link_libraries(${LIBRARY_NAME} ${DEPEND_LIBS}) # add examples after prepare include paths for third-parties if (WITH_VISION_EXAMPLES AND EXISTS ${PROJECT_SOURCE_DIR}/examples) @@ -200,15 +217,15 @@ include(external/summary.cmake) fastdeploy_summary() install( - TARGETS fastdeploy + TARGETS ${LIBRARY_NAME} LIBRARY DESTINATION lib ) install( - DIRECTORY ${PROJECT_SOURCE_DIR}/fastdeploy + DIRECTORY ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h" - PATTERN "${PROJECT_SOURCE_DIR}/fastdeploy/backends/*/*.h" + PATTERN "${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/backends/*/*.h" ) install( DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/third_libs/install @@ -243,40 +260,34 @@ if(BUILD_FASTDEPLOY_PYTHON) endif() if(NOT ENABLE_VISION) - file(GLOB_RECURSE VISION_PYBIND_SRCS 
${PROJECT_SOURCE_DIR}/fastdeploy/vision/*_pybind.cc)
+    file(GLOB_RECURSE VISION_PYBIND_SRCS ${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}/fastdeploy/vision/*_pybind.cc)
     list(REMOVE_ITEM DEPLOY_PYBIND_SRCS ${VISION_PYBIND_SRCS})
   endif()
 
-  add_library(fastdeploy_main MODULE ${DEPLOY_PYBIND_SRCS})
-  redefine_file_macro(fastdeploy_main)
-  set_target_properties(fastdeploy_main PROPERTIES PREFIX "")
-  set_target_properties(fastdeploy_main
+  add_library(${PY_LIBRARY_NAME} MODULE ${DEPLOY_PYBIND_SRCS})
+  redefine_file_macro(${PY_LIBRARY_NAME})
+  set_target_properties(${PY_LIBRARY_NAME} PROPERTIES PREFIX "")
+  set_target_properties(${PY_LIBRARY_NAME}
                         PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
-  set_target_properties(fastdeploy_main PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
-  set_target_properties(fastdeploy_main
+  set_target_properties(${PY_LIBRARY_NAME} PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
+  set_target_properties(${PY_LIBRARY_NAME}
                         PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
-  target_include_directories(fastdeploy_main PRIVATE
+  target_include_directories(${PY_LIBRARY_NAME} PRIVATE
                              $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>
                              $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
                              ${PYTHON_INCLUDE_DIR})
-  target_include_directories(fastdeploy_main PUBLIC ${PROJECT_SOURCE_DIR}/third_party/pybind11/include)
+  target_include_directories(${PY_LIBRARY_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/third_party/pybind11/include)
 
   if(APPLE)
-    set_target_properties(fastdeploy_main
+    set_target_properties(${PY_LIBRARY_NAME}
                           PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
   endif()
 
-  if(APPLE)
-    target_link_libraries(fastdeploy_main PUBLIC fastdeploy)
-  elseif(WIN32)
-    target_link_libraries(fastdeploy_main PUBLIC fastdeploy)
-  else()
-    target_link_libraries(fastdeploy_main PUBLIC fastdeploy)
-  endif()
+  target_link_libraries(${PY_LIBRARY_NAME} PUBLIC ${LIBRARY_NAME})
 
   if(MSVC)
-    target_link_libraries(fastdeploy_main PRIVATE ${PYTHON_LIBRARIES})
-    target_compile_options(fastdeploy_main
+    target_link_libraries(${PY_LIBRARY_NAME} PRIVATE ${PYTHON_LIBRARIES})
+    target_compile_options(${PY_LIBRARY_NAME}
                            PRIVATE /MP
                            /wd4244 # 'argument': conversion from 'google::
                                    # protobuf::uint64' to 'int', possible
                                    # possible loss of data
                            /wd4996 # The second parameter is ignored.
                           ${EXTRA_FLAGS})
-    target_compile_options(fastdeploy_main PRIVATE $<$<NOT:$<CONFIG:Debug>>:/MT> $<$<CONFIG:Debug>:/MTd>)
+    target_compile_options(${PY_LIBRARY_NAME} PRIVATE $<$<NOT:$<CONFIG:Debug>>:/MT> $<$<CONFIG:Debug>:/MTd>)
   endif()
 endif(BUILD_FASTDEPLOY_PYTHON)
diff --git a/FastDeploy.cmake.in b/FastDeploy.cmake.in
index e8c0bb3d59..ccf2a574b3 100644
--- a/FastDeploy.cmake.in
+++ b/FastDeploy.cmake.in
@@ -8,6 +8,7 @@ set(ENABLE_TRT_BACKEND @ENABLE_TRT_BACKEND@)
 set(ENABLE_PADDLE_FRONTEND @ENABLE_PADDLE_FRONTEND@)
 set(ENABLE_VISION @ENABLE_VISION@)
 set(ENABLE_OPENCV_CUDA @ENABLE_OPENCV_CUDA@)
+set(LIBRARY_NAME @LIBRARY_NAME@)
 
 set(FASTDEPLOY_LIBS "")
 set(FASTDEPLOY_INCS "")
@@ -17,7 +18,7 @@ if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 11)
 endif()
 
-find_library(FDLIB fastdeploy ${CMAKE_CURRENT_LIST_DIR}/lib)
+find_library(FDLIB ${LIBRARY_NAME} ${CMAKE_CURRENT_LIST_DIR}/lib)
 list(APPEND FASTDEPLOY_LIBS ${FDLIB})
 
 if(ENABLE_ORT_BACKEND)
@@ -51,13 +52,17 @@ if(WITH_GPU)
   list(APPEND FASTDEPLOY_LIBS ${CUDA_LIB})
 
   if (ENABLE_TRT_BACKEND)
-    if (NOT TRT_DIRECTORY)
-      message(FATAL_ERROR "[FastDeploy] Please define TRT_DIRECTORY, e.g -DTRT_DIRECTORY=/usr/downloads/TensorRT-8.4.1.0")
-    endif()
-    find_library(TRT_INFER_LIB nvinfer ${TRT_DIRECTORY}/lib)
-    find_library(TRT_ONNX_LIB nvonnxparser ${TRT_DIRECTORY}/lib)
-    find_library(TRT_CAFFE_LIB nvcaffe_parser ${TRT_DIRECTORY}/lib)
-    find_library(TRT_PLUGIN_LIB nvinfer_plugin ${TRT_DIRECTORY}/lib)
+#    if (NOT TRT_DIRECTORY)
+#      message(FATAL_ERROR "[FastDeploy] Please define TRT_DIRECTORY, e.g -DTRT_DIRECTORY=/usr/downloads/TensorRT-8.4.1.0")
+#    endif()
+#    find_library(TRT_INFER_LIB nvinfer ${TRT_DIRECTORY}/lib)
+#    find_library(TRT_ONNX_LIB nvonnxparser ${TRT_DIRECTORY}/lib)
+#    find_library(TRT_CAFFE_LIB nvcaffe_parser ${TRT_DIRECTORY}/lib)
+#    find_library(TRT_PLUGIN_LIB nvinfer_plugin ${TRT_DIRECTORY}/lib)
+    find_library(TRT_INFER_LIB nvinfer ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/tensorrt/lib)
+    find_library(TRT_ONNX_LIB nvonnxparser ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/tensorrt/lib)
+    find_library(TRT_CAFFE_LIB nvcaffe_parser ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/tensorrt/lib)
+    find_library(TRT_PLUGIN_LIB nvinfer_plugin ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/tensorrt/lib)
     list(APPEND FASTDEPLOY_LIBS ${TRT_INFER_LIB} ${TRT_ONNX_LIB} ${TRT_CAFFE_LIB} ${TRT_PLUGIN_LIB})
   endif()
 endif()
diff --git a/copy_directory.py b/copy_directory.py
new file mode 100644
index 0000000000..f0313db3c2
--- /dev/null
+++ b/copy_directory.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+import os
+import sys
+
+
+def copy_directory(src, dst):
+    if os.path.exists(dst):
+        raise Exception("Destination {} already exists.".format(dst))
+    if not os.path.exists(src):
+        raise Exception("Source {} does not exist.".format(src))
+    try:
+        shutil.copytree(src, dst, symlinks=True)
+    except:
+        raise Exception("Copy {} to {} failed.".format(src, dst))
+
+
+if __name__ == "__main__":
+    copy_directory(sys.argv[1], sys.argv[2])
diff --git a/fastdeploy/backends/ort/ort_backend.cc b/fastdeploy/backends/ort/ort_backend.cc
index 909b5f2875..f5d0bfdd98 100644
--- a/fastdeploy/backends/ort/ort_backend.cc
+++ b/fastdeploy/backends/ort/ort_backend.cc
@@ -81,10 +81,10 @@ void OrtBackend::BuildOption(const OrtBackendOption& option) {
     }
   }
   if (!support_cuda) {
-    FDLogger() << "[WARN] Compiled fastdeploy with onnxruntime doesn't "
-                  "support GPU, the available providers are "
-               << providers_msg << "will fallback to CPUExecutionProvider."
-               << std::endl;
+    FDWARNING << "Compiled fastdeploy with onnxruntime doesn't "
+                 "support GPU, the available providers are "
+              << providers_msg << "will fall back to CPUExecutionProvider."
+              << std::endl;
     option_.use_gpu = false;
   } else {
     FDASSERT(option.gpu_id == 0, "Requires gpu_id == 0, but now gpu_id = " +
diff --git a/fastdeploy/backends/tensorrt/trt_backend.cc b/fastdeploy/backends/tensorrt/trt_backend.cc
index a29af6e9a3..d050cc9f22 100644
--- a/fastdeploy/backends/tensorrt/trt_backend.cc
+++ b/fastdeploy/backends/tensorrt/trt_backend.cc
@@ -52,6 +52,61 @@ std::vector<int> toVec(const nvinfer1::Dims& dim) {
   return out;
 }
 
+bool CheckDynamicShapeConfig(const paddle2onnx::OnnxReader& reader,
+                             const TrtBackendOption& option) {
+  paddle2onnx::ModelTensorInfo inputs[reader.NumInputs()];
+  std::string input_shapes[reader.NumInputs()];
+  for (int i = 0; i < reader.NumInputs(); ++i) {
+    reader.GetInputInfo(i, &inputs[i]);
+
+    // change 0 to -1, when input_dim is a string, onnx will make it to zero
+    for (int j = 0; j < inputs[i].rank; ++j) {
+      if (inputs[i].shape[j] <= 0) {
+        inputs[i].shape[j] = -1;
+      }
+    }
+
+    input_shapes[i] = "";
+    for (int j = 0; j < inputs[i].rank; ++j) {
+      if (j != inputs[i].rank - 1) {
+        input_shapes[i] += (std::to_string(inputs[i].shape[j]) + ", ");
+      } else {
+        input_shapes[i] += std::to_string(inputs[i].shape[j]);
+      }
+    }
+  }
+
+  bool all_check_passed = true;
+  for (int i = 0; i < reader.NumInputs(); ++i) {
+    bool contain_unknown_dim = false;
+    for (int j = 0; j < inputs[i].rank; ++j) {
+      if (inputs[i].shape[j] < 0) {
+        contain_unknown_dim = true;
+      }
+    }
+
+    std::string name(inputs[i].name, strlen(inputs[i].name));
+    FDINFO << "The loaded model's input tensor:" << name
+           << " has shape [" + input_shapes[i] << "]." << std::endl;
+    if (contain_unknown_dim) {
+      auto iter1 = option.min_shape.find(name);
+      auto iter2 = option.max_shape.find(name);
+      auto iter3 = option.opt_shape.find(name);
+      if (iter1 == option.min_shape.end() || iter2 == option.max_shape.end() ||
+          iter3 == option.opt_shape.end()) {
+        FDERROR << "The loaded model's input tensor:" << name
+                << " has dynamic shape [" + input_shapes[i] +
+                       "], but didn't configure its shape for tensorrt with "
+                       "SetTrtInputShape correctly."
+                << std::endl;
+        all_check_passed = false;
+      }
+    }
+  }
+
+  return all_check_passed;
+}
+
 bool TrtBackend::InitFromTrt(const std::string& trt_engine_file,
                              const TrtBackendOption& option) {
   if (initialized_) {
@@ -167,13 +222,17 @@ bool TrtBackend::InitFromOnnx(const std::string& model_file,
                      onnx_reader.output_names[i] +
                          strlen(onnx_reader.output_names[i]));
     outputs_order_[name] = i;
   }
+  if (!CheckDynamicShapeConfig(onnx_reader, option)) {
+    FDERROR << "TrtBackend::CheckDynamicShapeConfig failed." << std::endl;
+    return false;
+  }
   if (option.serialize_file != "") {
     std::ifstream fin(option.serialize_file, std::ios::binary | std::ios::in);
     if (fin) {
-      FDLogger() << "Detect serialized TensorRT Engine file in "
-                 << option.serialize_file << ", will load it directly."
-                 << std::endl;
+      FDINFO << "Detected serialized TensorRT Engine file in "
+             << option.serialize_file << ", will load it directly."
+             << std::endl;
       fin.close();
       return InitFromTrt(option.serialize_file);
     }
@@ -311,9 +370,9 @@ bool TrtBackend::CreateTrtEngine(const std::string& onnx_model,
 
   if (option.enable_fp16) {
     if (!builder->platformHasFastFp16()) {
-      FDLogger() << "[WARN] Detected FP16 is not supported in the current GPU, "
-                    "will use FP32 instead."
-                 << std::endl;
+      FDWARNING << "Detected FP16 is not supported in the current GPU, "
+                   "will use FP32 instead."
+                << std::endl;
     } else {
       config->setFlag(nvinfer1::BuilderFlag::kFP16);
     }
@@ -330,33 +389,13 @@ bool TrtBackend::CreateTrtEngine(const std::string& onnx_model,
     return false;
   }
 
-  FDLogger() << "Start to building TensorRT Engine..." << std::endl;
+  FDINFO << "Start building TensorRT Engine..." << std::endl;
   bool fp16 = builder->platformHasFastFp16();
   builder->setMaxBatchSize(option.max_batch_size);
 
   config->setMaxWorkspaceSize(option.max_workspace_size);
 
-  if (option.fixed_shape.size() > 0) {
-    auto profile = builder->createOptimizationProfile();
-    for (auto& item : option.fixed_shape) {
-      FDASSERT(profile->setDimensions(item.first.c_str(),
-                                      nvinfer1::OptProfileSelector::kMIN,
-                                      sample::toDims(item.second)),
-               "[TrtBackend] Failed to set min_shape for input: " + item.first +
-                   " in TrtBackend.");
-      FDASSERT(profile->setDimensions(item.first.c_str(),
-                                      nvinfer1::OptProfileSelector::kOPT,
-                                      sample::toDims(item.second)),
-               "[TrtBackend] Failed to set min_shape for input: " + item.first +
-                   " in TrtBackend.");
-      FDASSERT(profile->setDimensions(item.first.c_str(),
-                                      nvinfer1::OptProfileSelector::kMAX,
-                                      sample::toDims(item.second)),
-               "[TrtBackend] Failed to set min_shape for input: " + item.first +
-                   " in TrtBackend.");
-    }
-    config->addOptimizationProfile(profile);
-  } else if (option.max_shape.size() > 0) {
+  if (option.max_shape.size() > 0) {
     auto profile = builder->createOptimizationProfile();
     FDASSERT(option.max_shape.size() == option.min_shape.size() &&
                  option.min_shape.size() == option.opt_shape.size(),
@@ -416,10 +455,10 @@ bool TrtBackend::CreateTrtEngine(const std::string& onnx_model,
     return false;
   }
 
-  FDLogger() << "TensorRT Engine is built succussfully." << std::endl;
+  FDINFO << "TensorRT Engine is built successfully." << std::endl;
   if (option.serialize_file != "") {
-    FDLogger() << "Serialize TensorRTEngine to local file "
-               << option.serialize_file << "." << std::endl;
+    FDINFO << "Serialize TensorRTEngine to local file " << option.serialize_file
+           << "." << std::endl;
     std::ofstream engine_file(option.serialize_file.c_str());
     if (!engine_file) {
       FDERROR << "Failed to open " << option.serialize_file << " to write."
@@ -428,11 +467,11 @@ bool TrtBackend::CreateTrtEngine(const std::string& onnx_model,
     }
     engine_file.write(static_cast<const char*>(plan->data()), plan->size());
     engine_file.close();
-    FDLogger() << "TensorRTEngine is serialized to local file "
-               << option.serialize_file
-               << ", we can load this model from the seralized engine "
-                  "directly next time."
-               << std::endl;
+    FDINFO << "TensorRTEngine is serialized to local file "
+           << option.serialize_file
+           << ", we can load this model from the serialized engine "
+              "directly next time."
+           << std::endl;
   }
   return true;
 }
diff --git a/fastdeploy/backends/tensorrt/trt_backend.h b/fastdeploy/backends/tensorrt/trt_backend.h
index 1da7f14714..27e6e552b4 100644
--- a/fastdeploy/backends/tensorrt/trt_backend.h
+++ b/fastdeploy/backends/tensorrt/trt_backend.h
@@ -46,7 +46,6 @@ struct TrtBackendOption {
   bool enable_int8 = false;
   size_t max_batch_size = 32;
   size_t max_workspace_size = 1 << 30;
-  std::map<std::string, std::vector<int32_t>> fixed_shape;
   std::map<std::string, std::vector<int32_t>> max_shape;
   std::map<std::string, std::vector<int32_t>> min_shape;
   std::map<std::string, std::vector<int32_t>> opt_shape;
diff --git a/fastdeploy/fastdeploy_model.cc b/fastdeploy/fastdeploy_model.cc
index c61eea7cb6..e434e19fa5 100644
--- a/fastdeploy/fastdeploy_model.cc
+++ b/fastdeploy/fastdeploy_model.cc
@@ -132,9 +132,9 @@ void FastDeployModel::EnableDebug() {
 #ifdef FASTDEPLOY_DEBUG
   debug_ = true;
 #else
-  FDLogger() << "The compile FastDeploy is not with -DENABLE_DEBUG=ON, so "
-                "cannot enable debug mode."
-             << std::endl;
+  FDWARNING << "FastDeploy was not compiled with -DENABLE_DEBUG=ON, so "
+               "cannot enable debug mode."
+            << std::endl;
   debug_ = false;
 #endif
 }
diff --git a/fastdeploy/fastdeploy_runtime.cc b/fastdeploy/fastdeploy_runtime.cc
index e353c64167..05af6e14e3 100644
--- a/fastdeploy/fastdeploy_runtime.cc
+++ b/fastdeploy/fastdeploy_runtime.cc
@@ -77,23 +77,23 @@ bool CheckModelFormat(const std::string& model_file,
   if (model_format == Frontend::PADDLE) {
     if (model_file.size() < 8 ||
         model_file.substr(model_file.size() - 8, 8) != ".pdmodel") {
-      FDLogger() << "With model format of Frontend::PADDLE, the model file "
-                    "should ends with `.pdmodel`, but now it's "
-                 << model_file << std::endl;
+      FDERROR << "With model format of Frontend::PADDLE, the model file "
+                 "should end with `.pdmodel`, but now it's "
+              << model_file << std::endl;
       return false;
     }
   } else if (model_format == Frontend::ONNX) {
     if (model_file.size() < 5 ||
         model_file.substr(model_file.size() - 5, 5) != ".onnx") {
-      FDLogger() << "With model format of Frontend::ONNX, the model file "
-                    "should ends with `.onnx`, but now it's "
-                 << model_file << std::endl;
+      FDERROR << "With model format of Frontend::ONNX, the model file "
+                 "should end with `.onnx`, but now it's "
+              << model_file << std::endl;
       return false;
     }
   } else {
-    FDLogger() << "Only support model format with frontend Frontend::PADDLE / "
-                  "Frontend::ONNX."
-               << std::endl;
+    FDERROR << "Only support model format with frontend Frontend::PADDLE / "
+               "Frontend::ONNX."
+            << std::endl;
     return false;
   }
   return true;
 }
 
@@ -116,6 +116,101 @@
+void RuntimeOption::SetModelPath(const std::string& model_path,
+                                 const std::string& params_path,
+                                 const std::string& _model_format) {
+  if (_model_format == "paddle") {
+    model_file = model_path;
+    params_file = params_path;
+    model_format = Frontend::PADDLE;
+  } else if (_model_format == "onnx") {
+    model_file = model_path;
+    model_format = Frontend::ONNX;
+  } else {
+    FDASSERT(false, "The model format can only be 'paddle' or 'onnx'.");
+  }
+}
+
+void RuntimeOption::UseGpu(int gpu_id) {
+#ifdef WITH_GPU
+  device = Device::GPU;
+  device_id = gpu_id;
+#else
+  FDWARNING << "FastDeploy was not compiled with GPU, will fall back to CPU."
+            << std::endl;
+  device = Device::CPU;
+#endif
+}
+
+void RuntimeOption::UseCpu() { device = Device::CPU; }
+
+void RuntimeOption::SetCpuThreadNum(int thread_num) {
+  FDASSERT(thread_num > 0, "The thread_num must be greater than 0.");
+  cpu_thread_num = thread_num;
+}
+
+// use paddle inference backend
+void RuntimeOption::UsePaddleBackend() {
+#ifdef ENABLE_PADDLE_BACKEND
+  backend = Backend::PDINFER;
+#else
+  FDASSERT(false, "FastDeploy was not compiled with Paddle Inference.");
+#endif
+}
+
+// use onnxruntime backend
+void RuntimeOption::UseOrtBackend() {
+#ifdef ENABLE_ORT_BACKEND
+  backend = Backend::ORT;
+#else
+  FDASSERT(false, "FastDeploy was not compiled with OrtBackend.");
+#endif
+}
+
+void RuntimeOption::UseTrtBackend() {
+#ifdef ENABLE_TRT_BACKEND
+  backend = Backend::TRT;
+#else
+  FDASSERT(false, "FastDeploy was not compiled with TrtBackend.");
+#endif
+}
+
+void RuntimeOption::EnablePaddleMKLDNN() { pd_enable_mkldnn = true; }
+
+void RuntimeOption::DisablePaddleMKLDNN() { pd_enable_mkldnn = false; }
+
+void RuntimeOption::SetPaddleMKLDNNCacheSize(int size) {
+  FDASSERT(size > 0, "Parameter size must be greater than 0.");
+  pd_mkldnn_cache_size = size;
+}
+
+void RuntimeOption::SetTrtInputShape(const std::string& input_name,
+                                     const std::vector<int32_t>& min_shape,
+                                     const std::vector<int32_t>& opt_shape,
+                                     const std::vector<int32_t>& max_shape) {
+  trt_min_shape[input_name].clear();
+  trt_max_shape[input_name].clear();
+  trt_opt_shape[input_name].clear();
+  trt_min_shape[input_name].assign(min_shape.begin(), min_shape.end());
+  if (opt_shape.size() == 0) {
+    trt_opt_shape[input_name].assign(min_shape.begin(), min_shape.end());
+  } else {
+    trt_opt_shape[input_name].assign(opt_shape.begin(), opt_shape.end());
+  }
+  if (max_shape.size() == 0) {
+    trt_max_shape[input_name].assign(min_shape.begin(), min_shape.end());
+  } else {
+    trt_max_shape[input_name].assign(max_shape.begin(), max_shape.end());
+  }
+  FDINFO << trt_min_shape[input_name].size() << " "
+         << trt_opt_shape[input_name].size() << " "
+         << trt_max_shape[input_name].size() << std::endl;
+}
+
+void RuntimeOption::EnableTrtFP16() { trt_enable_fp16 = true; }
+
+void RuntimeOption::DisableTrtFP16() { trt_enable_fp16 = false; }
+
 bool Runtime::Init(const RuntimeOption& _option) {
   option = _option;
   if (option.model_format == Frontend::AUTOREC) {
@@ -229,7 +324,6 @@ void Runtime::CreateTrtBackend() {
   trt_option.enable_int8 = option.trt_enable_int8;
   trt_option.max_batch_size = option.trt_max_batch_size;
   trt_option.max_workspace_size = option.trt_max_workspace_size;
-  trt_option.fixed_shape = option.trt_fixed_shape;
   trt_option.max_shape = option.trt_max_shape;
   trt_option.min_shape = option.trt_min_shape;
   trt_option.opt_shape = option.trt_opt_shape;
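
Taken together, the new helpers above replace direct field assignment on RuntimeOption. A sketch of the intended call sequence (the model paths and the input tensor name "image" are placeholders, not part of this patch):

    fastdeploy::RuntimeOption option;
    option.SetModelPath("model.pdmodel", "model.pdiparams", "paddle");
    option.UseGpu(0);        // falls back to CPU when not built WITH_GPU
    option.UseTrtBackend();  // requires ENABLE_TRT_BACKEND=ON
    // For a dynamic input only min_shape is mandatory; opt/max default to it.
    option.SetTrtInputShape("image", {1, 3, 320, 320}, {1, 3, 640, 640},
                            {1, 3, 1280, 1280});
    option.EnableTrtFP16();
    fastdeploy::Runtime runtime;
    runtime.Init(option);
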
diff --git a/fastdeploy/fastdeploy_runtime.h b/fastdeploy/fastdeploy_runtime.h
index 7ec08e9d9d..d0f01069f8 100644
--- a/fastdeploy/fastdeploy_runtime.h
+++ b/fastdeploy/fastdeploy_runtime.h
@@ -36,8 +36,58 @@ bool CheckModelFormat(const std::string& model_file,
 Frontend GuessModelFormat(const std::string& model_file);
 
 struct FASTDEPLOY_DECL RuntimeOption {
-  Backend backend = Backend::UNKNOWN;
+  // set path of model file and params file
+  // for onnx, only model_file is needed, but model_format
+  // still needs to be defined
+  // model_format supports 'paddle' / 'onnx' now
+  void SetModelPath(const std::string& model_path,
+                    const std::string& params_path = "",
+                    const std::string& _model_format = "paddle");
+
+  // set model inference in CPU
+  void UseCpu();
+
+  // set model inference in GPU
+  void UseGpu(int gpu_id = 0);
+
+  // set number of threads while inference in CPU
+  void SetCpuThreadNum(int thread_num);
+
+  // use paddle inference backend
+  void UsePaddleBackend();
+
+  // use onnxruntime backend
+  void UseOrtBackend();
+
+  // use tensorrt backend
+  void UseTrtBackend();
+
+  // enable mkldnn while using paddle inference in CPU
+  void EnablePaddleMKLDNN();
+  // disable mkldnn while using paddle inference in CPU
+  void DisablePaddleMKLDNN();
+
+  // set size of cached shape while enable mkldnn with paddle inference backend
+  void SetPaddleMKLDNNCacheSize(int size);
+
+  // set tensorrt shape while the inputs of model contain dynamic shape
+  // min_shape: the minimum shape
+  // opt_shape: the most common shape while inference, defaults to empty
+  // max_shape: the maximum shape, defaults to empty
+  // if opt_shape and max_shape are empty, they will keep the same as min_shape,
+  // which means the shape will be fixed as min_shape while inference
+  void SetTrtInputShape(
+      const std::string& input_name, const std::vector<int32_t>& min_shape,
+      const std::vector<int32_t>& opt_shape = std::vector<int32_t>(),
+      const std::vector<int32_t>& max_shape = std::vector<int32_t>());
+
+  // enable half precision while using tensorrt backend
+  void EnableTrtFP16();
+  // disable half precision, change to full precision (float32)
+  void DisableTrtFP16();
+
+  Backend backend = Backend::UNKNOWN;
   // for cpu inference and preprocess
   int cpu_thread_num = 8;
   int device_id = 0;
@@ -62,7 +112,6 @@ struct FASTDEPLOY_DECL RuntimeOption {
   int pd_mkldnn_cache_size = 1;
 
   // ======Only for Trt Backend=======
-  std::map<std::string, std::vector<int32_t>> trt_fixed_shape;
   std::map<std::string, std::vector<int32_t>> trt_max_shape;
   std::map<std::string, std::vector<int32_t>> trt_min_shape;
   std::map<std::string, std::vector<int32_t>> trt_opt_shape;
diff --git a/fastdeploy/fastdeploy_runtime.py b/fastdeploy/fastdeploy_runtime.py
index 592d1d2952..b23879b36d 100644
--- a/fastdeploy/fastdeploy_runtime.py
+++ b/fastdeploy/fastdeploy_runtime.py
@@ -55,27 +55,33 @@ def initialized(self):
         return self._model.initialized()
 
 
-class FastDeployRuntime:
+class Runtime:
     def __init__(self, runtime_option):
-        self._runtime = C.Runtime();
-        assert self._runtime.init(runtime_option), "Initialize FastDeployRuntime Failed!"
+        self._runtime = C.Runtime()
+        assert self._runtime.init(runtime_option), "Initialize Runtime Failed!"
 
     def infer(self, data):
         assert isinstance(data, dict), "The input data should be type of dict."
         return self._runtime.infer(data)
 
     def num_inputs(self):
-        return self._runtime.num_inputs();
+        return self._runtime.num_inputs()
 
     def num_outputs(self):
-        return self._runtime.num_outputs();
+        return self._runtime.num_outputs()
 
     def get_input_info(self, index):
-        assert isinstance(index, int), "The input parameter index should be type of int."
-        assert index < self.num_inputs(), "The input parameter index:{} should less than number of inputs:{}.".format(index, self.num_inputs)
+        assert isinstance(
+            index, int), "The input parameter index should be type of int."
+        assert index < self.num_inputs(
+        ), "The input parameter index:{} should be less than number of inputs:{}.".format(
+            index, self.num_inputs())
         return self._runtime.get_input_info(index)
 
     def get_output_info(self, index):
-        assert isinstance(index, int), "The input parameter index should be type of int."
-        assert index < self.num_outputs(), "The input parameter index:{} should less than number of outputs:{}.".format(index, self.num_outputs)
+        assert isinstance(
+            index, int), "The input parameter index should be type of int."
+        assert index < self.num_outputs(
+        ), "The input parameter index:{} should be less than number of outputs:{}.".format(
+            index, self.num_outputs())
         return self._runtime.get_output_info(index)
diff --git a/fastdeploy/pybind/fastdeploy_runtime.cc b/fastdeploy/pybind/fastdeploy_runtime.cc
index e3c6dd19ae..5f27509caf 100644
--- a/fastdeploy/pybind/fastdeploy_runtime.cc
+++ b/fastdeploy/pybind/fastdeploy_runtime.cc
@@ -19,6 +19,20 @@ namespace fastdeploy {
 
 void BindRuntime(pybind11::module& m) {
   pybind11::class_<RuntimeOption>(m, "RuntimeOption")
       .def(pybind11::init())
+      .def("set_model_path", &RuntimeOption::SetModelPath)
+      .def("use_gpu", &RuntimeOption::UseGpu)
+      .def("use_cpu", &RuntimeOption::UseCpu)
+      .def("set_cpu_thread_num", &RuntimeOption::SetCpuThreadNum)
+      .def("use_paddle_backend", &RuntimeOption::UsePaddleBackend)
+      .def("use_ort_backend", &RuntimeOption::UseOrtBackend)
+      .def("use_trt_backend", &RuntimeOption::UseTrtBackend)
+      .def("enable_paddle_mkldnn", &RuntimeOption::EnablePaddleMKLDNN)
+      .def("disable_paddle_mkldnn", &RuntimeOption::DisablePaddleMKLDNN)
+      .def("set_paddle_mkldnn_cache_size",
+           &RuntimeOption::SetPaddleMKLDNNCacheSize)
+      .def("set_trt_input_shape", &RuntimeOption::SetTrtInputShape)
+      .def("enable_trt_fp16", &RuntimeOption::EnableTrtFP16)
+      .def("disable_trt_fp16", &RuntimeOption::DisableTrtFP16)
       .def_readwrite("model_file", &RuntimeOption::model_file)
       .def_readwrite("params_file", &RuntimeOption::params_file)
       .def_readwrite("model_format", &RuntimeOption::model_format)
@@ -30,7 +44,6 @@ void BindRuntime(pybind11::module& m) {
       .def_readwrite("ort_inter_op_num_threads",
                      &RuntimeOption::ort_inter_op_num_threads)
       .def_readwrite("ort_execution_mode", &RuntimeOption::ort_execution_mode)
-      .def_readwrite("trt_fixed_shape", &RuntimeOption::trt_fixed_shape)
       .def_readwrite("trt_max_shape", &RuntimeOption::trt_max_shape)
       .def_readwrite("trt_opt_shape", &RuntimeOption::trt_opt_shape)
       .def_readwrite("trt_min_shape", &RuntimeOption::trt_min_shape)
@@ -49,41 +62,43 @@ void BindRuntime(pybind11::module& m) {
   pybind11::class_<Runtime>(m, "Runtime")
       .def(pybind11::init())
      .def("init", &Runtime::Init)
-      .def("infer", [](Runtime& self,
-                       std::map<std::string, pybind11::array>& data) {
-        std::vector<FDTensor> inputs(data.size());
-        int index = 0;
-        for (auto iter = data.begin(); iter != data.end(); ++iter) {
-          inputs[index].dtype = NumpyDataTypeToFDDataType(iter->second.dtype());
-          inputs[index].shape.insert(
-              inputs[index].shape.begin(), iter->second.shape(),
-              iter->second.shape() + iter->second.ndim());
-          // TODO(jiangjiajun) Maybe skip memory copy is a better choice
-          // use SetExternalData
-          inputs[index].data.resize(iter->second.nbytes());
-          memcpy(inputs[index].data.data(), iter->second.mutable_data(),
-                 iter->second.nbytes());
-          inputs[index].name = iter->first;
-        }
 
-        std::vector<FDTensor> outputs(self.NumOutputs());
-        self.Infer(inputs, &outputs);
 
-        std::vector<pybind11::array> results;
-        results.reserve(outputs.size());
-        for (size_t i = 0; i < outputs.size(); ++i) {
-          auto numpy_dtype = FDDataTypeToNumpyDataType(outputs[i].dtype);
-          results.emplace_back(pybind11::array(numpy_dtype, outputs[i].shape));
-          memcpy(results[i].mutable_data(), outputs[i].data.data(),
-                 outputs[i].Numel() * FDDataTypeSize(outputs[i].dtype));
-        }
-        return results;
-      })
-      .def("num_inputs", &Runtime::NumInputs)
-      .def("num_outputs", &Runtime::NumOutputs)
-      .def("get_input_info", &Runtime::GetInputInfo)
-      .def("get_output_info", &Runtime::GetOutputInfo)
-      .def_readonly("option", &Runtime::option);
+      .def("infer",
+           [](Runtime& self, std::map<std::string, pybind11::array>& data) {
+             std::vector<FDTensor> inputs(data.size());
+             int index = 0;
+             for (auto iter = data.begin(); iter != data.end(); ++iter) {
+               inputs[index].dtype =
+                   NumpyDataTypeToFDDataType(iter->second.dtype());
+               inputs[index].shape.insert(
+                   inputs[index].shape.begin(), iter->second.shape(),
+                   iter->second.shape() + iter->second.ndim());
+               // TODO(jiangjiajun) Maybe skip memory copy is a better choice
+               // use SetExternalData
+               inputs[index].data.resize(iter->second.nbytes());
+               memcpy(inputs[index].data.data(), iter->second.mutable_data(),
+                      iter->second.nbytes());
+               inputs[index].name = iter->first;
+             }
 
+             std::vector<FDTensor> outputs(self.NumOutputs());
+             self.Infer(inputs, &outputs);
 
+             std::vector<pybind11::array> results;
+             results.reserve(outputs.size());
+             for (size_t i = 0; i < outputs.size(); ++i) {
+               auto numpy_dtype = FDDataTypeToNumpyDataType(outputs[i].dtype);
+               results.emplace_back(
+                   pybind11::array(numpy_dtype, outputs[i].shape));
+               memcpy(results[i].mutable_data(), outputs[i].data.data(),
+                      outputs[i].Numel() * FDDataTypeSize(outputs[i].dtype));
+             }
+             return results;
+           })
+      .def("num_inputs", &Runtime::NumInputs)
+      .def("num_outputs", &Runtime::NumOutputs)
+      .def("get_input_info", &Runtime::GetInputInfo)
+      .def("get_output_info", &Runtime::GetOutputInfo)
+      .def_readonly("option", &Runtime::option);
 
   pybind11::enum_<Backend>(m, "Backend", pybind11::arithmetic(),
                            "Backend for inference.")
diff --git a/fastdeploy/utils/utils.cc b/fastdeploy/utils/utils.cc
index e4e5d1472d..dfe5326d12 100644
--- a/fastdeploy/utils/utils.cc
+++ b/fastdeploy/utils/utils.cc
@@ -31,4 +31,4 @@ FDLogger& FDLogger::operator<<(std::ostream& (*os)(std::ostream&)) {
   return *this;
 }
 
-}  // namespace fastdeploy
+}  // namespace fastdeploy
diff --git a/fastdeploy/utils/utils.h b/fastdeploy/utils/utils.h
index b57a27f80c..f427cd7a3b 100644
--- a/fastdeploy/utils/utils.h
+++ b/fastdeploy/utils/utils.h
@@ -69,13 +69,17 @@ class FASTDEPLOY_DECL FDLogger {
 #define __REL_FILE__ __FILE__
 #endif
 
-#define FDERROR \
-  FDLogger(true, "[ERROR]") \
+#define FDERROR \
+  FDLogger(true, "[ERROR]") \
       << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
 
-#define FDERROR \
-  FDLogger(true, "[ERROR]") << __REL_FILE__ << "(" << __LINE__ \
-                            << ")::" << __FUNCTION__ << "\t"
+#define FDWARNING \
+  FDLogger(true, "[WARNING]") \
+      << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
+
+#define FDINFO \
+  FDLogger(true, "[INFO]") << __REL_FILE__ << "(" << __LINE__ \
+                           << ")::" << __FUNCTION__ << "\t"
 
 #define FDASSERT(condition, message) \
   if (!(condition)) { \
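
The three macros above give FDLogger distinct severity prefixes; usage is stream-style, for example:

    // Each macro expands to an FDLogger tagged with its level plus the
    // file(line)::function preamble shown in the definitions above.
    FDINFO << "Engine cache found, loading it directly." << std::endl;
    FDWARNING << "FP16 not supported on this GPU, using FP32." << std::endl;
    FDERROR << "Failed to open serialized engine file." << std::endl;
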
diff --git a/fastdeploy/vision/common/processors/cast.cc b/fastdeploy/vision/common/processors/cast.cc
index 77a1b249ae..b9a757f142 100644
--- a/fastdeploy/vision/common/processors/cast.cc
+++ b/fastdeploy/vision/common/processors/cast.cc
@@ -29,9 +29,8 @@ bool Cast::CpuRun(Mat* mat) {
       im->convertTo(*im, CV_64FC(c));
     }
   } else {
-    FDLogger() << "[WARN] Cast not support for " << dtype_
-               << " now! will skip this operation."
-               << std::endl;
+    FDWARNING << "Cast is not supported for " << dtype_
+              << " now! Will skip this operation." << std::endl;
   }
   return true;
 }
@@ -49,9 +48,8 @@ bool Cast::GpuRun(Mat* mat) {
       im->convertTo(*im, CV_64FC(c));
     }
   } else {
-    FDLogger() << "[WARN] Cast not support for " << dtype_
-               << " now! will skip this operation."
-               << std::endl;
+    FDWARNING << "Cast is not supported for " << dtype_
+              << " now! Will skip this operation." << std::endl;
  }
   return true;
 }
@@ -62,5 +60,5 @@ bool Cast::Run(Mat* mat, const std::string& dtype, ProcLib lib) {
   return c(mat, lib);
 }
 
-}  // namespace vision
-}  // namespace fastdeploy
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/model_zoo/vision/ppyoloe/cpp/CMakeLists.txt b/model_zoo/vision/ppyoloe/cpp/CMakeLists.txt
index e681566517..42ae436106 100644
--- a/model_zoo/vision/ppyoloe/cpp/CMakeLists.txt
+++ b/model_zoo/vision/ppyoloe/cpp/CMakeLists.txt
@@ -5,7 +5,7 @@ CMAKE_MINIMUM_REQUIRED (VERSION 3.16)
 # add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
 
 # Specify the path of the downloaded and extracted fastdeploy library
-set(FASTDEPLOY_INSTALL_DIR ${PROJECT_SOURCE_DIR}/fastdeploy-linux-x64-0.3.0/)
+set(FASTDEPLOY_INSTALL_DIR /fastdeploy/CustomOp/FastDeploy/build1/fastdeploy-linux-x64-gpu-0.3.0)
 
 include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
diff --git a/model_zoo/vision/ppyoloe/cpp/ppyoloe.cc b/model_zoo/vision/ppyoloe/cpp/ppyoloe.cc
index 30765f075d..e63f29e62a 100644
--- a/model_zoo/vision/ppyoloe/cpp/ppyoloe.cc
+++ b/model_zoo/vision/ppyoloe/cpp/ppyoloe.cc
@@ -23,11 +23,7 @@ int main() {
   std::string img_path = "000000014439_640x640.jpg";
   std::string vis_path = "vis.jpeg";
 
-  auto option = fastdeploy::RuntimeOption();
-  option.device = fastdeploy::Device::CPU;
-  option.backend = fastdeploy::Backend::PDINFER;
-  auto model =
-      vis::ppdet::PPYOLOE(model_file, params_file, config_file, option);
+  auto model = vis::ppdet::PPYOLOE(model_file, params_file, config_file);
   if (!model.Initialized()) {
     std::cerr << "Init Failed."
<< std::endl; return -1; diff --git a/model_zoo/vision/yolov5/cpp/CMakeLists.txt b/model_zoo/vision/yolov5/cpp/CMakeLists.txt index 13ddc9d21f..c1f82a6fe5 100644 --- a/model_zoo/vision/yolov5/cpp/CMakeLists.txt +++ b/model_zoo/vision/yolov5/cpp/CMakeLists.txt @@ -5,7 +5,8 @@ CMAKE_MINIMUM_REQUIRED (VERSION 3.16) # add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) # 指定下载解压后的fastdeploy库路径 -set(FASTDEPLOY_INSTALL_DIR ${PROJECT_SOURCE_DIR}/fastdeploy-linux-x64-0.3.0/) +set(FASTDEPLOY_INSTALL_DIR /fastdeploy/CustomOp/FastDeploy/build1/fastdeploy-linux-x64-gpu-0.3.0) + include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) diff --git a/model_zoo/vision/yolox/cpp/CMakeLists.txt b/model_zoo/vision/yolox/cpp/CMakeLists.txt index fe9668f6a0..67bf0f2da6 100644 --- a/model_zoo/vision/yolox/cpp/CMakeLists.txt +++ b/model_zoo/vision/yolox/cpp/CMakeLists.txt @@ -5,7 +5,7 @@ CMAKE_MINIMUM_REQUIRED (VERSION 3.16) # add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) # 指定下载解压后的fastdeploy库路径 -set(FASTDEPLOY_INSTALL_DIR ${PROJECT_SOURCE_DIR}/fastdeploy-linux-x64-0.0.3/) +set(FASTDEPLOY_INSTALL_DIR /fastdeploy/CustomOp/FastDeploy/build1/fastdeploy-linux-x64-gpu-0.3.0) include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) diff --git a/model_zoo/vision/yolox/yolox.py b/model_zoo/vision/yolox/yolox.py index 8fd1a8a021..b63675049b 100644 --- a/model_zoo/vision/yolox/yolox.py +++ b/model_zoo/vision/yolox/yolox.py @@ -1,5 +1,5 @@ import fastdeploy as fd -import cv2 +import cv2 # 下载模型和测试图片 model_url = "https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s.onnx" @@ -20,4 +20,3 @@ # 输出预测结果 print(result) -print(model.runtime_option) diff --git a/setup.py b/setup.py index 19c47ed9cc..06e9b12be6 100644 --- a/setup.py +++ b/setup.py @@ -126,7 +126,7 @@ def finalize_options(self): pass -def GetAllFiles(dirname): +def get_all_files(dirname): files = list() for root, dirs, filenames in os.walk(dirname): for f in filenames: @@ -353,23 +353,22 @@ def run(self): if os.path.exists("fastdeploy/libs/third_libs"): shutil.rmtree("fastdeploy/libs/third_libs") - shutil.copytree( - ".setuptools-cmake-build/third_libs/install", - "fastdeploy/libs/third_libs", - symlinks=True) - - all_files = GetAllFiles("fastdeploy/libs") - for f in all_files: - package_data[PACKAGE_NAME].append(os.path.relpath(f, "fastdeploy")) +# shutil.copytree( +# ".setuptools-cmake-build/third_libs/install", +# "fastdeploy/libs/third_libs", +# symlinks=True) if platform.system().lower() == "linux": rpaths = ["${ORIGIN}"] - for root, dirs, files in os.walk("fastdeploy/libs/third_libs"): + for root, dirs, files in os.walk( + ".setuptools-cmake-build/third_libs/install"): for d in dirs: if d == "lib": path = os.path.relpath( - os.path.join(root, d), "fastdeploy/libs") - rpaths.append("${ORIGIN}/" + format(path)) + os.path.join(root, d), + ".setuptools-cmake-build/third_libs/install") + rpaths.append("${ORIGIN}/" + os.path.join( + "libs/third_libs", path)) rpaths = ":".join(rpaths) command = "patchelf --set-rpath '{}' ".format(rpaths) + os.path.join( "fastdeploy/libs", pybind_so_file) @@ -379,6 +378,12 @@ def run(self): command) == 0, "patchelf {} failed, the command: {}".format( command, pybind_so_file) + all_files = get_all_files("fastdeploy/libs") + for f in all_files: + if f.count("third_libs") > 0: + continue + package_data[PACKAGE_NAME].append(os.path.relpath(f, "fastdeploy")) + setuptools.setup( name=PACKAGE_NAME, version=VersionInfo.version, From b604c70dca8b6bb6384c44650c2c451a85e2a68e Mon Sep 17 00:00:00 2001 From: jiangjiajun Date: Mon, 25 Jul 2022 
02:07:05 +0000 Subject: [PATCH 6/9] fix core dump --- setup.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 06e9b12be6..0faa983721 100644 --- a/setup.py +++ b/setup.py @@ -353,10 +353,10 @@ def run(self): if os.path.exists("fastdeploy/libs/third_libs"): shutil.rmtree("fastdeploy/libs/third_libs") -# shutil.copytree( -# ".setuptools-cmake-build/third_libs/install", -# "fastdeploy/libs/third_libs", -# symlinks=True) + shutil.copytree( + ".setuptools-cmake-build/third_libs/install", + "fastdeploy/libs/third_libs", + symlinks=True) if platform.system().lower() == "linux": rpaths = ["${ORIGIN}"] @@ -380,8 +380,6 @@ def run(self): all_files = get_all_files("fastdeploy/libs") for f in all_files: - if f.count("third_libs") > 0: - continue package_data[PACKAGE_NAME].append(os.path.relpath(f, "fastdeploy")) setuptools.setup( From 445bd041cba0833f1158b5576bacc8a93a3e6e21 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 25 Jul 2022 11:06:10 +0800 Subject: [PATCH 7/9] Update setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 0faa983721..8509480c69 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,7 @@ setup_configs["ENABLE_PADDLE_FRONTEND"] = os.getenv("ENABLE_PADDLE_FRONTEND", "ON") setup_configs["ENABLE_ORT_BACKEND"] = os.getenv("ENABLE_ORT_BACKEND", "ON") +setup_configs["ENABLE_PADDLE_BACKEND"] = os.getenv("ENABLE_PADDLE_BACKEND", "ON") setup_configs["BUILD_DEMO"] = os.getenv("BUILD_DEMO", "ON") setup_configs["ENABLE_VISION"] = os.getenv("ENABLE_VISION", "ON") setup_configs["ENABLE_TRT_BACKEND"] = os.getenv("ENABLE_TRT_BACKEND", "OFF") From 5bf68c89095d8d3fa29bd5940a534fef1e7d59f5 Mon Sep 17 00:00:00 2001 From: Jason Date: Mon, 25 Jul 2022 11:07:02 +0800 Subject: [PATCH 8/9] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8509480c69..cb705376cd 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ setup_configs["ENABLE_PADDLE_FRONTEND"] = os.getenv("ENABLE_PADDLE_FRONTEND", "ON") setup_configs["ENABLE_ORT_BACKEND"] = os.getenv("ENABLE_ORT_BACKEND", "ON") -setup_configs["ENABLE_PADDLE_BACKEND"] = os.getenv("ENABLE_PADDLE_BACKEND", "ON") +setup_configs["ENABLE_PADDLE_BACKEND"] = os.getenv("ENABLE_PADDLE_BACKEND", "OFF") setup_configs["BUILD_DEMO"] = os.getenv("BUILD_DEMO", "ON") setup_configs["ENABLE_VISION"] = os.getenv("ENABLE_VISION", "ON") setup_configs["ENABLE_TRT_BACKEND"] = os.getenv("ENABLE_TRT_BACKEND", "OFF") From 36fc77e6b8ac8f21b99698cb4ab1dc51c8dc5266 Mon Sep 17 00:00:00 2001 From: ziqi-jin <67993288+ziqi-jin@users.noreply.github.com> Date: Mon, 25 Jul 2022 13:38:23 +0800 Subject: [PATCH 9/9] Add model ScaledYOLOv4 Support (#34) * first commit for yolov7 * pybind for yolov7 * CPP README.md * CPP README.md * modified yolov7.cc * README.md * python file modify * delete license in fastdeploy/ * repush the conflict part * README.md modified * README.md modified * file path modified * file path modified * file path modified * file path modified * file path modified * README modified * README modified * move some helpers to private * add examples for yolov7 * api.md modified * api.md modified * api.md modified * YOLOv7 * yolov7 release link * yolov7 release link * yolov7 release link * copyright * change some helpers to private * change variables to const and fix documents. 
* gitignore * Transfer some funtions to private member of class * Transfer some funtions to private member of class * Merge from develop (#9) * Fix compile problem in different python version (#26) * fix some usage problem in linux * Fix compile problem Co-authored-by: root * Add PaddleDetetion/PPYOLOE model support (#22) * add ppdet/ppyoloe * Add demo code and documents * add convert processor to vision (#27) * update .gitignore * Added checking for cmake include dir * fixed missing trt_backend option bug when init from trt * remove un-need data layout and add pre-check for dtype * changed RGB2BRG to BGR2RGB in ppcls model * add model_zoo yolov6 c++/python demo * fixed CMakeLists.txt typos * update yolov6 cpp/README.md * add yolox c++/pybind and model_zoo demo * move some helpers to private * fixed CMakeLists.txt typos * add normalize with alpha and beta * add version notes for yolov5/yolov6/yolox * add copyright to yolov5.cc * revert normalize * fixed some bugs in yolox * fixed examples/CMakeLists.txt to avoid conflicts * add convert processor to vision * format examples/CMakeLists summary * Fix bug while the inference result is empty with YOLOv5 (#29) * Add multi-label function for yolov5 * Update README.md Update doc * Update fastdeploy_runtime.cc fix variable option.trt_max_shape wrong name * Update runtime_option.md Update resnet model dynamic shape setting name from images to x * Fix bug when inference result boxes are empty * Delete detection.py Co-authored-by: Jason Co-authored-by: root Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com> Co-authored-by: huangjianhui <852142024@qq.com> * first commit for yolor * for merge * Develop (#11) * Fix compile problem in different python version (#26) * fix some usage problem in linux * Fix compile problem Co-authored-by: root * Add PaddleDetetion/PPYOLOE model support (#22) * add ppdet/ppyoloe * Add demo code and documents * add convert processor to vision (#27) * update .gitignore * Added checking for cmake include dir * fixed missing trt_backend option bug when init from trt * remove un-need data layout and add pre-check for dtype * changed RGB2BRG to BGR2RGB in ppcls model * add model_zoo yolov6 c++/python demo * fixed CMakeLists.txt typos * update yolov6 cpp/README.md * add yolox c++/pybind and model_zoo demo * move some helpers to private * fixed CMakeLists.txt typos * add normalize with alpha and beta * add version notes for yolov5/yolov6/yolox * add copyright to yolov5.cc * revert normalize * fixed some bugs in yolox * fixed examples/CMakeLists.txt to avoid conflicts * add convert processor to vision * format examples/CMakeLists summary * Fix bug while the inference result is empty with YOLOv5 (#29) * Add multi-label function for yolov5 * Update README.md Update doc * Update fastdeploy_runtime.cc fix variable option.trt_max_shape wrong name * Update runtime_option.md Update resnet model dynamic shape setting name from images to x * Fix bug when inference result boxes are empty * Delete detection.py Co-authored-by: Jason Co-authored-by: root Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com> Co-authored-by: huangjianhui <852142024@qq.com> * Yolor (#16) * Develop (#11) (#12) * Fix compile problem in different python version (#26) * fix some usage problem in linux * Fix compile problem Co-authored-by: root * Add PaddleDetetion/PPYOLOE model support (#22) * add ppdet/ppyoloe * Add demo code and documents * add convert processor to vision (#27) * update .gitignore * Added checking for cmake include 
dir * fixed missing trt_backend option bug when init from trt * remove un-need data layout and add pre-check for dtype * changed RGB2BRG to BGR2RGB in ppcls model * add model_zoo yolov6 c++/python demo * fixed CMakeLists.txt typos * update yolov6 cpp/README.md * add yolox c++/pybind and model_zoo demo * move some helpers to private * fixed CMakeLists.txt typos * add normalize with alpha and beta * add version notes for yolov5/yolov6/yolox * add copyright to yolov5.cc * revert normalize * fixed some bugs in yolox * fixed examples/CMakeLists.txt to avoid conflicts * add convert processor to vision * format examples/CMakeLists summary * Fix bug while the inference result is empty with YOLOv5 (#29) * Add multi-label function for yolov5 * Update README.md Update doc * Update fastdeploy_runtime.cc fix variable option.trt_max_shape wrong name * Update runtime_option.md Update resnet model dynamic shape setting name from images to x * Fix bug when inference result boxes are empty * Delete detection.py Co-authored-by: Jason Co-authored-by: root Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com> Co-authored-by: huangjianhui <852142024@qq.com> Co-authored-by: Jason Co-authored-by: root Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com> Co-authored-by: huangjianhui <852142024@qq.com> * Develop (#13) * Fix compile problem in different python version (#26) * fix some usage problem in linux * Fix compile problem Co-authored-by: root * Add PaddleDetetion/PPYOLOE model support (#22) * add ppdet/ppyoloe * Add demo code and documents * add convert processor to vision (#27) * update .gitignore * Added checking for cmake include dir * fixed missing trt_backend option bug when init from trt * remove un-need data layout and add pre-check for dtype * changed RGB2BRG to BGR2RGB in ppcls model * add model_zoo yolov6 c++/python demo * fixed CMakeLists.txt typos * update yolov6 cpp/README.md * add yolox c++/pybind and model_zoo demo * move some helpers to private * fixed CMakeLists.txt typos * add normalize with alpha and beta * add version notes for yolov5/yolov6/yolox * add copyright to yolov5.cc * revert normalize * fixed some bugs in yolox * fixed examples/CMakeLists.txt to avoid conflicts * add convert processor to vision * format examples/CMakeLists summary * Fix bug while the inference result is empty with YOLOv5 (#29) * Add multi-label function for yolov5 * Update README.md Update doc * Update fastdeploy_runtime.cc fix variable option.trt_max_shape wrong name * Update runtime_option.md Update resnet model dynamic shape setting name from images to x * Fix bug when inference result boxes are empty * Delete detection.py Co-authored-by: Jason Co-authored-by: root Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com> Co-authored-by: huangjianhui <852142024@qq.com> * documents * documents * documents * documents * documents * documents * documents * documents * documents * documents * documents * documents * Develop (#14) * Fix compile problem in different python version (#26) * fix some usage problem in linux * Fix compile problem Co-authored-by: root * Add PaddleDetetion/PPYOLOE model support (#22) * add ppdet/ppyoloe * Add demo code and documents * add convert processor to vision (#27) * update .gitignore * Added checking for cmake include dir * fixed missing trt_backend option bug when init from trt * remove un-need data layout and add pre-check for dtype * changed RGB2BRG to BGR2RGB in ppcls model * add model_zoo yolov6 c++/python demo * fixed 
CMakeLists.txt typos * update yolov6 cpp/README.md * add yolox c++/pybind and model_zoo demo * move some helpers to private * fixed CMakeLists.txt typos * add normalize with alpha and beta * add version notes for yolov5/yolov6/yolox * add copyright to yolov5.cc * revert normalize * fixed some bugs in yolox * fixed examples/CMakeLists.txt to avoid conflicts * add convert processor to vision * format examples/CMakeLists summary * Fix bug while the inference result is empty with YOLOv5 (#29) * Add multi-label function for yolov5 * Update README.md Update doc * Update fastdeploy_runtime.cc fix variable option.trt_max_shape wrong name * Update runtime_option.md Update resnet model dynamic shape setting name from images to x * Fix bug when inference result boxes are empty * Delete detection.py Co-authored-by: root Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com> Co-authored-by: huangjianhui <852142024@qq.com> Co-authored-by: Jason Co-authored-by: root Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com> Co-authored-by: huangjianhui <852142024@qq.com> Co-authored-by: Jason <928090362@qq.com> * first commit for scaled_yolov4 * commit for documents * change py name * accelerate the normalize * code fixed by the commets above Co-authored-by: Jason Co-authored-by: root Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com> Co-authored-by: huangjianhui <852142024@qq.com> Co-authored-by: Jason <928090362@qq.com> --- examples/vision/wongkinyiu_scaledyolov4.cc | 52 ++++ fastdeploy/vision.h | 1 + fastdeploy/vision/wongkinyiu/__init__.py | 98 +++++++ fastdeploy/vision/wongkinyiu/scaledyolov4.cc | 254 ++++++++++++++++++ fastdeploy/vision/wongkinyiu/scaledyolov4.h | 95 +++++++ .../vision/wongkinyiu/wongkinyiu_pybind.cc | 19 ++ fastdeploy/vision/wongkinyiu/yolor.cc | 33 ++- fastdeploy/vision/wongkinyiu/yolov7.cc | 33 ++- model_zoo/vision/scaledyolov4/README.md | 66 +++++ model_zoo/vision/scaledyolov4/api.md | 71 +++++ .../vision/scaledyolov4/cpp/CMakeLists.txt | 17 ++ model_zoo/vision/scaledyolov4/cpp/README.md | 53 ++++ .../vision/scaledyolov4/cpp/scaledyolov4.cc | 40 +++ .../vision/scaledyolov4/scaled_yolov4.py | 21 ++ 14 files changed, 831 insertions(+), 22 deletions(-) create mode 100644 examples/vision/wongkinyiu_scaledyolov4.cc create mode 100644 fastdeploy/vision/wongkinyiu/scaledyolov4.cc create mode 100644 fastdeploy/vision/wongkinyiu/scaledyolov4.h create mode 100644 model_zoo/vision/scaledyolov4/README.md create mode 100644 model_zoo/vision/scaledyolov4/api.md create mode 100644 model_zoo/vision/scaledyolov4/cpp/CMakeLists.txt create mode 100644 model_zoo/vision/scaledyolov4/cpp/README.md create mode 100644 model_zoo/vision/scaledyolov4/cpp/scaledyolov4.cc create mode 100644 model_zoo/vision/scaledyolov4/scaled_yolov4.py diff --git a/examples/vision/wongkinyiu_scaledyolov4.cc b/examples/vision/wongkinyiu_scaledyolov4.cc new file mode 100644 index 0000000000..5374d34536 --- /dev/null +++ b/examples/vision/wongkinyiu_scaledyolov4.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" + +int main() { + namespace vis = fastdeploy::vision; + + std::string model_file = "../resources/models/scaledyolov4.onnx"; + std::string img_path = "../resources/images/bus.jpg"; + std::string vis_path = "../resources/outputs/wongkinyiu_scaledyolov4_vis_result.jpg"; + + auto model = vis::wongkinyiu::ScaledYOLOv4(model_file); + if (!model.Initialized()) { + std::cerr << "Init Failed! Model: " << model_file << std::endl; + return -1; + } else { + std::cout << "Init Done! Model:" << model_file << std::endl; + } + model.EnableDebug(); + + cv::Mat im = cv::imread(img_path); + cv::Mat vis_im = im.clone(); + + vis::DetectionResult res; + if (!model.Predict(&im, &res)) { + std::cerr << "Prediction Failed." << std::endl; + return -1; + } else { + std::cout << "Prediction Done!" << std::endl; + } + + // 输出预测框结果 + std::cout << res.Str() << std::endl; + + // 可视化预测结果 + vis::Visualize::VisDetection(&vis_im, res); + cv::imwrite(vis_path, vis_im); + std::cout << "Detect Done! Saved: " << vis_path << std::endl; + return 0; +} diff --git a/fastdeploy/vision.h b/fastdeploy/vision.h index b7836ca466..1281df2af6 100644 --- a/fastdeploy/vision.h +++ b/fastdeploy/vision.h @@ -24,6 +24,7 @@ #include "fastdeploy/vision/ultralytics/yolov5.h" #include "fastdeploy/vision/wongkinyiu/yolor.h" #include "fastdeploy/vision/wongkinyiu/yolov7.h" +#include "fastdeploy/vision/wongkinyiu/scaledyolov4.h" #endif #include "fastdeploy/vision/visualize/visualize.h" diff --git a/fastdeploy/vision/wongkinyiu/__init__.py b/fastdeploy/vision/wongkinyiu/__init__.py index 3c77e85896..c1a3d37f7f 100644 --- a/fastdeploy/vision/wongkinyiu/__init__.py +++ b/fastdeploy/vision/wongkinyiu/__init__.py @@ -212,3 +212,101 @@ def max_wh(self, value): assert isinstance( value, float), "The value to set `max_wh` must be type of float." self._model.max_wh = value + + +class ScaledYOLOv4(FastDeployModel): + def __init__(self, + model_file, + params_file="", + runtime_option=None, + model_format=Frontend.ONNX): + # 调用基函数进行backend_option的初始化 + # 初始化后的option保存在self._runtime_option + super(ScaledYOLOv4, self).__init__(runtime_option) + + self._model = C.vision.wongkinyiu.ScaledYOLOv4( + model_file, params_file, self._runtime_option, model_format) + # 通过self.initialized判断整个模型的初始化是否成功 + assert self.initialized, "ScaledYOLOv4 initialize failed." 
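[Editor's note] The attribute wrappers defined just below expose the C++ preprocessing members to Python. A minimal sketch of how they might be tuned before prediction (an editorial illustration, not part of this patch; the model path is a placeholder):

```
import fastdeploy as fd

# Hypothetical local ONNX file; see the README added later in this patch.
model = fd.vision.wongkinyiu.ScaledYOLOv4("scaledyolov4.onnx")

model.size = [1280, 1280]                    # letterbox target [width, height], if the model supports it
model.padding_value = [114.0, 114.0, 114.0]  # per-channel letterbox fill value
model.is_scale_up = False                    # False: images are only ever shrunk, never enlarged
model.stride = 32                            # alignment used when is_mini_pad is True
```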
+
+    def predict(self, input_image, conf_threshold=0.25, nms_iou_threshold=0.5):
+        return self._model.predict(input_image, conf_threshold,
+                                   nms_iou_threshold)
+
+    # Wrappers around the ScaledYOLOv4 model attributes
+    # Most are preprocessing related; e.g. model.size = [1280, 1280] changes
+    # the resize target during preprocessing (provided the model supports it)
+    @property
+    def size(self):
+        return self._model.size
+
+    @property
+    def padding_value(self):
+        return self._model.padding_value
+
+    @property
+    def is_no_pad(self):
+        return self._model.is_no_pad
+
+    @property
+    def is_mini_pad(self):
+        return self._model.is_mini_pad
+
+    @property
+    def is_scale_up(self):
+        return self._model.is_scale_up
+
+    @property
+    def stride(self):
+        return self._model.stride
+
+    @property
+    def max_wh(self):
+        return self._model.max_wh
+
+    @size.setter
+    def size(self, wh):
+        assert isinstance(wh, (list, tuple)),\
+            "The value to set `size` must be type of tuple or list."
+        assert len(wh) == 2,\
+            "The value to set `size` must contain 2 elements meaning [width, height], but now it contains {} elements.".format(
+                len(wh))
+        self._model.size = wh
+
+    @padding_value.setter
+    def padding_value(self, value):
+        assert isinstance(
+            value,
+            list), "The value to set `padding_value` must be type of list."
+        self._model.padding_value = value
+
+    @is_no_pad.setter
+    def is_no_pad(self, value):
+        assert isinstance(
+            value, bool), "The value to set `is_no_pad` must be type of bool."
+        self._model.is_no_pad = value
+
+    @is_mini_pad.setter
+    def is_mini_pad(self, value):
+        assert isinstance(
+            value,
+            bool), "The value to set `is_mini_pad` must be type of bool."
+        self._model.is_mini_pad = value
+
+    @is_scale_up.setter
+    def is_scale_up(self, value):
+        assert isinstance(
+            value,
+            bool), "The value to set `is_scale_up` must be type of bool."
+        self._model.is_scale_up = value
+
+    @stride.setter
+    def stride(self, value):
+        assert isinstance(
+            value, int), "The value to set `stride` must be type of int."
+        self._model.stride = value
+
+    @max_wh.setter
+    def max_wh(self, value):
+        assert isinstance(
+            value, float), "The value to set `max_wh` must be type of float."
+        self._model.max_wh = value
diff --git a/fastdeploy/vision/wongkinyiu/scaledyolov4.cc b/fastdeploy/vision/wongkinyiu/scaledyolov4.cc
new file mode 100644
index 0000000000..240e3b7ba2
--- /dev/null
+++ b/fastdeploy/vision/wongkinyiu/scaledyolov4.cc
@@ -0,0 +1,254 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
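[Editor's note] The scaledyolov4.cc body that follows implements the model's letterbox preprocessing: an aspect-preserving resize followed by padding up to the target size. As a compact reference for the C++ `LetterBox` below, a hedged NumPy/OpenCV sketch of the same idea (a hypothetical helper, default pad color 114 as in this patch; the `_auto`/mini-pad and `scale_fill` branches of the C++ version are omitted for brevity):

```
import cv2
import numpy as np

def letterbox(im: np.ndarray, size=(640, 640), color=(114, 114, 114)):
    """Resize keeping aspect ratio, then pad to `size` (width, height)."""
    h, w = im.shape[:2]
    scale = min(size[1] / h, size[0] / w)        # never distort the aspect ratio
    nh, nw = int(round(h * scale)), int(round(w * scale))
    if (nh, nw) != (h, w):
        im = cv2.resize(im, (nw, nh))
    pad_w, pad_h = size[0] - nw, size[1] - nh
    top, bottom = pad_h // 2, pad_h - pad_h // 2  # split padding evenly
    left, right = pad_w // 2, pad_w - pad_w // 2
    return cv2.copyMakeBorder(im, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, value=color)
```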
+ +#include "fastdeploy/vision/wongkinyiu/scaledyolov4.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace wongkinyiu { + +void ScaledYOLOv4::LetterBox(Mat* mat, const std::vector& size, + const std::vector& color, bool _auto, + bool scale_fill, bool scale_up, int stride) { + float scale = + std::min(size[1] * 1.0 / mat->Height(), size[0] * 1.0 / mat->Width()); + if (!scale_up) { + scale = std::min(scale, 1.0f); + } + + int resize_h = int(round(mat->Height() * scale)); + int resize_w = int(round(mat->Width() * scale)); + + int pad_w = size[0] - resize_w; + int pad_h = size[1] - resize_h; + if (_auto) { + pad_h = pad_h % stride; + pad_w = pad_w % stride; + } else if (scale_fill) { + pad_h = 0; + pad_w = 0; + resize_h = size[1]; + resize_w = size[0]; + } + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } + if (pad_h > 0 || pad_w > 0) { + float half_h = pad_h * 1.0 / 2; + int top = int(round(half_h - 0.1)); + int bottom = int(round(half_h + 0.1)); + float half_w = pad_w * 1.0 / 2; + int left = int(round(half_w - 0.1)); + int right = int(round(half_w + 0.1)); + Pad::Run(mat, top, bottom, left, right, color); + } +} + +ScaledYOLOv4::ScaledYOLOv4(const std::string& model_file, const std::string& params_file, + const RuntimeOption& custom_option, const Frontend& model_format) { + if (model_format == Frontend::ONNX) { + valid_cpu_backends = {Backend::ORT}; // 指定可用的CPU后端 + valid_gpu_backends = {Backend::ORT, Backend::TRT}; // 指定可用的GPU后端 + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool ScaledYOLOv4::Initialize() { + // parameters for preprocess + size = {640, 640}; + padding_value = {114.0, 114.0, 114.0}; + is_mini_pad = false; + is_no_pad = false; + is_scale_up = false; + stride = 32; + max_wh = 7680.0; + + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + return true; +} + +bool ScaledYOLOv4::Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info) { + // process after image load + float ratio = std::min(size[1] * 1.0f / static_cast(mat->Height()), + size[0] * 1.0f / static_cast(mat->Width())); + if (ratio != 1.0) { + int interp = cv::INTER_AREA; + if (ratio > 1.0) { + interp = cv::INTER_LINEAR; + } + int resize_h = int(mat->Height() * ratio); + int resize_w = int(mat->Width() * ratio); + Resize::Run(mat, resize_w, resize_h, -1, -1, interp); + } + // ScaledYOLOv4's preprocess steps + // 1. letterbox + // 2. BGR->RGB + // 3. 
HWC->CHW + ScaledYOLOv4::LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad, + is_scale_up, stride); + BGR2RGB::Run(mat); + // Normalize::Run(mat, std::vector(mat->Channels(), 0.0), + // std::vector(mat->Channels(), 1.0)); + // Compute `result = mat * alpha + beta` directly by channel + std::vector alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; + std::vector beta = {0.0f, 0.0f, 0.0f}; + Convert::Run(mat, alpha, beta); + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {static_cast(mat->Height()), + static_cast(mat->Width())}; + + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); // reshape to n, h, w, c + return true; +} + +bool ScaledYOLOv4::Postprocess( + FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold) { + FDASSERT(infer_result.shape[0] == 1, "Only support batch =1 now."); + result->Clear(); + result->Reserve(infer_result.shape[1]); + if (infer_result.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." << std::endl; + return false; + } + float* data = static_cast(infer_result.Data()); + for (size_t i = 0; i < infer_result.shape[1]; ++i) { + int s = i * infer_result.shape[2]; + float confidence = data[s + 4]; + float* max_class_score = + std::max_element(data + s + 5, data + s + infer_result.shape[2]); + confidence *= (*max_class_score); + // filter boxes by conf_threshold + if (confidence <= conf_threshold) { + continue; + } + int32_t label_id = std::distance(data + s + 5, max_class_score); + // convert from [x, y, w, h] to [x1, y1, x2, y2] + result->boxes.emplace_back(std::array{ + data[s] - data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] - data[s + 3] / 2.0f + label_id * max_wh, + data[s + 0] + data[s + 2] / 2.0f + label_id * max_wh, + data[s + 1] + data[s + 3] / 2.0f + label_id * max_wh}); + result->label_ids.push_back(label_id); + result->scores.push_back(confidence); + } + utils::NMS(result, nms_iou_threshold); + + // scale the boxes to the origin image shape + auto iter_out = im_info.find("output_shape"); + auto iter_ipt = im_info.find("input_shape"); + FDASSERT(iter_out != im_info.end() && iter_ipt != im_info.end(), + "Cannot find input_shape or output_shape from im_info."); + float out_h = iter_out->second[0]; + float out_w = iter_out->second[1]; + float ipt_h = iter_ipt->second[0]; + float ipt_w = iter_ipt->second[1]; + float scale = std::min(out_h / ipt_h, out_w / ipt_w); + float pad_h = (out_h - ipt_h * scale) / 2.0f; + float pad_w = (out_w - ipt_w * scale) / 2.0f; + if (is_mini_pad) { + // 和 LetterBox中_auto=true的处理逻辑对应 + pad_h = static_cast(static_cast(pad_h) % stride); + pad_w = static_cast(static_cast(pad_w) % stride); + } + for (size_t i = 0; i < result->boxes.size(); ++i) { + int32_t label_id = (result->label_ids)[i]; + // clip box + result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id; + result->boxes[i][1] = result->boxes[i][1] - max_wh * label_id; + result->boxes[i][2] = result->boxes[i][2] - max_wh * label_id; + result->boxes[i][3] = result->boxes[i][3] - max_wh * label_id; + result->boxes[i][0] = std::max((result->boxes[i][0] - pad_w) / scale, 0.0f); + result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f); + result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f); + result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f); + result->boxes[i][0] = 
std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); + } + return true; +} + +bool ScaledYOLOv4::Predict(cv::Mat* im, DetectionResult* result, float conf_threshold, + float nms_iou_threshold) { +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_START(0) +#endif + + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + im_info["output_shape"] = {static_cast(mat.Height()), + static_cast(mat.Width())}; + + if (!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(0, "Preprocess") + TIMERECORD_START(1) +#endif + + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." << std::endl; + return false; + } +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(1, "Inference") + TIMERECORD_START(2) +#endif + + if (!Postprocess(output_tensors[0], result, im_info, conf_threshold, + nms_iou_threshold)) { + FDERROR << "Failed to post process." << std::endl; + return false; + } + +#ifdef FASTDEPLOY_DEBUG + TIMERECORD_END(2, "Postprocess") +#endif + return true; +} + +} // namespace wongkinyiu +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/wongkinyiu/scaledyolov4.h b/fastdeploy/vision/wongkinyiu/scaledyolov4.h new file mode 100644 index 0000000000..c85fc8a04f --- /dev/null +++ b/fastdeploy/vision/wongkinyiu/scaledyolov4.h @@ -0,0 +1,95 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
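[Editor's note] Note the `label_id * max_wh` offset applied in `Postprocess` above: boxes of different classes are shifted into disjoint coordinate ranges so that a single class-agnostic NMS pass never suppresses across classes, and the offset is subtracted again afterwards. A hedged NumPy sketch of that trick (an editorial illustration, independent of this patch's `utils::NMS`; boxes are xyxy, labels numeric):

```
import numpy as np

def class_offset_nms(boxes, scores, labels, iou_thr=0.5, max_wh=7680.0):
    """Class-aware NMS via per-class coordinate offsets."""
    shifted = boxes + labels[:, None] * max_wh  # push each class into its own region
    keep = []
    order = scores.argsort()[::-1]              # highest score first
    while order.size:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(shifted[i, 0], shifted[order[1:], 0])
        yy1 = np.maximum(shifted[i, 1], shifted[order[1:], 1])
        xx2 = np.minimum(shifted[i, 2], shifted[order[1:], 2])
        yy2 = np.minimum(shifted[i, 3], shifted[order[1:], 3])
        inter = np.clip(xx2 - xx1, 0, None) * np.clip(yy2 - yy1, 0, None)
        area_i = (shifted[i, 2] - shifted[i, 0]) * (shifted[i, 3] - shifted[i, 1])
        areas = (shifted[order[1:], 2] - shifted[order[1:], 0]) * \
                (shifted[order[1:], 3] - shifted[order[1:], 1])
        iou = inter / (area_i + areas - inter)
        order = order[1:][iou <= iou_thr]       # drop overlapping same-class boxes
    return keep
```

Subtracting `label_id * max_wh` afterwards, as the clipping loop above does, restores the true coordinates.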
+ +#pragma once +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/common/processors/transform.h" +#include "fastdeploy/vision/common/result.h" + +namespace fastdeploy { +namespace vision { +namespace wongkinyiu { + +class FASTDEPLOY_DECL ScaledYOLOv4 : public FastDeployModel { + public: + // 当model_format为ONNX时,无需指定params_file + // 当model_format为Paddle时,则需同时指定model_file & params_file + ScaledYOLOv4(const std::string& model_file, const std::string& params_file = "", + const RuntimeOption& custom_option = RuntimeOption(), + const Frontend& model_format = Frontend::ONNX); + + // 定义模型的名称 + virtual std::string ModelName() const { return "WongKinYiu/ScaledYOLOv4"; } + + // 模型预测接口,即用户调用的接口 + // im 为用户的输入数据,目前对于CV均定义为cv::Mat + // result 为模型预测的输出结构体 + // conf_threshold 为后处理的参数 + // nms_iou_threshold 为后处理的参数 + virtual bool Predict(cv::Mat* im, DetectionResult* result, + float conf_threshold = 0.25, + float nms_iou_threshold = 0.5); + + // 以下为模型在预测时的一些参数,基本是前后处理所需 + // 用户在创建模型后,可根据模型的要求,以及自己的需求 + // 对参数进行修改 + // tuple of (width, height) + std::vector size; + // padding value, size should be same with Channels + std::vector padding_value; + // only pad to the minimum rectange which height and width is times of stride + bool is_mini_pad; + // while is_mini_pad = false and is_no_pad = true, will resize the image to + // the set size + bool is_no_pad; + // if is_scale_up is false, the input image only can be zoom out, the maximum + // resize scale cannot exceed 1.0 + bool is_scale_up; + // padding stride, for is_mini_pad + int stride; + // for offseting the boxes by classes when using NMS + float max_wh; + + private: + // 初始化函数,包括初始化后端,以及其它模型推理需要涉及的操作 + bool Initialize(); + + // 输入图像预处理操作 + // Mat为FastDeploy定义的数据结构 + // FDTensor为预处理后的Tensor数据,传给后端进行推理 + // im_info为预处理过程保存的数据,在后处理中需要用到 + bool Preprocess(Mat* mat, FDTensor* outputs, + std::map>* im_info); + + // 后端推理结果后处理,输出给用户 + // infer_result 为后端推理后的输出Tensor + // result 为模型预测的结果 + // im_info 为预处理记录的信息,后处理用于还原box + // conf_threshold 后处理时过滤box的置信度阈值 + // nms_iou_threshold 后处理时NMS设定的iou阈值 + bool Postprocess(FDTensor& infer_result, DetectionResult* result, + const std::map>& im_info, + float conf_threshold, float nms_iou_threshold); + + // 对图片进行LetterBox处理 + // mat 为读取到的原图 + // size 为输入模型的图像尺寸 + void LetterBox(Mat* mat, const std::vector& size, + const std::vector& color, bool _auto, + bool scale_fill = false, bool scale_up = true, + int stride = 32); +}; +} // namespace wongkinyiu +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/wongkinyiu/wongkinyiu_pybind.cc b/fastdeploy/vision/wongkinyiu/wongkinyiu_pybind.cc index 6bde2a1841..c90ed3f112 100644 --- a/fastdeploy/vision/wongkinyiu/wongkinyiu_pybind.cc +++ b/fastdeploy/vision/wongkinyiu/wongkinyiu_pybind.cc @@ -56,5 +56,24 @@ void BindWongkinyiu(pybind11::module& m) { .def_readwrite("is_scale_up", &vision::wongkinyiu::YOLOR::is_scale_up) .def_readwrite("stride", &vision::wongkinyiu::YOLOR::stride) .def_readwrite("max_wh", &vision::wongkinyiu::YOLOR::max_wh); + + pybind11::class_( + wongkinyiu_module, "ScaledYOLOv4") + .def(pybind11::init()) + .def("predict", + [](vision::wongkinyiu::ScaledYOLOv4& self, pybind11::array& data, + float conf_threshold, float nms_iou_threshold) { + auto mat = PyArrayToCvMat(data); + vision::DetectionResult res; + self.Predict(&mat, &res, conf_threshold, nms_iou_threshold); + return res; + }) + .def_readwrite("size", &vision::wongkinyiu::ScaledYOLOv4::size) + .def_readwrite("padding_value", 
&vision::wongkinyiu::ScaledYOLOv4::padding_value) + .def_readwrite("is_mini_pad", &vision::wongkinyiu::ScaledYOLOv4::is_mini_pad) + .def_readwrite("is_no_pad", &vision::wongkinyiu::ScaledYOLOv4::is_no_pad) + .def_readwrite("is_scale_up", &vision::wongkinyiu::ScaledYOLOv4::is_scale_up) + .def_readwrite("stride", &vision::wongkinyiu::ScaledYOLOv4::stride) + .def_readwrite("max_wh", &vision::wongkinyiu::ScaledYOLOv4::max_wh); } } // namespace fastdeploy diff --git a/fastdeploy/vision/wongkinyiu/yolor.cc b/fastdeploy/vision/wongkinyiu/yolor.cc index 5cf9d6cb83..070ea72e60 100644 --- a/fastdeploy/vision/wongkinyiu/yolor.cc +++ b/fastdeploy/vision/wongkinyiu/yolor.cc @@ -43,7 +43,9 @@ void YOLOR::LetterBox(Mat* mat, const std::vector& size, resize_h = size[1]; resize_w = size[0]; } - Resize::Run(mat, resize_w, resize_h); + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } if (pad_h > 0 || pad_w > 0) { float half_h = pad_h * 1.0 / 2; int top = int(round(half_h - 0.1)); @@ -91,8 +93,8 @@ bool YOLOR::Initialize() { bool YOLOR::Preprocess(Mat* mat, FDTensor* output, std::map>* im_info) { // process after image load - double ratio = (size[0] * 1.0) / std::max(static_cast(mat->Height()), - static_cast(mat->Width())); + float ratio = std::min(size[1] * 1.0f / static_cast(mat->Height()), + size[0] * 1.0f / static_cast(mat->Width())); if (ratio != 1.0) { int interp = cv::INTER_AREA; if (ratio > 1.0) { @@ -109,8 +111,12 @@ bool YOLOR::Preprocess(Mat* mat, FDTensor* output, YOLOR::LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad, is_scale_up, stride); BGR2RGB::Run(mat); - Normalize::Run(mat, std::vector(mat->Channels(), 0.0), - std::vector(mat->Channels(), 1.0)); + // Normalize::Run(mat, std::vector(mat->Channels(), 0.0), + // std::vector(mat->Channels(), 1.0)); + // Compute `result = mat * alpha + beta` directly by channel + std::vector alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; + std::vector beta = {0.0f, 0.0f, 0.0f}; + Convert::Run(mat, alpha, beta); // Record output shape of preprocessed image (*im_info)["output_shape"] = {static_cast(mat->Height()), @@ -167,9 +173,14 @@ bool YOLOR::Postprocess( float ipt_h = iter_ipt->second[0]; float ipt_w = iter_ipt->second[1]; float scale = std::min(out_h / ipt_h, out_w / ipt_w); + float pad_h = (out_h - ipt_h * scale) / 2.0f; + float pad_w = (out_w - ipt_w * scale) / 2.0f; + if (is_mini_pad) { + // 和 LetterBox中_auto=true的处理逻辑对应 + pad_h = static_cast(static_cast(pad_h) % stride); + pad_w = static_cast(static_cast(pad_w) % stride); + } for (size_t i = 0; i < result->boxes.size(); ++i) { - float pad_h = (out_h - ipt_h * scale) / 2; - float pad_w = (out_w - ipt_w * scale) / 2; int32_t label_id = (result->label_ids)[i]; // clip box result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id; @@ -180,10 +191,10 @@ bool YOLOR::Postprocess( result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f); result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f); result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f); - result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w); - result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h); - result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w); - result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = 
std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); } return true; } diff --git a/fastdeploy/vision/wongkinyiu/yolov7.cc b/fastdeploy/vision/wongkinyiu/yolov7.cc index 532f552947..457f8800cf 100644 --- a/fastdeploy/vision/wongkinyiu/yolov7.cc +++ b/fastdeploy/vision/wongkinyiu/yolov7.cc @@ -43,7 +43,9 @@ void YOLOv7::LetterBox(Mat* mat, const std::vector& size, resize_h = size[1]; resize_w = size[0]; } - Resize::Run(mat, resize_w, resize_h); + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } if (pad_h > 0 || pad_w > 0) { float half_h = pad_h * 1.0 / 2; int top = int(round(half_h - 0.1)); @@ -92,8 +94,8 @@ bool YOLOv7::Initialize() { bool YOLOv7::Preprocess(Mat* mat, FDTensor* output, std::map>* im_info) { // process after image load - double ratio = (size[0] * 1.0) / std::max(static_cast(mat->Height()), - static_cast(mat->Width())); + float ratio = std::min(size[1] * 1.0f / static_cast(mat->Height()), + size[0] * 1.0f / static_cast(mat->Width())); if (ratio != 1.0) { int interp = cv::INTER_AREA; if (ratio > 1.0) { @@ -110,8 +112,12 @@ bool YOLOv7::Preprocess(Mat* mat, FDTensor* output, YOLOv7::LetterBox(mat, size, padding_value, is_mini_pad, is_no_pad, is_scale_up, stride); BGR2RGB::Run(mat); - Normalize::Run(mat, std::vector(mat->Channels(), 0.0), - std::vector(mat->Channels(), 1.0)); + // Normalize::Run(mat, std::vector(mat->Channels(), 0.0), + // std::vector(mat->Channels(), 1.0)); + // Compute `result = mat * alpha + beta` directly by channel + std::vector alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; + std::vector beta = {0.0f, 0.0f, 0.0f}; + Convert::Run(mat, alpha, beta); // Record output shape of preprocessed image (*im_info)["output_shape"] = {static_cast(mat->Height()), @@ -168,9 +174,14 @@ bool YOLOv7::Postprocess( float ipt_h = iter_ipt->second[0]; float ipt_w = iter_ipt->second[1]; float scale = std::min(out_h / ipt_h, out_w / ipt_w); + float pad_h = (out_h - ipt_h * scale) / 2.0f; + float pad_w = (out_w - ipt_w * scale) / 2.0f; + if (is_mini_pad) { + // 和 LetterBox中_auto=true的处理逻辑对应 + pad_h = static_cast(static_cast(pad_h) % stride); + pad_w = static_cast(static_cast(pad_w) % stride); + } for (size_t i = 0; i < result->boxes.size(); ++i) { - float pad_h = (out_h - ipt_h * scale) / 2; - float pad_w = (out_w - ipt_w * scale) / 2; int32_t label_id = (result->label_ids)[i]; // clip box result->boxes[i][0] = result->boxes[i][0] - max_wh * label_id; @@ -181,10 +192,10 @@ bool YOLOv7::Postprocess( result->boxes[i][1] = std::max((result->boxes[i][1] - pad_h) / scale, 0.0f); result->boxes[i][2] = std::max((result->boxes[i][2] - pad_w) / scale, 0.0f); result->boxes[i][3] = std::max((result->boxes[i][3] - pad_h) / scale, 0.0f); - result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w); - result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h); - result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w); - result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h); + result->boxes[i][0] = std::min(result->boxes[i][0], ipt_w - 1.0f); + result->boxes[i][1] = std::min(result->boxes[i][1], ipt_h - 1.0f); + result->boxes[i][2] = std::min(result->boxes[i][2], ipt_w - 1.0f); + result->boxes[i][3] = std::min(result->boxes[i][3], ipt_h - 1.0f); } return true; } diff --git a/model_zoo/vision/scaledyolov4/README.md b/model_zoo/vision/scaledyolov4/README.md new file mode 100644 index 0000000000..93d3bd6c15 --- /dev/null +++ b/model_zoo/vision/scaledyolov4/README.md @@ 
-0,0 +1,66 @@
+# Building the ScaledYOLOv4 example
+
+The currently supported model version is: [ScaledYOLOv4 branch yolov4-large](https://github.com/WongKinYiu/ScaledYOLOv4)
+
+This document describes how to run fast deployment inference for [ScaledYOLOv4](https://github.com/WongKinYiu/ScaledYOLOv4). The directory is organized as follows
+
+```
+.
+├── cpp
+│   ├── CMakeLists.txt
+│   ├── README.md
+│   └── scaledyolov4.cc
+├── README.md
+└── scaled_yolov4.py
+```
+
+## Getting the ONNX file
+
+- Manual export
+
+  Visit the official [ScaledYOLOv4](https://github.com/WongKinYiu/ScaledYOLOv4) GitHub repository, follow its instructions to download and install it, download the `scaledyolov4.pt` model, and use `models/export.py` to obtain an `onnx`-format file. If the exported `onnx` model has problems, refer to the workaround in [ScaledYOLOv4#401](https://github.com/WongKinYiu/ScaledYOLOv4/issues/401)
+
+  ```
+  # Download the ScaledYOLOv4 model file from Google Drive:
+  # https://drive.google.com/file/d/1aXZZE999sHMP1gev60XhNChtHPRMH3Fz/view?usp=sharing
+
+  # Export the onnx-format file
+  python models/export.py --weights PATH/TO/scaledyolov4-xx.pt --img-size 640
+
+  # Move the onnx file to the demo directory
+  cp PATH/TO/scaledyolov4.onnx PATH/TO/model_zoo/vision/scaledyolov4/
+  ```
+
+## Installing FastDeploy
+
+Install FastDeploy with the following commands. Note that this installs `vision-cpu`; install `vision-gpu` instead if needed
+
+```
+# Install the fastdeploy-python tool
+pip install fastdeploy-python
+
+# Install the vision-cpu module
+fastdeploy install vision-cpu
+```
+## Python deployment
+
+Running the following code automatically downloads the test image
+```
+python scaled_yolov4.py
+```
+
+Once it finishes, the visualized result is saved locally as `vis_result.jpg` and the detection results are printed as follows
+```
+DetectionResult: [xmin, ymin, xmax, ymax, score, label_id]
+665.666321,390.477173, 810.000000, 879.829346, 0.940627, 0
+48.266064,396.217163, 247.338425, 901.974915, 0.922277, 0
+221.351868,408.446259, 345.524017, 857.927917, 0.910516, 0
+14.989746,228.662842, 801.292236, 735.677490, 0.820487, 5
+0.000000,548.260864, 75.825439, 873.932495, 0.718777, 0
+134.789062,473.950195, 148.526367, 506.777344, 0.513963, 27
+```
+
+## Other documents
+
+- [C++ deployment](./cpp/README.md)
+- [ScaledYOLOv4 API documentation](./api.md)
diff --git a/model_zoo/vision/scaledyolov4/api.md b/model_zoo/vision/scaledyolov4/api.md
new file mode 100644
index 0000000000..e23559229d
--- /dev/null
+++ b/model_zoo/vision/scaledyolov4/api.md
@@ -0,0 +1,71 @@
+# ScaledYOLOv4 API notes
+
+## Python API
+
+### The ScaledYOLOv4 class
+```
+fastdeploy.vision.wongkinyiu.ScaledYOLOv4(model_file, params_file=None, runtime_option=None, model_format=fd.Frontend.ONNX)
+```
+Loads and initializes the ScaledYOLOv4 model. When model_format is `fd.Frontend.ONNX`, only model_file needs to be provided, e.g. `scaledyolov4.onnx`; when model_format is `fd.Frontend.PADDLE`, both model_file and params_file are required.
+
+**Parameters**
+
+> * **model_file**(str): path to the model file
+> * **params_file**(str): path to the parameters file
+> * **runtime_option**(RuntimeOption): backend inference configuration; None means the default configuration is used
+> * **model_format**(Frontend): model format
+
+#### The predict method
+> ```
+> ScaledYOLOv4.predict(image_data, conf_threshold=0.25, nms_iou_threshold=0.5)
+> ```
> Model prediction interface: takes an image and directly returns detection results.
+>
+> **Parameters**
+>
+> > * **image_data**(np.ndarray): input data; note it must be in HWC, BGR format
+> > * **conf_threshold**(float): confidence threshold for filtering detection boxes
+> > * **nms_iou_threshold**(float): iou threshold used during NMS
+
+See [scaled_yolov4.py](./scaled_yolov4.py) for example code
+
+
+## C++ API
+
+### The ScaledYOLOv4 class
+```
+fastdeploy::vision::wongkinyiu::ScaledYOLOv4(
+        const string& model_file,
+        const string& params_file = "",
+        const RuntimeOption& runtime_option = RuntimeOption(),
+        const Frontend& model_format = Frontend::ONNX)
+```
+Loads and initializes the ScaledYOLOv4 model. When model_format is `Frontend::ONNX`, only model_file needs to be provided, e.g. `scaledyolov4.onnx`; when model_format is `Frontend::PADDLE`, both model_file and params_file are required.
+
+**Parameters**
+
+> * **model_file**(str): path to the model file
+> * **params_file**(str): path to the parameters file
+> * **runtime_option**(RuntimeOption): backend inference configuration; the default value constructs the default configuration
+> * **model_format**(Frontend): model format
+
+#### The Predict method
+> ```
+> ScaledYOLOv4::Predict(cv::Mat* im, DetectionResult* result,
+>                       float conf_threshold = 0.25,
+>                       float nms_iou_threshold = 0.5)
+> ```
+> Model prediction interface: takes an image and directly returns detection results.
+>
+> **Parameters**
+>
+> > * **im**: input image; note it must be in HWC, BGR format
+> > * **result**: detection result, including detection boxes and the confidence of each box
+> > * **conf_threshold**: confidence threshold for filtering detection boxes
+> > * **nms_iou_threshold**: iou threshold used during NMS
+
+See [cpp/scaledyolov4.cc](cpp/scaledyolov4.cc) for example code
+
+## Other APIs
+
+- [RuntimeOption configuration for model deployment](../../../docs/api/runtime_option.md)
diff --git a/model_zoo/vision/scaledyolov4/cpp/CMakeLists.txt b/model_zoo/vision/scaledyolov4/cpp/CMakeLists.txt
new file mode 100644
index 0000000000..062f4fa5d7
--- /dev/null
+++ b/model_zoo/vision/scaledyolov4/cpp/CMakeLists.txt
@@ -0,0 +1,17 @@
+PROJECT(scaledyolov4_demo C CXX)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.16)
+
+# In environments with the old C++ ABI, compile compatibly with the line below
+# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
+
+# Path to the downloaded and extracted fastdeploy library
+set(FASTDEPLOY_INSTALL_DIR ${PROJECT_SOURCE_DIR}/fastdeploy-linux-x64-0.3.0/)
+
+include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
+
+# Add the FastDeploy header dependencies
+include_directories(${FASTDEPLOY_INCS})
+
+add_executable(scaledyolov4_demo ${PROJECT_SOURCE_DIR}/scaledyolov4.cc)
+# Link against the FastDeploy libraries
+target_link_libraries(scaledyolov4_demo ${FASTDEPLOY_LIBS})
diff --git a/model_zoo/vision/scaledyolov4/cpp/README.md b/model_zoo/vision/scaledyolov4/cpp/README.md
new file mode 100644
index 0000000000..7372cc8b92
--- /dev/null
+++ b/model_zoo/vision/scaledyolov4/cpp/README.md
@@ -0,0 +1,53 @@
+# Building the ScaledYOLOv4 example
+
+The currently supported model version is: [ScaledYOLOv4 branch yolov4-large](https://github.com/WongKinYiu/ScaledYOLOv4)
+## Getting the ONNX file
+
+- Manual export
+
+  Visit the official [ScaledYOLOv4](https://github.com/WongKinYiu/ScaledYOLOv4) GitHub repository, follow its instructions to download and install it, download the `scaledyolov4.pt` model, and use `models/export.py` to obtain an `onnx`-format file. If the exported `onnx` model has problems, refer to the workaround in [ScaledYOLOv4#401](https://github.com/WongKinYiu/ScaledYOLOv4/issues/401)
+
+  ```
+  # Download the ScaledYOLOv4 model file from Google Drive:
+  # https://drive.google.com/file/d/1aXZZE999sHMP1gev60XhNChtHPRMH3Fz/view?usp=sharing
+
+  # Export the onnx-format file
+  python models/export.py --weights PATH/TO/scaledyolov4-xx-xx-xx.pt --img-size 640
+
+  # Move the onnx file to the demo directory
+  cp PATH/TO/scaledyolov4.onnx PATH/TO/model_zoo/vision/scaledyolov4/
+  ```
+
+
+## Running the demo
+
+```
+# Download and extract the prediction library
+wget https://bj.bcebos.com/paddle2onnx/fastdeploy/fastdeploy-linux-x64-0.0.3.tgz
+tar xvf fastdeploy-linux-x64-0.0.3.tgz
+
+# Build the example code
+mkdir build && cd build
+cmake ..
+make -j
+
+# Move the onnx file to the demo directory
+cp PATH/TO/scaledyolov4.onnx PATH/TO/model_zoo/vision/scaledyolov4/cpp/build/
+
+# Download the test image
+wget https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg
+
+# Run
+./scaledyolov4_demo
+```
+
+After it finishes, the visualized result is saved locally as `vis_result.jpg` and the detection boxes are printed to the terminal, as shown below
+```
+DetectionResult: [xmin, ymin, xmax, ymax, score, label_id]
+665.666321,390.477173, 810.000000, 879.829346, 0.940627, 0
+48.266064,396.217163, 247.338425, 901.974915, 0.922277, 0
+221.351868,408.446259, 345.524017, 857.927917, 0.910516, 0
+14.989746,228.662842, 801.292236, 735.677490, 0.820487, 5
+0.000000,548.260864, 75.825439, 873.932495, 0.718777, 0
+134.789062,473.950195, 148.526367, 506.777344, 0.513963, 27
+```
diff --git a/model_zoo/vision/scaledyolov4/cpp/scaledyolov4.cc b/model_zoo/vision/scaledyolov4/cpp/scaledyolov4.cc
new file mode 100644
index 0000000000..13f9bc0c28
--- /dev/null
+++ b/model_zoo/vision/scaledyolov4/cpp/scaledyolov4.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision.h"
+
+int main() {
+  namespace vis = fastdeploy::vision;
+  auto model = vis::wongkinyiu::ScaledYOLOv4("scaledyolov4.onnx");
+  if (!model.Initialized()) {
+    std::cerr << "Init Failed." << std::endl;
+    return -1;
+  }
+  cv::Mat im = cv::imread("bus.jpg");
+  cv::Mat vis_im = im.clone();
+
+  vis::DetectionResult res;
+  if (!model.Predict(&im, &res)) {
+    std::cerr << "Prediction Failed." << std::endl;
+    return -1;
+  }
+
+  // Print the detection boxes
+  std::cout << res.Str() << std::endl;
+
+  // Visualize the prediction result
+  vis::Visualize::VisDetection(&vis_im, res);
+  cv::imwrite("vis_result.jpg", vis_im);
+  return 0;
+}
diff --git a/model_zoo/vision/scaledyolov4/scaled_yolov4.py b/model_zoo/vision/scaledyolov4/scaled_yolov4.py
new file mode 100644
index 0000000000..3bcf0fa58e
--- /dev/null
+++ b/model_zoo/vision/scaledyolov4/scaled_yolov4.py
@@ -0,0 +1,21 @@
+import fastdeploy as fd
+import cv2
+
+# Download the model and test image
+test_jpg_url = "https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg"
+fd.download(test_jpg_url, ".", show_progress=True)
+
+# Load the model
+model = fd.vision.wongkinyiu.ScaledYOLOv4("scaledyolov4.onnx")
+
+# Predict on the image
+im = cv2.imread("bus.jpg")
+result = model.predict(im, conf_threshold=0.25, nms_iou_threshold=0.5)
+
+# Visualize the result
+fd.vision.visualize.vis_detection(im, result)
+cv2.imwrite("vis_result.jpg", im)
+
+# Print the prediction result
+print(result)
+print(model.runtime_option)
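[Editor's note] Several commits in this series touch backend selection (the ENABLE_PADDLE_BACKEND toggle in setup.py, the removal of the explicit RuntimeOption from ppyoloe.cc). For readers who do want explicit control, a hedged sketch of what that configuration might look like in Python, assuming the RuntimeOption/Device/Backend names used by the C++ examples in this series are exposed at the top of the fastdeploy module:

```
import fastdeploy as fd

# Explicit runtime configuration instead of the library defaults.
option = fd.RuntimeOption()      # assumed binding; mirrors fastdeploy::RuntimeOption in C++
option.device = fd.Device.CPU    # run on CPU
option.backend = fd.Backend.ORT  # ONNXRuntime backend for an ONNX model

model = fd.vision.wongkinyiu.ScaledYOLOv4(
    "scaledyolov4.onnx", runtime_option=option)
```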