Merge pull request PaddlePaddle#165 from jiweibo/tuned_dynamic_shape
Add tuned_dynamic_shape demo
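The new demos exercise Paddle Inference's tuned dynamic shape workflow in two steps: a tuning run with --tune, where CollectShapeRangeInfo records the tensor shape ranges observed at runtime into shape_range_info.pbtxt, and an inference run with --tuned_dynamic_shape, where EnableTunedTensorRtDynamicShape hands those ranges to the TensorRT engine (with --allow_build_at_runtime permitting a rebuild when an unseen shape appears). Usage sketches follow each demo below.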
Showing 9 changed files with 1,651 additions and 0 deletions.
@@ -0,0 +1,149 @@
#include "paddle/include/paddle_inference_api.h"

#include <gflags/gflags.h>
#include <glog/logging.h>

#include <chrono>
#include <functional>
#include <iostream>
#include <numeric>
#include <string>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>

DEFINE_string(model_file, "", "Path of the inference model file.");
DEFINE_string(params_file, "", "Path of the inference params file.");
DEFINE_string(model_dir, "", "Directory of the inference model.");
DEFINE_int32(max_batch_size, 1, "max batch size");
DEFINE_bool(use_gpu, true, "use gpu.");
DEFINE_bool(use_trt, true, "use trt.");
DEFINE_string(trt_precision, "trt_fp32", "trt_fp32, trt_fp16, etc.");
DEFINE_bool(serialize, false, "serialize");
DEFINE_bool(tuned_dynamic_shape, false, "use tuned dynamic shape");
DEFINE_bool(tune, false, "tune to get shape range.");
DEFINE_bool(allow_build_at_runtime, true, "allow rebuild trt engine at runtime");

using Predictor = paddle_infer::Predictor;
using Config = paddle_infer::Config;

// File that stores the collected shape ranges.
const std::string shape_range_info = "shape_range_info.pbtxt";

paddle_infer::PrecisionType GetPrecisionType(const std::string& ptype) {
  if (ptype == "trt_fp32")
    return paddle_infer::PrecisionType::kFloat32;
  if (ptype == "trt_fp16")
    return paddle_infer::PrecisionType::kHalf;
  return paddle_infer::PrecisionType::kFloat32;
}

// Parse a delimiter-separated list of integers, e.g. "224:448" -> {224, 448}.
std::vector<int> GetInputShape(const std::string& s, const std::string& delimiter = ":") {
  std::vector<int> res;
  size_t start = 0;
  size_t end = s.find(delimiter);
  while (end != std::string::npos) {
    std::string val = s.substr(start, end - start);
    res.push_back(std::stoi(val));
    start = end + delimiter.length();
    end = s.find(delimiter, start);
  }
  if (!s.substr(start, end).empty())
    res.push_back(std::stoi(s.substr(start, end)));
  return res;
}

void PrepareConfig(Config *config) {
  if (FLAGS_model_dir != "") {
    config->SetModel(FLAGS_model_dir);
  } else {
    config->SetModel(FLAGS_model_file, FLAGS_params_file);
  }

  if (FLAGS_use_gpu) {
    config->EnableUseGpu(500, 0);
    if (FLAGS_use_trt) {
      config->EnableTensorRtEngine(1 << 30, FLAGS_max_batch_size, 3,
                                   GetPrecisionType(FLAGS_trt_precision),
                                   FLAGS_serialize, false);
      if (FLAGS_tuned_dynamic_shape) {
        // Load the previously collected shape ranges for the TRT dynamic shape engine.
        config->EnableTunedTensorRtDynamicShape(shape_range_info,
                                                FLAGS_allow_build_at_runtime);
      }
    }
  }

  if (FLAGS_tune) {
    // Record the shape range of every tensor into shape_range_info.pbtxt.
    config->CollectShapeRangeInfo(shape_range_info);
  }

  LOG(INFO) << config->Summary();
}

void SingleThreadRun(
    std::shared_ptr<Predictor> predictor,
    const std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<int64_t>>>& input_info,
    std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<float>>>* output_info,
    int thread_id) {
  auto in_names = predictor->GetInputNames();
  for (auto& name : in_names) {
    auto in_handle = predictor->GetInputHandle(name);
    in_handle->Reshape(input_info.at(name).first);
    in_handle->CopyFromCpu(input_info.at(name).second.data());
  }

  CHECK(predictor->Run());

  output_info->clear();
  auto out_names = predictor->GetOutputNames();
  for (auto& name : out_names) {
    auto out_handle = predictor->GetOutputHandle(name);
    std::vector<int> shape = out_handle->shape();
    int num = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
    std::vector<float> out_data;
    if (out_handle->type() == paddle_infer::DataType::FLOAT32) {
      std::vector<float> tmp_out_data(num);
      out_handle->CopyToCpu(tmp_out_data.data());
      out_data.insert(out_data.begin(), tmp_out_data.begin(), tmp_out_data.end());
    } else if (out_handle->type() == paddle_infer::DataType::INT32) {
      std::vector<int32_t> tmp_out_data(num);
      out_handle->CopyToCpu(tmp_out_data.data());
      out_data.insert(out_data.begin(), tmp_out_data.begin(), tmp_out_data.end());
    } else {
      LOG(FATAL) << "not supported type.";
    }
    output_info->insert(std::make_pair(name, std::make_pair(shape, out_data)));
  }
  VLOG(1) << thread_id << " run done.";
}

int main(int argc, char **argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);

  Config config;
  PrepareConfig(&config);

  auto predictor = paddle_infer::CreatePredictor(config);
  auto in_names = predictor->GetInputNames();
  auto out_name = predictor->GetOutputNames()[0];  // "save_infer_model/scale_0.tmp_1"

  std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<int64_t>>> input_infos;
  std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<float>>> output_infos;

  // Run every (batch_size, sequence_length) combination so the shape range is covered.
  std::vector<int32_t> features{32, 64, 128, 256};
  for (int b = 1; b <= FLAGS_max_batch_size; b++) {
    for (auto f : features) {
      std::vector<int> shape{b, f};
      int num = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
      std::vector<int64_t> in_data(num);
      for (int i = 0; i < num; ++i) {
        in_data[i] = i % 50006;
      }
      input_infos[in_names[0]] = std::make_pair(shape, in_data);

      std::vector<int64_t> token(num);
      for (int i = 0; i < num; ++i) {
        token[i] = i % 2;
      }
      input_infos[in_names[1]] = std::make_pair(shape, token);
      SingleThreadRun(predictor, input_infos, &output_infos, 0);
    }
  }

  LOG(INFO) << "Run done";
}
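A minimal way to exercise this demo, sketched below; the binary and model file names are placeholders (the actual target name depends on the DEMO_NAME passed to compile.sh), and only the flags defined above are assumed:

# Step 1: tuning run; CollectShapeRangeInfo writes shape_range_info.pbtxt (TensorRT off).
./build/demo --model_file=model/inference.pdmodel --params_file=model/inference.pdiparams \
    --use_trt=false --tune=true --max_batch_size=4
# Step 2: inference run with TensorRT, reusing the tuned dynamic shapes.
./build/demo --model_file=model/inference.pdmodel --params_file=model/inference.pdiparams \
    --tuned_dynamic_shape=true --max_batch_size=4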
@@ -0,0 +1,167 @@
#include "paddle/include/paddle_inference_api.h"

#include <gflags/gflags.h>
#include <glog/logging.h>

#include <chrono>
#include <functional>
#include <iostream>
#include <numeric>
#include <string>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>

DEFINE_string(model_file, "", "Path of the inference model file.");
DEFINE_string(params_file, "", "Path of the inference params file.");
DEFINE_string(model_dir, "", "Directory of the inference model.");
DEFINE_int32(max_batch_size, 1, "max batch size");
DEFINE_bool(use_gpu, true, "use gpu.");
DEFINE_bool(use_trt, true, "use trt.");
DEFINE_string(trt_precision, "trt_fp32", "trt_fp32, trt_fp16, etc.");
DEFINE_bool(serialize, false, "serialize");
DEFINE_bool(tuned_dynamic_shape, false, "use tuned dynamic shape");
DEFINE_bool(tune, false, "tune to get shape range.");
DEFINE_bool(allow_build_at_runtime, true, "allow rebuild trt engine at runtime");
DEFINE_string(hs, "224", "input heights, separated by ':'");
DEFINE_string(ws, "224", "input widths, separated by ':'");
DEFINE_string(no_seen_hs, "224", "unseen input heights, separated by ':'");
DEFINE_string(no_seen_ws, "224", "unseen input widths, separated by ':'");

using Predictor = paddle_infer::Predictor;
using Config = paddle_infer::Config;

// File that stores the collected shape ranges.
const std::string shape_range_info = "shape_range_info.pbtxt";

paddle_infer::PrecisionType GetPrecisionType(const std::string& ptype) {
  if (ptype == "trt_fp32")
    return paddle_infer::PrecisionType::kFloat32;
  if (ptype == "trt_fp16")
    return paddle_infer::PrecisionType::kHalf;
  return paddle_infer::PrecisionType::kFloat32;
}

// Parse a delimiter-separated list of integers, e.g. "224:448" -> {224, 448}.
std::vector<int> GetInputShape(const std::string& s, const std::string& delimiter = ":") {
  std::vector<int> res;
  size_t start = 0;
  size_t end = s.find(delimiter);
  while (end != std::string::npos) {
    std::string val = s.substr(start, end - start);
    res.push_back(std::stoi(val));
    start = end + delimiter.length();
    end = s.find(delimiter, start);
  }
  if (!s.substr(start, end).empty())
    res.push_back(std::stoi(s.substr(start, end)));
  return res;
}

void PrepareConfig(Config *config) {
  if (FLAGS_model_dir != "") {
    config->SetModel(FLAGS_model_dir);
  } else {
    config->SetModel(FLAGS_model_file, FLAGS_params_file);
  }

  if (FLAGS_use_gpu) {
    config->EnableUseGpu(500, 0);
    if (FLAGS_use_trt) {
      config->EnableTensorRtEngine(1 << 30, FLAGS_max_batch_size, 3,
                                   GetPrecisionType(FLAGS_trt_precision),
                                   FLAGS_serialize, false);
      if (FLAGS_tuned_dynamic_shape) {
        // config->Exp_DisableTensorRtOPs({"elementwise_add"});
        config->EnableTunedTensorRtDynamicShape(shape_range_info,
                                                FLAGS_allow_build_at_runtime);
      }
    }
  }

  if (FLAGS_tune) {
    config->CollectShapeRangeInfo(shape_range_info);
  }

  LOG(INFO) << config->Summary();
}

void SingleThreadRun(
    std::shared_ptr<Predictor> predictor,
    const std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<float>>>& input_info,
    std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<float>>>* output_info,
    int thread_id) {
  auto in_names = predictor->GetInputNames();
  for (auto& name : in_names) {
    auto in_handle = predictor->GetInputHandle(name);
    in_handle->Reshape(input_info.at(name).first);
    in_handle->CopyFromCpu(input_info.at(name).second.data());
  }

  CHECK(predictor->Run());

  output_info->clear();
  auto out_names = predictor->GetOutputNames();
  for (auto& name : out_names) {
    auto out_handle = predictor->GetOutputHandle(name);
    std::vector<int> shape = out_handle->shape();
    int num = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
    std::vector<float> out_data(num);
    out_handle->CopyToCpu(out_data.data());
    output_info->insert(std::make_pair(name, std::make_pair(shape, out_data)));
  }
  VLOG(1) << thread_id << " run done.";
}

int main(int argc, char **argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  std::vector<int> hs = GetInputShape(FLAGS_hs);
  std::vector<int> ws = GetInputShape(FLAGS_ws);
  CHECK_EQ(hs.size(), ws.size()) << "The number of input heights and widths should be the same";
  std::vector<int> no_seen_hs = GetInputShape(FLAGS_no_seen_hs);
  std::vector<int> no_seen_ws = GetInputShape(FLAGS_no_seen_ws);
  CHECK_EQ(no_seen_hs.size(), no_seen_ws.size()) << "The number of input heights and widths should be the same";

  Config config;
  PrepareConfig(&config);

  auto predictor = paddle_infer::CreatePredictor(config);
  auto in_name = predictor->GetInputNames()[0];    // "x"
  auto out_name = predictor->GetOutputNames()[0];  // "save_infer_model/scale_0.tmp_1"

  std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<float>>> input_infos;
  std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<float>>> output_infos;
  constexpr int channel = 3;

  // Run every (batch_size, height, width) combination that should be covered.
  for (int b = 1; b <= FLAGS_max_batch_size; b++) {
    for (size_t i = 0; i < hs.size(); ++i) {
      int h = hs[i];
      int w = ws[i];
      std::vector<int> shape{b, channel, h, w};
      int num = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
      std::vector<float> in_data(num);
      for (int j = 0; j < num; ++j) {
        in_data[j] = j % 255 * 0.13f;
      }
      input_infos[in_name] = std::make_pair(shape, in_data);
      SingleThreadRun(predictor, input_infos, &output_infos, 0);
      LOG(INFO) << "Run input shape{" << b << ", " << channel << ", " << h << ", " << w << "} done.";
    }
  }

  // If allow_build_at_runtime is enabled, feed shapes that were never tuned so
  // the TRT engine is rebuilt at runtime.
  if (!FLAGS_tune && FLAGS_allow_build_at_runtime) {
    LOG(INFO) << "Test unseen shapes and rebuild the TRT engine";
    int b = FLAGS_max_batch_size;
    for (size_t i = 0; i < no_seen_hs.size(); ++i) {
      int h = no_seen_hs[i];
      int w = no_seen_ws[i];
      std::vector<int> shape{b, channel, h, w};
      int num = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
      std::vector<float> in_data(num);
      for (int j = 0; j < num; ++j) {
        in_data[j] = j % 255 * 0.13f;
      }
      input_infos[in_name] = std::make_pair(shape, in_data);
      SingleThreadRun(predictor, input_infos, &output_infos, 0);
      LOG(INFO) << "Run input shape{" << b << ", " << channel << ", " << h << ", " << w << "} done.";
    }
  }

  LOG(INFO) << "Run done";
}
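A similar two-step sketch for this image demo, assuming it is built as clas (the compile script's default DEMO_NAME); the model paths are placeholders:

# Tuning run over the height/width pairs given by --hs/--ws.
./build/clas --model_file=resnet50/inference.pdmodel --params_file=resnet50/inference.pdiparams \
    --use_trt=false --tune=true --hs=224:448 --ws=224:448
# Inference run; --no_seen_hs/--no_seen_ws exercise the runtime-rebuild path at the end of main().
./build/clas --model_file=resnet50/inference.pdmodel --params_file=resnet50/inference.pdiparams \
    --tuned_dynamic_shape=true --hs=224:448 --ws=224:448 --no_seen_hs=320 --no_seen_ws=320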
@@ -0,0 +1,48 @@
#!/bin/bash
set +x
set -e

work_path=$(dirname "$(readlink -f "$0")")

# 1. check paddle_inference exists
if [ ! -d "${work_path}/../../lib/paddle_inference" ]; then
  echo "Please download the paddle_inference lib and move it to Paddle-Inference-Demo/lib"
  exit 1
fi

# 2. check CMakeLists exists
if [ ! -f "${work_path}/CMakeLists.txt" ]; then
  cp -a "${work_path}/../../lib/CMakeLists.txt" "${work_path}/"
fi

# 3. compile
mkdir -p build
cd build
rm -rf *

if [ -z "$1" ]; then
  DEMO_NAME=clas
else
  DEMO_NAME=$1
fi

WITH_MKL=ON
WITH_GPU=ON
USE_TENSORRT=ON

LIB_DIR=${work_path}/../../lib/paddle_inference
CUDNN_LIB=/usr/lib/x86_64-linux-gnu/
CUDA_LIB=/usr/local/cuda/lib64
TENSORRT_ROOT=/usr/local/TensorRT-7.2.3.4

cmake .. -DPADDLE_LIB=${LIB_DIR} \
  -DWITH_MKL=${WITH_MKL} \
  -DDEMO_NAME=${DEMO_NAME} \
  -DWITH_GPU=${WITH_GPU} \
  -DWITH_STATIC_LIB=OFF \
  -DUSE_TENSORRT=${USE_TENSORRT} \
  -DCUDNN_LIB=${CUDNN_LIB} \
  -DCUDA_LIB=${CUDA_LIB} \
  -DTENSORRT_ROOT=${TENSORRT_ROOT}

make -j
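A typical invocation, assuming the CUDA/cuDNN/TensorRT paths above match the local environment and that the target name exists in the copied CMakeLists.txt:

bash compile.sh           # builds the default target, DEMO_NAME=clas
bash compile.sh my_demo   # builds a specific target (hypothetical name)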