[Inference]Support TensorRT execute in PIR (#64995)

* adapt tensorrt * fix compile bugs * delete thirdparty * add unittest * fix py3 compile * fix kunlun200 * fix windows inference * fix windows bug * polish code * polish code * polish code * support build trt_op in python * rename construction params * fix bug * fix compile bugs * support collect shape * support re-collect shape * rename tensorrt op * polish code * add debug attr * delete member in tensorrt engine instruction * remove mutable_data * fix compile
PaddlePaddle · Jul 15, 2024 · 101bf6e · 101bf6e
1 parent 5bd7f4a
commit 101bf6e
Show file tree

Hide file tree

Showing 28 changed files with 4,212 additions and 3 deletions.
diff --git a/paddle/common/flags.cc b/paddle/common/flags.cc
@@ -1791,6 +1791,17 @@ PHI_DEFINE_EXPORTED_string(
     "",
     "Specify path for loading *.dll about cuda on windows");
 
+/**
+ * Collect the shapes of values for TensorRTEngine
+ * Name: enable_collect_shape
+ * Since Version: 3.0.0
+ * Value Range: bool, default=false
+ * Example: FLAGS_enable_collect_shape=true would turn shape collection on.
+ * Note: If true, the shapes of values are collected while the executor runs.
+ */
+PHI_DEFINE_EXPORTED_bool(enable_collect_shape,
+                         false,
+                         "Collect shapes of value for TensorRTEngine");
 // Example: FLAGS_accuracy_check_atol=1e-3 would set the atol to 1e-3.
 PHI_DEFINE_EXPORTED_double(accuracy_check_atol_fp32,
                            1e-6,

diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -14,6 +14,11 @@ if(NOT WITH_ONEDNN)
     ${CMAKE_CURRENT_SOURCE_DIR}/instruction/onednn/onednn_mixed_instruction.cc)
 endif()
 
+# Keep tensorrt_engine_instruction.cc in standalone_executor_srcs only when
+# TensorRT was found and WITH_TENSORRT is enabled; otherwise drop it.
+if(NOT TENSORRT_FOUND OR NOT WITH_TENSORRT)
+  list(REMOVE_ITEM standalone_executor_srcs
+       ${CMAKE_CURRENT_SOURCE_DIR}/instruction/tensorrt_engine_instruction.cc)
+endif()
+
 set(standalone_executor_deps
     pir
     program_translator
@@ -38,6 +43,10 @@ if(WITH_CINN)
       ${DEVICE_EVENT_LIBS})
 endif()
 
+# Add trt_engine to standalone_executor_deps when TensorRT was found and
+# WITH_TENSORRT is enabled.
+if(TENSORRT_FOUND AND WITH_TENSORRT)
+  set(standalone_executor_deps ${standalone_executor_deps} trt_engine)
+endif()
+
 if(WITH_CUSTOM_DEVICE)
   set(standalone_executor_deps ${standalone_executor_deps}
                                device_event_custom_device)

diff --git a/paddle/fluid/framework/new_executor/collect_shape_manager.cc b/paddle/fluid/framework/new_executor/collect_shape_manager.cc
@@ -0,0 +1,234 @@
+/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/new_executor/collect_shape_manager.h"
+
+#include <algorithm>
+#include <map>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/kernels/funcs/data_type_transform.h"
+
+namespace paddle {
+namespace framework {
+// Returns the one shared CollectShapeManager object. It is a function-local
+// static, so it is constructed the first time this function is called.
+CollectShapeManager &CollectShapeManager::Instance() {
+  static CollectShapeManager manager;
+  return manager;
+}
+
+// Records one sample per input of `instr`: the tensor's current dims and, for
+// small integer tensors, its element values. Samples are appended to
+// shape_info_ / shape_tensor_info_ under the input's pir::Value key.
+//
+// instr:          instruction whose Inputs() are inspected.
+// value_exe_info: used to translate each input value into a variable name.
+// scope:          scope in which the variable of that name is looked up.
+//
+// Inputs whose variable is missing, is not a phi::DenseTensor, or holds an
+// uninitialized tensor are skipped.
+void CollectShapeManager::CollectShapeInfo(
+    framework::InstructionBase *instr,
+    framework::ValueExecutionInfo *value_exe_info,
+    framework::Scope *scope) {
+  // New samples make previously computed min/max/opt ranges stale, so
+  // StatisticShapeRangeInfo has to run again before they can be queried.
+  is_shape_range_info_ready_ = false;
+  for (auto &input : instr->Inputs()) {
+    auto var_name = value_exe_info->GetVarName(input.first);
+    auto *var = scope->FindVar(var_name);
+    if (!var || !var->IsType<phi::DenseTensor>()) continue;
+    // `tensor` is a local phi::DenseTensor object (auto drops the reference),
+    // so the re-assignments further down do not touch the scope variable.
+    auto tensor = var->Get<phi::DenseTensor>();
+    if (!tensor.initialized()) continue;
+    paddle::platform::DeviceContextPool &pool =
+        paddle::platform::DeviceContextPool::Instance();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    // Wait for the stream of the context registered for phi::GPUPlace().
+    // NOTE(review): presumably this makes sure pending kernels have produced
+    // the tensor before it is read below - confirm.
+    auto *dev_ctx = pool.Get(phi::GPUPlace());
+    auto stream = static_cast<phi::GPUContext *>(dev_ctx)->stream();
+#ifdef PADDLE_WITH_HIP
+    hipStreamSynchronize(stream);
+#else
+    cudaStreamSynchronize(stream);
+#endif
+#endif
+
+    // Narrow every dim to int32 and store the result as one shape sample.
+    framework::DDim dim = tensor.dims();
+    std::vector<int32_t> shape(dim.size());
+    for (int i = 0; i < static_cast<int>(shape.size()); ++i)
+      shape[i] = static_cast<int32_t>(dim[i]);
+    if (!shape.empty()) {
+      shape_info_[input.first].emplace_back(shape);
+    } else if (tensor.numel() > 0) {
+      // This must be a zero dimension tensor.
+      PADDLE_ENFORCE_EQ(tensor.numel(),
+                        1UL,
+                        platform::errors::PreconditionNotMet(
+                            "This tensor must have one element, but got %ld.",
+                            tensor.numel()));
+      // A tensor without dims is recorded with the shape {1}.
+      std::vector<int32_t> zero_shape(1, 1);
+      shape_info_[input.first].emplace_back(zero_shape);
+    }
+
+    // Paddle-TRT also needs the value range of shape tensors. Shape tensors
+    // are identified heuristically: any INT32/INT64 tensor holding 1 to 8
+    // elements is treated as one. This assumes that every shape tensor in the
+    // model has at most 8 elements. The heuristic can also match tensors that
+    // are not shape tensors, which is accepted as harmless.
+    auto is_shape_tensor = tensor.numel() <= 8 && tensor.numel() >= 1;
+    if ((tensor.dtype() == phi::DataType::INT32 ||
+         tensor.dtype() == phi::DataType::INT64) &&
+        is_shape_tensor) {
+      // Host copy of the element values. It starts zero-filled and stays that
+      // way when none of the branches below copies into it (a place that is
+      // neither CPU nor GPU, or a GPU place in a build without CUDA/HIP).
+      std::vector<int> int32_host(tensor.numel());
+
+      if (platform::is_cpu_place(tensor.place())) {
+        // `int32_tensor` aliases the local `tensor`; INT64 data is converted
+        // to INT32 first so that it can be read through data<int>().
+        auto &int32_tensor = tensor;
+        if (tensor.dtype() == phi::DataType::INT64) {
+          auto *cpu_ctx = pool.Get(platform::CPUPlace());
+          int32_tensor = phi::funcs::TransDataType(
+              reinterpret_cast<const phi::CPUContext &>(*cpu_ctx),
+              tensor,
+              DataType::INT32);
+        }
+        paddle::memory::Copy(platform::CPUPlace(),
+                             int32_host.data(),
+                             platform::CPUPlace(),
+                             int32_tensor.data<int>(),
+                             int32_tensor.numel() * sizeof(int));
+      } else if (platform::is_gpu_place(tensor.place())) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+        // Same as the CPU branch, but the conversion runs on the context of
+        // the tensor's own place and the copy goes from that place to host.
+        auto *dev_ctx = pool.Get(tensor.place());
+        auto &int32_tensor = tensor;
+        if (tensor.dtype() == phi::DataType::INT64) {
+          int32_tensor = phi::funcs::TransDataType(
+              reinterpret_cast<const phi::GPUContext &>(*dev_ctx),
+              tensor,
+              DataType::INT32);
+        }
+        paddle::memory::Copy(platform::CPUPlace(),
+                             int32_host.data(),
+                             int32_tensor.place(),
+                             int32_tensor.data<int>(),
+                             int32_tensor.numel() * sizeof(int),
+                             nullptr);
+#endif
+      }
+      shape_tensor_info_[input.first].emplace_back(int32_host);
+    }
+  }
+}
+
+// Collapses every sample recorded by CollectShapeInfo into per-value min, max
+// and opt vectors: shape_info_ feeds min_shapes_/max_shapes_/opt_shapes_ and
+// shape_tensor_info_ feeds min_values_/max_values_/opt_values_.
+//
+// For each position d, min/max are the smallest/largest entry seen at d and
+// opt is the entry seen most often (the smallest such entry on a tie). The
+// first sample of a value fixes how many positions are reported; a sample
+// that is shorter than the first one is ignored for the positions it lacks
+// instead of being read out of bounds.
+//
+// Does nothing when the ranges are already up to date, i.e. when
+// CollectShapeInfo has not been called since the previous run.
+void CollectShapeManager::StatisticShapeRangeInfo() {
+  if (is_shape_range_info_ready_) {
+    return;
+  }
+  using RangeMap = std::map<pir::Value, std::vector<int32_t>>;
+  using SampleMap = std::map<pir::Value, std::vector<std::vector<int32_t>>>;
+  using Histogram = std::map<int32_t, int32_t>;
+
+  // Returns the key with the highest count. std::max_element keeps the first
+  // maximum, so ties resolve to the smallest key. `histogram` must not be
+  // empty.
+  const auto most_frequent = [](const Histogram &histogram) -> int32_t {
+    const auto top = std::max_element(
+        histogram.begin(),
+        histogram.end(),
+        [](const Histogram::value_type &lhs,
+           const Histogram::value_type &rhs) {
+          return lhs.second < rhs.second;
+        });
+    return top->first;
+  };
+
+  // The sample map is taken by const reference: it holds every sample ever
+  // collected and must not be deep-copied on each call.
+  const auto extract_min_max_opt = [&most_frequent](
+                                       RangeMap &min_data,
+                                       RangeMap &max_data,
+                                       RangeMap &opt_data,
+                                       const SampleMap &samples_by_value) {
+    for (const auto &item : samples_by_value) {
+      const auto &samples = item.second;
+      if (samples.empty()) continue;
+
+      const std::vector<int32_t> &first_sample = samples.front();
+      std::vector<int32_t> min_shape(first_sample);
+      std::vector<int32_t> max_shape(first_sample);
+      std::vector<int32_t> opt_shape(first_sample);
+
+      for (size_t d = 0; d < first_sample.size(); ++d) {
+        // Never empty: first_sample always contributes an entry at d.
+        Histogram histogram;
+        for (const auto &sample : samples) {
+          // Samples of one value can differ in length (a shape tensor's
+          // length is its element count), so guard the index.
+          if (d >= sample.size()) continue;
+          const int32_t entry = sample[d];
+          ++histogram[entry];
+          min_shape[d] = std::min(min_shape[d], entry);
+          max_shape[d] = std::max(max_shape[d], entry);
+        }
+        opt_shape[d] = most_frequent(histogram);
+      }
+
+      min_data[item.first] = std::move(min_shape);
+      max_data[item.first] = std::move(max_shape);
+      opt_data[item.first] = std::move(opt_shape);
+    }
+  };
+
+  extract_min_max_opt(min_shapes_, max_shapes_, opt_shapes_, shape_info_);
+  extract_min_max_opt(
+      min_values_, max_values_, opt_values_, shape_tensor_info_);
+  is_shape_range_info_ready_ = true;
+}
+
+// Returns the aggregated min, opt or max vector recorded for `op_val`.
+//
+// op_val:          value of the original program; it is translated to the
+//                  kernel value it was mapped to through SetValueMap.
+// is_shape_tensor: true reads the element-value ranges (min_values_,
+//                  max_values_, opt_values_); false reads the dim ranges
+//                  (min_shapes_, max_shapes_, opt_shapes_).
+// shape_mode:      which of the three aggregates to return.
+//
+// Raises PreconditionNotMet when StatisticShapeRangeInfo has not been run
+// since the last CollectShapeInfo call, NotFound when `op_val` has no mapped
+// kernel value or that kernel value has no entry in the selected map, and
+// Unimplemented for any other `shape_mode`.
+std::vector<int32_t> CollectShapeManager::GetValueShapeRangeInfo(
+    pir::Value op_val, bool is_shape_tensor, ShapeMode shape_mode) {
+  PADDLE_ENFORCE_EQ(is_shape_range_info_ready_,
+                    true,
+                    ::common::errors::PreconditionNotMet(
+                        "Shape range info has not been calculated and "
+                        "StatisticShapeRangeInfo must be called first."));
+  // Keep the iterator so the map is searched once, not once for the check
+  // and again for the read.
+  const auto kernel_val_iter = op_value2kernel_value_.find(op_val);
+  PADDLE_ENFORCE_NE(kernel_val_iter,
+                    op_value2kernel_value_.end(),
+                    ::common::errors::NotFound(
+                        "Can't find kernel_value that corresponding to "
+                        "op_value, maybe origin program has changed or not "
+                        "open FLAGS_enable_collect_shape."));
+  const pir::Value kernel_val = kernel_val_iter->second;
+
+  // Shared lookup for all six range maps: a single search whose result is
+  // both checked and returned; a miss raises NotFound with `not_found_msg`.
+  const auto lookup =
+      [&kernel_val](const std::map<pir::Value, std::vector<int32_t>> &ranges,
+                    const char *not_found_msg) -> std::vector<int32_t> {
+    const auto found = ranges.find(kernel_val);
+    PADDLE_ENFORCE_NE(
+        found, ranges.end(), ::common::errors::NotFound(not_found_msg));
+    return found->second;
+  };
+
+  switch (shape_mode) {
+    case ShapeMode::kMIN:
+      return is_shape_tensor
+                 ? lookup(min_values_,
+                          "Can't find min shape according to the "
+                          "input Value that is a shape tensor.")
+                 : lookup(min_shapes_,
+                          "Can't find min shape according to the "
+                          "input Value that isn't a shape tensor");
+    case ShapeMode::kMAX:
+      return is_shape_tensor
+                 ? lookup(max_values_,
+                          "Can't find max shape according to the "
+                          "input Value that is a shape tensor.")
+                 : lookup(max_shapes_,
+                          "Can't find max shape according to the "
+                          "input Value that isn't a shape tensor");
+    case ShapeMode::kOPT:
+      return is_shape_tensor
+                 ? lookup(opt_values_,
+                          "Can't find opt shape according to the "
+                          "input Value that is a shape tensor.")
+                 : lookup(opt_shapes_,
+                          "Can't find opt shape according to the "
+                          "input Value that isn't a shape tensor");
+  }
+  // Only reached when `shape_mode` holds a value outside the enumerators.
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "We only support ShapeMode::kMIN, ShapeMode::kMAX and ShapeMode::kOPT "
+      "when GetValueShapeRangeInfo"));
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/collect_shape_manager.h b/paddle/fluid/framework/new_executor/collect_shape_manager.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h"
+#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
+#include "paddle/pir/include/core/value.h"
+
+namespace paddle {
+namespace framework {
+
+// Selects which aggregated vector CollectShapeManager::GetValueShapeRangeInfo
+// returns for a value.
+enum class ShapeMode {
+  kMIN,  // per-position minimum over all collected samples
+  kOPT,  // per-position most frequently collected entry
+  kMAX,  // per-position maximum over all collected samples
+};
+
+// CollectShapeManager records the shapes of values while the executor runs;
+// this information is used for TensorRTEngine.
+//
+// Usage order: CollectShapeInfo appends samples, StatisticShapeRangeInfo
+// reduces them to min/opt/max ranges, and GetValueShapeRangeInfo reads those
+// ranges. The object is reached through Instance(); it cannot be constructed,
+// copied or moved from outside.
+class CollectShapeManager {
+ public:
+  // Returns the shared instance.
+  static CollectShapeManager& Instance();
+
+  CollectShapeManager(const CollectShapeManager&) = delete;
+  CollectShapeManager(CollectShapeManager&&) = delete;
+  CollectShapeManager& operator=(const CollectShapeManager&) = delete;
+
+  // Replaces the stored op-value -> kernel-value map with a copy of the
+  // argument. GetValueShapeRangeInfo uses it to translate its `val` argument.
+  void SetValueMap(
+      const std::unordered_map<pir::Value, pir::Value>& op_value2kernel_value) {
+    op_value2kernel_value_ = op_value2kernel_value;
+  }
+
+  // Appends one sample (dims, and element values for small integer tensors)
+  // for every usable input of `instr`, and marks the ranges as stale.
+  void CollectShapeInfo(framework::InstructionBase* instr,
+                        framework::ValueExecutionInfo* value_exe_info,
+                        framework::Scope* scope);
+  // Recomputes the min/opt/max ranges from the collected samples; does
+  // nothing when they are already up to date.
+  void StatisticShapeRangeInfo();
+
+  // Returns the range selected by `shape_mode` for the kernel value mapped
+  // from `val`. `is_shape_tensor` chooses the element-value ranges instead of
+  // the dim ranges. Requires StatisticShapeRangeInfo to have run.
+  std::vector<int32_t> GetValueShapeRangeInfo(pir::Value val,
+                                              bool is_shape_tensor,
+                                              ShapeMode shape_mode);
+
+ private:
+  CollectShapeManager() {}
+  // Translation installed by SetValueMap.
+  std::unordered_map<pir::Value, pir::Value> op_value2kernel_value_;
+  // Raw samples, one inner vector per CollectShapeInfo observation:
+  // tensor dims in shape_info_, element values in shape_tensor_info_.
+  std::map<pir::Value, std::vector<std::vector<int32_t>>> shape_info_;
+  std::map<pir::Value, std::vector<std::vector<int32_t>>> shape_tensor_info_;
+  // Ranges derived from shape_info_.
+  std::map<pir::Value, std::vector<int32_t>> min_shapes_;
+  std::map<pir::Value, std::vector<int32_t>> max_shapes_;
+  std::map<pir::Value, std::vector<int32_t>> opt_shapes_;
+  // Ranges derived from shape_tensor_info_.
+  std::map<pir::Value, std::vector<int32_t>> min_values_;
+  std::map<pir::Value, std::vector<int32_t>> max_values_;
+  std::map<pir::Value, std::vector<int32_t>> opt_values_;
+  // True once StatisticShapeRangeInfo has processed all collected samples;
+  // reset by every CollectShapeInfo call.
+  bool is_shape_range_info_ready_ = false;
+};
+
+}  // namespace framework
+}  // namespace paddle