Merge branch 'PaddlePaddle:develop' into branch-clone
walkalone20 authored Jul 2, 2024
2 parents e07f2bf + 888f213 commit 429546d
Showing 171 changed files with 2,574 additions and 1,895 deletions.
@@ -246,10 +246,8 @@ void ApplyCinnPass(::pir::Program* program,
.file_name("original_programs.py")
.dump_symbolic_shape(FLAGS_logging_pir_py_code_dump_symbolic_dims)
.SaveIfFlagEnabled();
ApplyPdToCinnPass(program, CreatePassManager);
// TODO(Hongqing-work): move ApplyShapeOptimizationPass before
// ApplyPdToCinnPass after fixing infer shape bug.
ApplyShapeOptimizationPass(program, CreatePassManager);
ApplyPdToCinnPass(program, CreatePassManager);
ApplyCinnPreprocessPass(program, CreatePassManager);
ApplyBuildGroupOpPass(program, CreatePassManager);
PirToPyCodeConverter(program)
@@ -513,15 +513,11 @@ pir::Operation* CompileBroadcastTreeToConditionBlock(
rewriter,
rewriter.block(),
&group_map);
// 2. simply every condition block
auto* program = group->ops().front()->GetParentProgram();
VLOG(6) << "Before simply condition block: " << *program;

SimplyConditionBlock(rewriter, &group_map);
VLOG(6) << "After simply condition block: " << *program;

// 3. compile condition block to jit_kernel_op
// 2. compile condition block to jit_kernel_op
CompileGroupToJitKernelOp(rewriter, &group_map);

auto* program = group->ops().front()->GetParentProgram();
VLOG(6) << "compile condition block to jit_kernel_op: " << *program;

return cond_op;
6 changes: 6 additions & 0 deletions paddle/fluid/distributed/collective/CMakeLists.txt
@@ -27,6 +27,12 @@ if(WITH_NCCL OR WITH_RCCL)
collective_helper
device_context
${DEVICE_EVENT_LIBS})

cc_library(
async_load
SRCS async_load.cc
DEPS device_context place ${DEVICE_EVENT_LIBS})

endif()

if(WITH_XPU_BKCL)
133 changes: 133 additions & 0 deletions paddle/fluid/distributed/collective/async_load.cc
@@ -0,0 +1,133 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/collective/async_load.h"
#include "paddle/phi/common/memory_utils.h"

namespace paddle {
namespace distributed {

AsyncLoad::Task::Task(const Place& place)
: load_event_(place, platform::GenerateDeviceEventFlag()),
task_place_(place) {}

AsyncLoad::Task::~Task() {}

bool AsyncLoad::Task::IsCompleted() { return load_event_.Query(); }

void AsyncLoad::Task::Synchronize() {
const auto* calc_ctx =
platform::DeviceContextPool::Instance().Get(task_place_);
load_event_.Wait(platform::Place2DeviceType(task_place_), calc_ctx);
}

void AsyncLoad::Task::UpdateWaitChain(const phi::DeviceContext& ctx) {
load_event_.Record(&ctx);
}

std::shared_ptr<AsyncLoad::Task> AsyncLoad::CreateTask(const Place& place) {
return std::make_shared<AsyncLoad::Task>(place);
}

void AsyncLoad::SyncCalcuStream(const Place& place,
phi::GPUContext* ctx,
platform::DeviceEvent& calc_event) { // NOLINT
const auto* calc_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place));
calc_event.Record(calc_ctx);
calc_event.Wait(platform::Place2DeviceType(place), ctx);
}

std::shared_ptr<AsyncLoad::Task> AsyncLoad::Offload(
phi::DenseTensor* dst, const phi::DenseTensor& src) {
// GPU -> GPUPinned
const auto& place = src.place();

PADDLE_ENFORCE_EQ(
platform::is_gpu_place(place),
true,
platform::errors::InvalidArgument(
"AsyncLoad::Offload only support GPU -> GPUPinned now."));

dst->Resize(src.dims());
auto size = src.numel() * phi::SizeOf(src.dtype());
auto* dev_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto* dst_ptr = dev_ctx->Alloc(dst, src.dtype(), size, true);
auto* src_ptr = src.data();

// 1. wait calc stream to finish
std::string key = "load";

if (!is_initialized_) {
is_initialized_ = true;
gpu_place_ = place;
place_to_calc_event_.emplace(
key, platform::DeviceEvent(place, platform::GenerateDeviceEventFlag()));
load_ctx_ = std::move(std::make_unique<phi::GPUContext>(place));
}
SyncCalcuStream(gpu_place_, load_ctx_.get(), place_to_calc_event_.at(key));

// 2. copy data from src to dst
auto stream = load_ctx_->stream();
phi::memory_utils::Copy(
dst->place(), dst_ptr, src.place(), src_ptr, size, stream);

// 3. record event on offload stream
auto task = CreateTask(place);
task->UpdateWaitChain(*load_ctx_);
return task;
}

std::shared_ptr<AsyncLoad::Task> AsyncLoad::Reload(
phi::DenseTensor* dst, const phi::DenseTensor& src) {
// GPUPinned -> GPU
const auto& place = src.place();
PADDLE_ENFORCE_EQ(
platform::is_cuda_pinned_place(place),
true,
platform::errors::InvalidArgument(
"AsyncLoad::Reload only support GPUPinned -> GPU now."));

PADDLE_ENFORCE_EQ(is_initialized_,
true,
platform::errors::PreconditionNotMet(
"You should call Offload before Reload."));

auto* dev_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(gpu_place_));

dst->Resize(src.dims());
auto size = src.numel() * phi::SizeOf(src.dtype());
auto* dst_ptr = dev_ctx->Alloc(dst, src.dtype(), size, false);
auto* src_ptr = src.data();

// 1. wait calc stream to finish
std::string key = "load";

SyncCalcuStream(gpu_place_, load_ctx_.get(), place_to_calc_event_.at(key));

// 2. copy data from src to dst
auto stream = load_ctx_->stream();
phi::memory_utils::Copy(
dst->place(), dst_ptr, src.place(), src_ptr, size, stream);

// 3. record event on offload stream
auto task = CreateTask(gpu_place_);
task->UpdateWaitChain(*load_ctx_);
return task;
}

} // namespace distributed
} // namespace paddle
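
For readers skimming the diff, here is a minimal usage sketch of the new AsyncLoad API added above. It is illustrative only and not part of this commit: the tensor names and the surrounding function are hypothetical, and a CUDA-enabled build with the source tensor already resident on the GPU is assumed.

#include "paddle/fluid/distributed/collective/async_load.h"

// Hypothetical helper (sketch only): offload an activation to pinned host
// memory, then reload it onto the GPU. Offload must run before Reload,
// since Reload checks that the shared load context has been initialized.
void OffloadRoundTrip(const phi::DenseTensor& activation /* GPU tensor */) {
  paddle::distributed::AsyncLoad loader;

  phi::DenseTensor pinned;  // given CUDA-pinned host memory by Offload
  auto offload_task = loader.Offload(&pinned, activation);
  // ... unrelated work can keep running on the calculation stream here ...
  offload_task->Synchronize();  // wait for the GPU -> pinned copy to finish

  phi::DenseTensor restored;  // placed back on the GPU by Reload
  auto reload_task = loader.Reload(&restored, pinned);
  reload_task->Synchronize();  // wait for the pinned -> GPU copy to finish
}
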
70 changes: 70 additions & 0 deletions paddle/fluid/distributed/collective/async_load.h
@@ -0,0 +1,70 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <chrono>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/common/enforce.h"
#include "paddle/common/errors.h"
#include "paddle/fluid/platform/device_event_base.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/tensor_utils.h"

namespace paddle {
namespace distributed {

using Place = phi::Place;

class AsyncLoad {
public:
class Task {
public:
explicit Task(const Place& place);
virtual ~Task();
bool IsCompleted();
void Synchronize();
void UpdateWaitChain(const phi::DeviceContext& ctx);

private:
platform::DeviceEvent load_event_; // event on offload stream
Place task_place_;
};

std::shared_ptr<AsyncLoad::Task> Offload(phi::DenseTensor* dst,
const phi::DenseTensor& src);

void PrepareLoadEnv(const std::string& key, const Place& place);
void SyncCalcuStream(const Place& place,
phi::GPUContext* ctx,
platform::DeviceEvent& calc_event); // NOLINT
std::shared_ptr<AsyncLoad::Task> Reload(phi::DenseTensor* dst,
const phi::DenseTensor& src);

private:
std::unordered_map<std::string, platform::DeviceEvent>
place_to_calc_event_; // event on calc stream
bool is_initialized_{false};
std::unique_ptr<phi::GPUContext> load_ctx_;
Place gpu_place_;
std::shared_ptr<AsyncLoad::Task> CreateTask(const Place& place);
};

} // namespace distributed
} // namespace paddle
19 changes: 0 additions & 19 deletions paddle/fluid/inference/analysis/argument.h
@@ -302,25 +302,6 @@ struct Argument {
TensorRtOpsRunFloat,
std::unordered_set<std::string>);

DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int);
DECL_ARGUMENT_FIELD(dlnne_use_static_batch, DlnneUseStaticBatch, bool);
DECL_ARGUMENT_FIELD(dlnne_weight_share_mode,
DlnneWeightShareMode,
std::string);
DECL_ARGUMENT_FIELD(dlnne_disable_nodes_by_outputs,
DlnneDisableNodesByOutputs,
std::unordered_set<std::string>);
DECL_ARGUMENT_FIELD(dlnne_use_calib_mode, DlnneUseCalibMode, bool);
DECL_ARGUMENT_FIELD(dlnne_precision_mode, DlnnePrecisionMode, int);

using dlnne_input_shape_type = std::map<std::string, std::vector<int64_t>>;
DECL_ARGUMENT_FIELD(dlnne_input_shape_dict,
DlnneInputShapeDict,
dlnne_input_shape_type);
DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int);

DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool);
DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
66 changes: 0 additions & 66 deletions paddle/fluid/inference/api/analysis_config.cc
@@ -45,7 +45,6 @@ namespace paddle {
struct MkldnnQuantizerConfig;

extern const std::vector<std::string> kTRTSubgraphPasses;
extern const std::vector<std::string> kDlnneSubgraphPasses;

AnalysisConfig::AnalysisConfig() {
// NOTE(liuyuanle): Why put the following code here?
@@ -496,16 +495,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(trt_optimization_level_);
CP_MEMBER(trt_ops_run_float_);
CP_MEMBER(trt_exclude_var_names_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
CP_MEMBER(dlnne_max_batchsize_);
CP_MEMBER(dlnne_use_static_batch_);
CP_MEMBER(dlnne_weight_share_mode_);
CP_MEMBER(dlnne_use_calib_mode_);
CP_MEMBER(dlnne_precision_mode_);
CP_MEMBER(dlnne_disable_nodes_by_outputs_);
CP_MEMBER(dlnne_input_shape_dict_);
// OneDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
@@ -619,23 +608,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
pass_builder_->AppendPass(pass);
}
}
if (use_dlnne_) {
auto all_passes = kDlnneSubgraphPasses;
auto other_passes = other.pass_builder()->AllPasses();
// We should sort them, because the user may call the SwitchIrDebug
// interface, which will change the pass.
std::sort(all_passes.begin(), all_passes.end());
std::sort(other_passes.begin(), other_passes.end());
std::vector<std::string> deleted_passes;
std::set_difference(all_passes.begin(),
all_passes.end(),
other_passes.begin(),
other_passes.end(),
std::inserter(deleted_passes, deleted_passes.begin()));
for (auto const &ps : deleted_passes) {
pass_builder_->DeletePass(ps);
}
}

for (auto &delete_pass : other.pass_builder()->GetAllDeletedPasses()) {
pass_builder_->DeletePass(delete_pass);
@@ -822,27 +794,6 @@ void AnalysisConfig::EnableLowPrecisionIO(bool x) {
enable_low_precision_io_ = x;
}

void AnalysisConfig::EnableDlnne(
int min_subgraph_size,
int max_batch_size,
bool use_static_batch,
std::string weight_share_mode,
std::unordered_set<std::string> disable_nodes_by_outputs,
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict,
bool use_calib_mode,
Precision precision_mode) {
use_dlnne_ = true;
dlnne_min_subgraph_size_ = min_subgraph_size;
dlnne_max_batchsize_ = max_batch_size;
dlnne_use_static_batch_ = use_static_batch;
dlnne_weight_share_mode_ = weight_share_mode;
dlnne_disable_nodes_by_outputs_ = disable_nodes_by_outputs;
dlnne_input_shape_dict_ = dlnne_input_shape_dict;
dlnne_use_calib_mode_ = use_calib_mode;
dlnne_precision_mode_ = precision_mode;
Update();
}

void AnalysisConfig::SetTRTDynamicShapeInfo(
std::map<std::string, std::vector<int>> min_input_shape,
std::map<std::string, std::vector<int>> max_input_shape,
@@ -1022,13 +973,6 @@ void AnalysisConfig::Update() {
}
}

if (use_dlnne_) {
pass_builder()->ClearPasses();
for (const auto &pass : kDlnneSubgraphPasses) {
pass_builder()->AppendPass(pass);
}
}

if (use_gpu() && use_cudnn_) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!enable_ir_optim_) {
@@ -1149,9 +1093,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << ";";
ss << trt_forbid_dynamic_op_;

ss << use_dlnne_;
ss << dlnne_min_subgraph_size_;

for (auto &op : trt_disabled_ops_) ss << op.c_str();
ss << ";";

@@ -1315,13 +1256,6 @@ void AnalysisConfig::DisableGlogInfo() {
Update();
}

void AnalysisConfig::PartiallyRelease() {
prog_file_.clear();
prog_file_.shrink_to_fit();
params_file_.clear();
params_file_.shrink_to_fit();
}

void AnalysisConfig::EnableGpuMultiStream() { thread_local_stream_ = true; }

std::string AnalysisConfig::Summary() {