Merge branch 'PaddlePaddle:develop' into branch-clone
walkalone20 authored Jul 2, 2024
2 parents e07f2bf + 888f213 commit 429546d
Showing 171 changed files with 2,574 additions and 1,895 deletions.
@@ -246,10 +246,8 @@ void ApplyCinnPass(::pir::Program* program,
.file_name("original_programs.py")
.dump_symbolic_shape(FLAGS_logging_pir_py_code_dump_symbolic_dims)
.SaveIfFlagEnabled();
ApplyPdToCinnPass(program, CreatePassManager);
// TODO(Hongqing-work): move ApplyShapeOptimizationPass before
// ApplyPdToCinnPass after fixing infer shape bug.
ApplyShapeOptimizationPass(program, CreatePassManager);
ApplyPdToCinnPass(program, CreatePassManager);
ApplyCinnPreprocessPass(program, CreatePassManager);
ApplyBuildGroupOpPass(program, CreatePassManager);
PirToPyCodeConverter(program)
@@ -513,15 +513,11 @@ pir::Operation* CompileBroadcastTreeToConditionBlock(
rewriter,
rewriter.block(),
&group_map);
// 2. simply every condition block
auto* program = group->ops().front()->GetParentProgram();
VLOG(6) << "Before simply condition block: " << *program;

SimplyConditionBlock(rewriter, &group_map);
VLOG(6) << "After simply condition block: " << *program;

// 3. compile condition block to jit_kernel_op
// 2. compile condition block to jit_kernel_op
CompileGroupToJitKernelOp(rewriter, &group_map);

auto* program = group->ops().front()->GetParentProgram();
VLOG(6) << "compile condition block to jit_kernel_op: " << *program;

return cond_op;
6 changes: 6 additions & 0 deletions paddle/fluid/distributed/collective/CMakeLists.txt
@@ -27,6 +27,12 @@ if(WITH_NCCL OR WITH_RCCL)
collective_helper
device_context
${DEVICE_EVENT_LIBS})

cc_library(
async_load
SRCS async_load.cc
DEPS device_context place ${DEVICE_EVENT_LIBS})

endif()

if(WITH_XPU_BKCL)
133 changes: 133 additions & 0 deletions paddle/fluid/distributed/collective/async_load.cc
@@ -0,0 +1,133 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/collective/async_load.h"
#include "paddle/phi/common/memory_utils.h"

namespace paddle {
namespace distributed {

AsyncLoad::Task::Task(const Place& place)
: load_event_(place, platform::GenerateDeviceEventFlag()),
task_place_(place) {}

AsyncLoad::Task::~Task() {}

bool AsyncLoad::Task::IsCompleted() { return load_event_.Query(); }

void AsyncLoad::Task::Synchronize() {
const auto* calc_ctx =
platform::DeviceContextPool::Instance().Get(task_place_);
load_event_.Wait(platform::Place2DeviceType(task_place_), calc_ctx);
}

void AsyncLoad::Task::UpdateWaitChain(const phi::DeviceContext& ctx) {
load_event_.Record(&ctx);
}

std::shared_ptr<AsyncLoad::Task> AsyncLoad::CreateTask(const Place& place) {
return std::make_shared<AsyncLoad::Task>(place);
}

void AsyncLoad::SyncCalcuStream(const Place& place,
phi::GPUContext* ctx,
platform::DeviceEvent& calc_event) { // NOLINT
const auto* calc_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place));
calc_event.Record(calc_ctx);
calc_event.Wait(platform::Place2DeviceType(place), ctx);
}

std::shared_ptr<AsyncLoad::Task> AsyncLoad::Offload(
phi::DenseTensor* dst, const phi::DenseTensor& src) {
// GPU -> GPUPinned
const auto& place = src.place();

PADDLE_ENFORCE_EQ(
platform::is_gpu_place(place),
true,
platform::errors::InvalidArgument(
"AsyncLoad::Offload only support GPU -> GPUPinned now."));

dst->Resize(src.dims());
auto size = src.numel() * phi::SizeOf(src.dtype());
auto* dev_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto* dst_ptr = dev_ctx->Alloc(dst, src.dtype(), size, true);
auto* src_ptr = src.data();

// 1. wait calc stream to finish
std::string key = "load";

if (!is_initialized_) {
is_initialized_ = true;
gpu_place_ = place;
place_to_calc_event_.emplace(
key, platform::DeviceEvent(place, platform::GenerateDeviceEventFlag()));
load_ctx_ = std::move(std::make_unique<phi::GPUContext>(place));
}
SyncCalcuStream(gpu_place_, load_ctx_.get(), place_to_calc_event_.at(key));

// 2. copy data from src to dst
auto stream = load_ctx_->stream();
phi::memory_utils::Copy(
dst->place(), dst_ptr, src.place(), src_ptr, size, stream);

// 3. record event on offload stream
auto task = CreateTask(place);
task->UpdateWaitChain(*load_ctx_);
return task;
}

std::shared_ptr<AsyncLoad::Task> AsyncLoad::Reload(
phi::DenseTensor* dst, const phi::DenseTensor& src) {
// GPUPinned -> GPU
const auto& place = src.place();
PADDLE_ENFORCE_EQ(
platform::is_cuda_pinned_place(place),
true,
platform::errors::InvalidArgument(
"AsyncLoad::Reload only support GPUPinned -> GPU now."));

PADDLE_ENFORCE_EQ(is_initialized_,
true,
platform::errors::PreconditionNotMet(
"You should call Offload before Reload."));

auto* dev_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(gpu_place_));

dst->Resize(src.dims());
auto size = src.numel() * phi::SizeOf(src.dtype());
auto* dst_ptr = dev_ctx->Alloc(dst, src.dtype(), size, false);
auto* src_ptr = src.data();

// 1. wait calc stream to finish
std::string key = "load";

SyncCalcuStream(gpu_place_, load_ctx_.get(), place_to_calc_event_.at(key));

// 2. copy data from src to dst
auto stream = load_ctx_->stream();
phi::memory_utils::Copy(
dst->place(), dst_ptr, src.place(), src_ptr, size, stream);

// 3. record event on offload stream
auto task = CreateTask(gpu_place_);
task->UpdateWaitChain(*load_ctx_);
return task;
}

} // namespace distributed
} // namespace paddle
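
For readers skimming the diff, here is a minimal usage sketch of the new AsyncLoad API added above. It is illustrative only and not part of this commit: the tensor names and the surrounding function are hypothetical, and a CUDA-enabled build with the source tensor already resident on the GPU is assumed.

#include "paddle/fluid/distributed/collective/async_load.h"

// Hypothetical helper (sketch only): offload an activation to pinned host
// memory, then reload it onto the GPU. Offload must run before Reload,
// since Reload checks that the shared load context has been initialized.
void OffloadRoundTrip(const phi::DenseTensor& activation /* GPU tensor */) {
  paddle::distributed::AsyncLoad loader;

  phi::DenseTensor pinned;  // given CUDA-pinned host memory by Offload
  auto offload_task = loader.Offload(&pinned, activation);
  // ... unrelated work can keep running on the calculation stream here ...
  offload_task->Synchronize();  // wait for the GPU -> pinned copy to finish

  phi::DenseTensor restored;  // placed back on the GPU by Reload
  auto reload_task = loader.Reload(&restored, pinned);
  reload_task->Synchronize();  // wait for the pinned -> GPU copy to finish
}
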
70 changes: 70 additions & 0 deletions paddle/fluid/distributed/collective/async_load.h
@@ -0,0 +1,70 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <chrono>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/common/enforce.h"
#include "paddle/common/errors.h"
#include "paddle/fluid/platform/device_event_base.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/tensor_utils.h"

namespace paddle {
namespace distributed {

using Place = phi::Place;

class AsyncLoad {
public:
class Task {
public:
explicit Task(const Place& place);
virtual ~Task();
bool IsCompleted();
void Synchronize();
void UpdateWaitChain(const phi::DeviceContext& ctx);

private:
platform::DeviceEvent load_event_; // event on offload stream
Place task_place_;
};

std::shared_ptr<AsyncLoad::Task> Offload(phi::DenseTensor* dst,
const phi::DenseTensor& src);

void PrepareLoadEnv(const std::string& key, const Place& place);
void SyncCalcuStream(const Place& place,
phi::GPUContext* ctx,
platform::DeviceEvent& calc_event); // NOLINT
std::shared_ptr<AsyncLoad::Task> Reload(phi::DenseTensor* dst,
const phi::DenseTensor& src);

private:
std::unordered_map<std::string, platform::DeviceEvent>
place_to_calc_event_; // event on calc stream
bool is_initialized_{false};
std::unique_ptr<phi::GPUContext> load_ctx_;
Place gpu_place_;
std::shared_ptr<AsyncLoad::Task> CreateTask(const Place& place);
};

} // namespace distributed
} // namespace paddle
19 changes: 0 additions & 19 deletions paddle/fluid/inference/analysis/argument.h
@@ -302,25 +302,6 @@ struct Argument {
TensorRtOpsRunFloat,
std::unordered_set<std::string>);

DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int);
DECL_ARGUMENT_FIELD(dlnne_use_static_batch, DlnneUseStaticBatch, bool);
DECL_ARGUMENT_FIELD(dlnne_weight_share_mode,
DlnneWeightShareMode,
std::string);
DECL_ARGUMENT_FIELD(dlnne_disable_nodes_by_outputs,
DlnneDisableNodesByOutputs,
std::unordered_set<std::string>);
DECL_ARGUMENT_FIELD(dlnne_use_calib_mode, DlnneUseCalibMode, bool);
DECL_ARGUMENT_FIELD(dlnne_precision_mode, DlnnePrecisionMode, int);

using dlnne_input_shape_type = std::map<std::string, std::vector<int64_t>>;
DECL_ARGUMENT_FIELD(dlnne_input_shape_dict,
DlnneInputShapeDict,
dlnne_input_shape_type);
DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int);

DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool);
DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
66 changes: 0 additions & 66 deletions paddle/fluid/inference/api/analysis_config.cc
@@ -45,7 +45,6 @@ namespace paddle {
struct MkldnnQuantizerConfig;

extern const std::vector<std::string> kTRTSubgraphPasses;
extern const std::vector<std::string> kDlnneSubgraphPasses;

AnalysisConfig::AnalysisConfig() {
// NOTE(liuyuanle): Why put the following code here?
@@ -496,16 +495,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(trt_optimization_level_);
CP_MEMBER(trt_ops_run_float_);
CP_MEMBER(trt_exclude_var_names_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
CP_MEMBER(dlnne_max_batchsize_);
CP_MEMBER(dlnne_use_static_batch_);
CP_MEMBER(dlnne_weight_share_mode_);
CP_MEMBER(dlnne_use_calib_mode_);
CP_MEMBER(dlnne_precision_mode_);
CP_MEMBER(dlnne_disable_nodes_by_outputs_);
CP_MEMBER(dlnne_input_shape_dict_);
// OneDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
@@ -619,23 +608,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
pass_builder_->AppendPass(pass);
}
}
if (use_dlnne_) {
auto all_passes = kDlnneSubgraphPasses;
auto other_passes = other.pass_builder()->AllPasses();
// We should sort them, because the user may call the SwitchIrDebug
// interface, which will change the pass.
std::sort(all_passes.begin(), all_passes.end());
std::sort(other_passes.begin(), other_passes.end());
std::vector<std::string> deleted_passes;
std::set_difference(all_passes.begin(),
all_passes.end(),
other_passes.begin(),
other_passes.end(),
std::inserter(deleted_passes, deleted_passes.begin()));
for (auto const &ps : deleted_passes) {
pass_builder_->DeletePass(ps);
}
}

for (auto &delete_pass : other.pass_builder()->GetAllDeletedPasses()) {
pass_builder_->DeletePass(delete_pass);
@@ -822,27 +794,6 @@ void AnalysisConfig::EnableLowPrecisionIO(bool x) {
enable_low_precision_io_ = x;
}

void AnalysisConfig::EnableDlnne(
int min_subgraph_size,
int max_batch_size,
bool use_static_batch,
std::string weight_share_mode,
std::unordered_set<std::string> disable_nodes_by_outputs,
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict,
bool use_calib_mode,
Precision precision_mode) {
use_dlnne_ = true;
dlnne_min_subgraph_size_ = min_subgraph_size;
dlnne_max_batchsize_ = max_batch_size;
dlnne_use_static_batch_ = use_static_batch;
dlnne_weight_share_mode_ = weight_share_mode;
dlnne_disable_nodes_by_outputs_ = disable_nodes_by_outputs;
dlnne_input_shape_dict_ = dlnne_input_shape_dict;
dlnne_use_calib_mode_ = use_calib_mode;
dlnne_precision_mode_ = precision_mode;
Update();
}

void AnalysisConfig::SetTRTDynamicShapeInfo(
std::map<std::string, std::vector<int>> min_input_shape,
std::map<std::string, std::vector<int>> max_input_shape,
@@ -1022,13 +973,6 @@ void AnalysisConfig::Update() {
}
}

if (use_dlnne_) {
pass_builder()->ClearPasses();
for (const auto &pass : kDlnneSubgraphPasses) {
pass_builder()->AppendPass(pass);
}
}

if (use_gpu() && use_cudnn_) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!enable_ir_optim_) {
@@ -1149,9 +1093,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << ";";
ss << trt_forbid_dynamic_op_;

ss << use_dlnne_;
ss << dlnne_min_subgraph_size_;

for (auto &op : trt_disabled_ops_) ss << op.c_str();
ss << ";";

@@ -1315,13 +1256,6 @@ void AnalysisConfig::DisableGlogInfo() {
Update();
}

void AnalysisConfig::PartiallyRelease() {
prog_file_.clear();
prog_file_.shrink_to_fit();
params_file_.clear();
params_file_.shrink_to_fit();
}

void AnalysisConfig::EnableGpuMultiStream() { thread_local_stream_ = true; }

std::string AnalysisConfig::Summary() {