New ir support legacy kernel instruction (#55880)
* new ir remove fetch list

* fix pattern rewrite bug

* try to remove constant fold

* revert code

* add pattern rewrite test flag

* fix multi fetch

* remove useless code

* new ir support legacy kernel instruction

* new ir support legacy kernel instruction

* add scope prefix

* update

* update

* update

* update

* fix

* revert channel shuffle test

* polish code

* try to fix windows compile error

* polish code

* update

* update

* revert op test
phlrain authored Aug 8, 2023
1 parent 393db4a commit f9c2f4c
Showing 15 changed files with 589 additions and 211 deletions.
@@ -1,4 +1,5 @@
cc_library(
  instruction_base
  SRCS instruction_base.cc phi_kernel_instruction.cc
       legacy_kernel_instruction.cc instruction_util.cc
  DEPS phi framework_proto)
@@ -13,9 +13,15 @@
// limitations under the License.

#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h"

#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"

#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/ir/core/builtin_attribute.h"

namespace paddle {
namespace framework {

@@ -93,5 +99,59 @@ void InstructionBase::SetOutputs(
  output_index_ = outputs;
}

void InstructionBase::InitInputsOutputsIds(
    ::ir::Operation* op,
    Scope* inner_scope,
    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
    const std::map<std::string, int>& var_name_2_id,
    const std::unordered_map<const paddle::framework::Variable*, std::string>&
        variable_2_var_name) {
  auto op_attributes = op->attributes();
  auto op_name =
      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
  std::unordered_map<ir::Value, std::vector<int>> inputs;
  for (size_t i = 0; i < op->num_operands(); i++) {
    ir::Value value = op->operand_source(i);
    if (value) {
      PADDLE_ENFORCE_NE(
          value_2_var_name.find(value),
          value_2_var_name.end(),
          phi::errors::PreconditionNotMet(
              "input should be in name map, [%d] 'th input of [%s] op",
              i,
              op_name));
      std::vector<int> inputs_id = GetValueIds(value,
                                               inner_scope,
                                               value_2_var_name,
                                               var_name_2_id,
                                               variable_2_var_name);
      inputs.emplace(value, inputs_id);
    }
  }
  SetInputs(inputs);
  VLOG(8) << "finish process inputs_index";

  std::unordered_map<ir::Value, std::vector<int>> outputs;
  for (size_t i = 0; i < op->num_results(); i++) {
    ir::Value value = op->result(i);
    if (value && value.type()) {
      PADDLE_ENFORCE_NE(
          value_2_var_name.find(value),
          value_2_var_name.end(),
          phi::errors::PreconditionNotMet(
              "output should be in name map, [%d] 'th output of [%s] op",
              i,
              op_name));
      std::vector<int> outputs_id = GetValueIds(value,
                                                inner_scope,
                                                value_2_var_name,
                                                var_name_2_id,
                                                variable_2_var_name);
      outputs.emplace(value, outputs_id);
    }
  }
  SetOutputs(outputs);
  VLOG(8) << "finish process outputs_index";
}

} // namespace framework
} // namespace paddle
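
InitInputsOutputsIds is the shared bookkeeping step that concrete instructions run once at construction time, resolving every operand and result ir::Value to variable ids. A minimal sketch of a derived instruction using it follows; the class name, the InstructionBase(id, place) constructor call, and the origin of the four maps are assumptions for illustration (and it assumes the method is protected, as the header change below suggests), not part of this commit.

// Hypothetical sketch of a concrete instruction built on InstructionBase.
// Only InitInputsOutputsIds and its parameter types come from this commit;
// the class, its name, and the base-constructor signature are assumptions.
class MyKernelInstruction : public paddle::framework::InstructionBase {
 public:
  MyKernelInstruction(
      size_t id,
      const platform::Place& place,
      ::ir::Operation* op,
      paddle::framework::Scope* inner_scope,
      const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
      const std::map<std::string, int>& var_name_2_id,
      const std::unordered_map<const paddle::framework::Variable*,
                               std::string>& variable_2_var_name)
      : InstructionBase(id, place) {
    // Resolve operand/result values to variable ids once, so later runs can
    // index directly into the scope.
    InitInputsOutputsIds(
        op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name);
  }

  const std::string& Name() const override { return name_; }
  // Remaining pure-virtual hooks (e.g. a Run method) are omitted here.

 private:
  std::string name_{"my_kernel_instruction"};
};
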
@@ -21,6 +21,7 @@

#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/ir/core/value.h"

namespace ir {
class Value;
@@ -137,7 +138,15 @@ class InstructionBase {

virtual const std::string& Name() const = 0;

 private:
  void InitInputsOutputsIds(
      ::ir::Operation* op,
      Scope* inner_scope,
      const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
      const std::map<std::string, int>& var_name_2_id,
      const std::unordered_map<const paddle::framework::Variable*, std::string>&
          variable_2_var_name);

 protected:
  size_t id_;

  bool is_artificial_;  // Instruction is artificial means that it is only used
175 changes: 175 additions & 0 deletions paddle/fluid/framework/new_executor/instruction/instruction_util.cc
@@ -0,0 +1,175 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <map>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"

#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/ir/core/builtin_attribute.h"
#include "paddle/ir/core/operation.h"
#include "paddle/ir/core/value.h"

#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
#include "paddle/fluid/platform/collective_helper.h"

namespace paddle {
namespace framework {

std::vector<int> GetValueIds(
    ir::Value value,
    Scope* inner_scope,
    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
    const std::map<std::string, int>& var_name_2_id,
    const std::unordered_map<const paddle::framework::Variable*, std::string>&
        variable_2_var_name) {
  std::vector<int> ids;
  std::string var_name = value_2_var_name.at(value);
  ids.push_back(var_name_2_id.at(var_name));
  // NOTE(zhangbo): a Value may be a VariableRefArray; if it is, also collect
  // the ids of the variables it references.
  auto var = inner_scope->FindVar(var_name);
  if (var->IsType<paddle::framework::VariableRefArray>()) {
    auto& var_array = var->Get<paddle::framework::VariableRefArray>();
    for (auto item : var_array) {
      ids.push_back(var_name_2_id.at(variable_2_var_name.at(item)));
    }
  }
  return ids;
}
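
A short usage sketch: given the maps the interpreter builds, GetValueIds turns one SSA value into the list of variable ids it touches. Here `op`, `scope`, and the three maps are assumed to exist already; only the call itself reflects the function above.

// Hypothetical usage: collect the variable ids behind an op's first input.
::ir::Value in = op->operand_source(0);
if (in) {
  std::vector<int> ids = paddle::framework::GetValueIds(
      in, scope, value_2_var_name, var_name_2_id, variable_2_var_name);
  // ids[0] is the id of the named variable; if that variable is a
  // VariableRefArray, the ids of the referenced variables follow it.
}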

platform::DeviceContext* ParseDeviceContext(
    ir::Operation* op,
    platform::DeviceContext* origin_dev_ctx,
    const platform::Place& place,
    const std::string& execution_stream,
    const int stream_priority) {
  auto op_attributes = op->attributes();
  auto op_name =
      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
  interpreter::ContextManager& ctx_manager =
      interpreter::ContextManager::Instance();

  platform::DeviceContext* dev_ctx = nullptr;

  // Only GPU needs this update; XPU does not, because the XPU memcpy op
  // kernel is synchronous.
  if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
    VLOG(6) << "Parse DeviceContext for " << op_name
            << ", execution stream = " << execution_stream;
    if (execution_stream != kDefaultStream) {
      dev_ctx = ctx_manager
                    .Get(std::string(kCustomStream) + "-" + execution_stream,
                         place,
                         stream_priority)
                    .get()
                    .get();
      interpreter::SetDeviceCommContext(op, dev_ctx);
      return dev_ctx;
    }

    if (op_name == interpreter::kMemcpyD2H) {
      dev_ctx =
          ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
              .get()
              .get();
      interpreter::SetDeviceCommContext(op, dev_ctx);
      return dev_ctx;
    } else if (op_name == interpreter::kMemcpyH2D) {
      dev_ctx =
          ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
              .get()
              .get();
      interpreter::SetDeviceCommContext(op, dev_ctx);
      return dev_ctx;
    }

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    // NOTE(Ruibiao): This supports multi-stream overlap for c_allreduce_sum
    // with use_calc_stream==false by returning a device context obtained from
    // the global NCCLCommContext instance. When use_calc_stream==false, the
    // OP kernel launches the NCCL communication on the stream taken directly
    // from the global NCCLCommContext instance rather than on the
    // DeviceContext passed in by the executor (see CAllReduceOpCUDAKernel in
    // c_allreduce_op.h). For now this is a temporary solution ONLY for
    // c_allreduce_sum, which is used in ResNet50 distributed training.
    if (op_name == "c_allreduce_sum" &&
        op_attributes.at("use_calc_stream")
                .dyn_cast<::ir::BoolAttribute>()
                .data() == false) {
      int ring_id =
          op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
      return platform::NCCLCommContext::Instance()
          .Get(ring_id, place)
          ->dev_context();
    }
#endif
  }

  if (origin_dev_ctx != nullptr) {
    interpreter::SetDeviceCommContext(op, origin_dev_ctx);
  }
  return origin_dev_ctx;
}
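
For context, a hedged sketch of how a caller might use ParseDeviceContext when building an instruction; `op`, `origin_dev_ctx`, and `place` are assumed to come from the interpreter, and a priority of 0 is an arbitrary illustrative value.

// Hypothetical usage: choose the device context (and thus the stream) for
// one instruction. kDefaultStream means no user-specified stream.
platform::DeviceContext* dev_ctx =
    paddle::framework::ParseDeviceContext(op,
                                          origin_dev_ctx,
                                          place,
                                          /*execution_stream=*/kDefaultStream,
                                          /*stream_priority=*/0);
// On GPU, memcpy_d2h / memcpy_h2d ops get dedicated D2H / H2D stream
// contexts; everything else falls back to origin_dev_ctx.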

OpFuncType AnalyseOpFuncType(::ir::Operation* op,
                             const platform::Place& place) {
  if (platform::is_cpu_place(place)) {
    return OpFuncType::kCpuSync;
  }

  auto kernel_key = op->attributes()
                        .at("kernel_key")
                        .dyn_cast<dialect::KernelAttribute>()
                        .data();
  if (phi::TransToPhiPlace(kernel_key.backend()).GetType() ==
      phi::AllocationType::CPU) {
    return OpFuncType::kCpuSync;
  }

  PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place),
                    true,
                    phi::errors::Fatal("Unsupported current place %s", place));

  // Some GPU OPs do not launch a CUDA kernel but spend a lot of time on CPU
  // computation. They execute serially on the device thread and block the
  // CUDA kernel launches of other GPU OPs. To improve performance, mark them
  // as kGpuSync so that they are dispatched to the host thread.
  auto op_attributes = op->attributes();
  auto op_name =
      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
  if (op_name == kCoalesceTensor &&
      (!platform::is_xpu_place(place) ||
       op->attribute<ir::BoolAttribute>("persist_output").data() == false) &&
      op->attribute<ir::BoolAttribute>("set_constant").data() == false &&
      op->attribute<ir::BoolAttribute>("copy_data").data() == false) {
    return OpFuncType::kGpuSync;
  }

  // For memcpy explicitly called by the user.
  if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) {
    return OpFuncType::kGpuSync;
  }

  if (op_name == "shape") {
    return OpFuncType::kGpuSync;
  }
  return OpFuncType::kGpuAsync;
}

} // namespace framework
} // namespace paddle
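
AnalyseOpFuncType classifies each op so the scheduler knows whether launching it blocks. A minimal sketch of how the result might drive dispatch follows; `RunInline` and `DispatchToDeviceThread` are hypothetical helpers, not framework API.

// Hypothetical dispatch based on the classification above.
OpFuncType type = paddle::framework::AnalyseOpFuncType(op, place);
switch (type) {
  case OpFuncType::kCpuSync:
  case OpFuncType::kGpuSync:
    RunInline(op);  // run on the scheduling thread; completes before return
    break;
  case OpFuncType::kGpuAsync:
    DispatchToDeviceThread(op);  // enqueue the kernel launch and return
    break;
}
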
49 changes: 49 additions & 0 deletions paddle/fluid/framework/new_executor/instruction/instruction_util.h
@@ -0,0 +1,49 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <map>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/ir/core/builtin_attribute.h"
#include "paddle/ir/core/operation.h"
#include "paddle/ir/core/value.h"

namespace paddle {
namespace framework {

std::vector<int> GetValueIds(
    ir::Value value,
    Scope* inner_scope,
    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
    const std::map<std::string, int>& var_name_2_id,
    const std::unordered_map<const paddle::framework::Variable*, std::string>&
        variable_2_var_name);

platform::DeviceContext* ParseDeviceContext(
    ir::Operation* op,
    platform::DeviceContext* origin_dev_ctx,
    const platform::Place& place,
    const std::string& execution_stream,
    const int stream_priority);

OpFuncType AnalyseOpFuncType(::ir::Operation* op, const platform::Place& place);

} // namespace framework
} // namespace paddle
