New ir support legacy kernel instruction (#55880)
* new ir remove fetch list
* fix pattern rewrite bug
* try to remove constant fold
* revert code
* add pattern rewrite test flag
* fix multi fetch
* remove useless code
* new ir support legacy kernel instruction
* new ir support legacy kernel instruction
* add scope prefix
* update
* update
* update
* update
* fix
* revert channel shuffle test
* polish code
* try to fix windows compile error
* polish code
* update
* update
* revert op test
Showing 15 changed files with 589 additions and 211 deletions.
@@ -1,4 +1,5 @@
cc_library(
  instruction_base
  SRCS instruction_base.cc phi_kernel_instruction.cc
       legacy_kernel_instruction.cc instruction_util.cc
  DEPS phi framework_proto)
paddle/fluid/framework/new_executor/instruction/instruction_util.cc (175 additions, 0 deletions)
@@ -0,0 +1,175 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <map>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"

#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/ir/core/builtin_attribute.h"
#include "paddle/ir/core/operation.h"
#include "paddle/ir/core/value.h"

#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
#include "paddle/fluid/platform/collective_helper.h"

namespace paddle {
namespace framework {

std::vector<int> GetValueIds(
    ir::Value value,
    Scope* inner_scope,
    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
    const std::map<std::string, int>& var_name_2_id,
    const std::unordered_map<const paddle::framework::Variable*, std::string>&
        variable_2_var_name) {
  std::vector<int> ids;
  std::string var_name = value_2_var_name.at(value);
  ids.push_back(var_name_2_id.at(var_name));
  // NOTE(zhangbo): Value may be a VariableRefArray.
  auto var = inner_scope->FindVar(var_name);
  if (var->IsType<paddle::framework::VariableRefArray>()) {
    auto& var_array = var->Get<paddle::framework::VariableRefArray>();
    for (auto item : var_array) {
      ids.push_back(var_name_2_id.at(variable_2_var_name.at(item)));
    }
  }
  return ids;
}

platform::DeviceContext* ParseDeviceContext(
    ir::Operation* op,
    platform::DeviceContext* origin_dev_ctx,
    const platform::Place& place,
    const std::string& execution_stream,
    const int stream_priority) {
  auto op_attributes = op->attributes();
  auto op_name =
      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
  interpreter::ContextManager& ctx_manager =
      interpreter::ContextManager::Instance();

  platform::DeviceContext* dev_ctx = nullptr;

  // Only GPU needs this update; XPU does not, because the XPU memcpy op
  // kernel is synchronous.
  if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
    VLOG(6) << "Parse DeviceContext for " << op_name
            << ", execution stream = " << execution_stream;
    if (execution_stream != kDefaultStream) {
      dev_ctx = ctx_manager
                    .Get(std::string(kCustomStream) + "-" + execution_stream,
                         place,
                         stream_priority)
                    .get()
                    .get();
      interpreter::SetDeviceCommContext(op, dev_ctx);
      return dev_ctx;
    }

    if (op_name == interpreter::kMemcpyD2H) {
      dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
                    .get()
                    .get();
      interpreter::SetDeviceCommContext(op, dev_ctx);
      return dev_ctx;
    } else if (op_name == interpreter::kMemcpyH2D) {
      dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
                    .get()
                    .get();
      interpreter::SetDeviceCommContext(op, dev_ctx);
      return dev_ctx;
    }

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    // NOTE(Ruibiao): This supports multi-stream overlap for c_allreduce_sum
    // with use_calc_stream==false by returning a device context obtained from
    // the global NCCLCommContext instance. When use_calc_stream==false, the OP
    // kernel launches the NCCL communication on a stream obtained directly
    // from the global NCCLCommContext instance rather than on the
    // DeviceContext passed in by the executor (see CAllReduceOpCUDAKernel in
    // c_allreduce_op.h). For now this is a temporary solution ONLY for
    // c_allreduce_sum, which is used in ResNet50 distributed training.
    if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
                                                .dyn_cast<::ir::BoolAttribute>()
                                                .data() == false) {
      int ring_id =
          op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
      return platform::NCCLCommContext::Instance()
          .Get(ring_id, place)
          ->dev_context();
    }
#endif
  }

  if (origin_dev_ctx != nullptr) {
    interpreter::SetDeviceCommContext(op, origin_dev_ctx);
  }
  return origin_dev_ctx;
}

OpFuncType AnalyseOpFuncType(::ir::Operation* op,
                             const platform::Place& place) {
  if (platform::is_cpu_place(place)) {
    return OpFuncType::kCpuSync;
  }

  auto kernel_key = op->attributes()
                        .at("kernel_key")
                        .dyn_cast<dialect::KernelAttribute>()
                        .data();
  if (phi::TransToPhiPlace(kernel_key.backend()).GetType() ==
      phi::AllocationType::CPU) {
    return OpFuncType::kCpuSync;
  }

  PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place),
                    true,
                    phi::errors::Fatal("Unsupported current place %s", place));

  // Some GPU OPs do not launch a CUDA kernel but spend a lot of time on CPU
  // computing. They execute serially on the device thread and block CUDA
  // kernel launches in other GPU OPs. To improve performance, mark them as
  // kGpuSync so that they are dispatched to the host thread.
  auto op_attributes = op->attributes();
  auto op_name =
      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
  if (op_name == kCoalesceTensor &&
      (!platform::is_xpu_place(place) ||
       op->attribute<ir::BoolAttribute>("persist_output").data() == false) &&
      op->attribute<ir::BoolAttribute>("set_constant").data() == false &&
      op->attribute<ir::BoolAttribute>("copy_data").data() == false) {
    return OpFuncType::kGpuSync;
  }

  // For memcpy explicitly called by the user.
  if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) {
    return OpFuncType::kGpuSync;
  }

  if (op_name == "shape") {
    return OpFuncType::kGpuSync;
  }
  return OpFuncType::kGpuAsync;
}

}  // namespace framework
}  // namespace paddle
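To see how these helpers fit together, here is a minimal sketch of a caller. It is not part of the diff: the wrapper function name and the stream arguments are assumptions, while ParseDeviceContext, AnalyseOpFuncType, and kDefaultStream come from the code above.

// Illustrative sketch only; the function name and call site are assumed,
// not taken from the commit.
#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"

namespace paddle {
namespace framework {

void ConfigureInstruction(ir::Operation* op,
                          platform::DeviceContext* origin_dev_ctx,
                          const platform::Place& place) {
  // Classify the op for scheduling: CPU-sync, GPU-sync, or GPU-async.
  OpFuncType func_type = AnalyseOpFuncType(op, place);

  // Select a device context: memcpy and c_allreduce_sum ops may get a
  // dedicated stream; everything else keeps origin_dev_ctx.
  platform::DeviceContext* dev_ctx =
      ParseDeviceContext(op,
                         origin_dev_ctx,
                         place,
                         /*execution_stream=*/kDefaultStream,
                         /*stream_priority=*/0);

  // ... store func_type and dev_ctx on the instruction object ...
  (void)func_type;
  (void)dev_ctx;
}

}  // namespace framework
}  // namespace paddle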
paddle/fluid/framework/new_executor/instruction/instruction_util.h (49 additions, 0 deletions)
@@ -0,0 +1,49 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <map>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/ir/core/builtin_attribute.h"
#include "paddle/ir/core/operation.h"
#include "paddle/ir/core/value.h"

namespace paddle {
namespace framework {

std::vector<int> GetValueIds(
    ir::Value value,
    Scope* inner_scope,
    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
    const std::map<std::string, int>& var_name_2_id,
    const std::unordered_map<const paddle::framework::Variable*, std::string>&
        variable_2_var_name);

platform::DeviceContext* ParseDeviceContext(
    ir::Operation* op,
    platform::DeviceContext* origin_dev_ctx,
    const platform::Place& place,
    const std::string& execution_stream,
    const int stream_priority);

OpFuncType AnalyseOpFuncType(::ir::Operation* op, const platform::Place& place);

}  // namespace framework
}  // namespace paddle
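For completeness, a hedged example of calling GetValueIds. The scope and map arguments mirror the parameter names above, but how the interpreter populates them is assumed rather than shown in this diff.

// Illustrative only: collect the variable ids behind one ir::Value, e.g.
// when wiring input/output dependencies for an instruction.
std::vector<int> ids = GetValueIds(
    value, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name);
// `ids` holds the id of the variable bound to `value`; if that variable is a
// VariableRefArray, the ids of its member variables are appended as well.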