
use dynload for cufft (PaddlePaddle#48)
1. Use dynload for cufft.
2. Fix unittest.
3. Temporarily disable ROCm.
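Point 1 means cuFFT is resolved at run time (dlopen/dlsym) instead of being linked at build time, so builds no longer carry a hard libcufft link dependency. Below is a minimal sketch of that pattern, assuming a POSIX loader; the names GetCufftHandle and LoadCufftSymbol are illustrative, not Paddle's actual dynload wrappers.

// Illustrative sketch only: resolve a cuFFT symbol at run time.
#include <dlfcn.h>
#include <stdexcept>
#include <string>

static void* GetCufftHandle() {
  // dlopen once and cache; a failed load stays null and throws on each call.
  static void* handle = dlopen("libcufft.so", RTLD_LAZY | RTLD_GLOBAL);
  if (handle == nullptr) {
    throw std::runtime_error("failed to dlopen libcufft.so");
  }
  return handle;
}

template <typename FuncPtr>
FuncPtr LoadCufftSymbol(const char* name) {
  void* sym = dlsym(GetCufftHandle(), name);
  if (sym == nullptr) {
    throw std::runtime_error(std::string("missing cuFFT symbol: ") + name);
  }
  return reinterpret_cast<FuncPtr>(sym);
}

A call site would fetch a typed pointer once, e.g. one matching cufftPlan1d's signature, then use it exactly like the directly linked function.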
Feiyu Chan authored Sep 16, 2021
1 parent 5e33e7f commit cc9d3a0
Showing 380 changed files with 23,423 additions and 5,019 deletions.
2 changes: 1 addition & 1 deletion cmake/external/xbyak.cmake
@@ -44,7 +44,7 @@ ExternalProject_Add(
DEPENDS ""
PREFIX ${XBYAK_PREFIX_DIR}
SOURCE_DIR ${XBYAK_SOURCE_DIR}
# UPDATE_COMMAND ""
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
ENDIF()

SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210830")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210909")
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
1 change: 1 addition & 0 deletions paddle/fluid/framework/CMakeLists.txt
@@ -223,6 +223,7 @@ if(WITH_PYTHON)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto)
py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto)
#Generate an empty __init__.py to make framework_py_proto a valid python module.
add_custom_target(fleet_proto_init ALL
21 changes: 0 additions & 21 deletions paddle/fluid/framework/details/share_tensor_buffer_functor.cc
@@ -35,27 +35,6 @@ namespace paddle {
namespace framework {
namespace details {

// TODO(zjl): support SelectedRows
static inline const Tensor &GetTensorFromVar(const Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor, but received %s.",
framework::ToTypeName(var->Type())));
}
}

static inline Tensor *GetMutableTensorFromVar(Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor, but received %s.",
framework::ToTypeName(var->Type())));
}
}

ShareTensorBufferFunctor::ShareTensorBufferFunctor(
Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
21 changes: 21 additions & 0 deletions paddle/fluid/framework/details/share_tensor_buffer_functor.h
@@ -39,6 +39,27 @@ namespace paddle {
namespace framework {
namespace details {

// TODO(zjl): support SelectedRows
static inline const Tensor &GetTensorFromVar(const Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor, but received %s.",
framework::ToTypeName(var->Type())));
}
}

static inline Tensor *GetMutableTensorFromVar(Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor, but received %s.",
framework::ToTypeName(var->Type())));
}
}

// NOTE(paddle-dev): ShareTensorBufferFunctor is responsible for
// performing memory reuse at runtime. ShareTensorBufferOpHandle
// is only a wrapper of ShareTensorBufferFunctor.
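The two helpers above moved verbatim out of share_tensor_buffer_functor.cc into this header (static inline, so each including translation unit gets its own internal copy). A hedged usage sketch, assuming Paddle's Variable/LoDTensor types as used above:

Variable var;
var.GetMutable<LoDTensor>();                             // set the payload type
Tensor* mutable_tensor = GetMutableTensorFromVar(&var);  // LoDTensor payload: ok
const Tensor& tensor = GetTensorFromVar(&var);           // same underlying buffer
// A Variable holding any other type would hit the PADDLE_THROW branch.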
4 changes: 4 additions & 0 deletions paddle/fluid/framework/device_worker.h
@@ -601,6 +601,10 @@ class SectionWorker : public DeviceWorker {
std::vector<std::string> backward_send_vars_;

std::vector<std::unique_ptr<OperatorBase>> ops_;
std::vector<OperatorBase*> forward_and_lr_ops_;
std::vector<OperatorBase*> forward_ops_;
std::vector<OperatorBase*> backward_ops_;
std::vector<OperatorBase*> optimizer_ops_;
std::shared_ptr<framework::ProgramDesc> program_;
std::unordered_map<const OperatorBase*, std::vector<std::string>>
unused_vars_;
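The four new vectors give SectionWorker role-based views over ops_, so each pipeline phase (forward plus LR scheduling, forward only, backward, optimizer) can execute just its slice per micro-batch. A hedged sketch of how they might be filled, assuming ops carry Paddle's op_role attribute and the framework::OpRole enum; the exact predicates in SectionWorker may differ:

for (auto& op : ops_) {
  int role = op->Attr<int>("op_role");  // assumption: role stored as an attr
  if (role == static_cast<int>(OpRole::kForward)) {
    forward_ops_.push_back(op.get());
  } else if (role & static_cast<int>(OpRole::kBackward)) {
    backward_ops_.push_back(op.get());
  } else if (role & static_cast<int>(OpRole::kOptimize)) {
    optimizer_ops_.push_back(op.get());
  }
}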
2 changes: 2 additions & 0 deletions paddle/fluid/framework/distributed_strategy.proto
@@ -43,6 +43,8 @@ message ShardingConfig {
optional bool pp_allreduce_in_optimize = 10 [ default = false ];
optional int32 pp_degree = 11 [ default = 1 ];
optional bool optimize_cast = 12 [ default = false ];
// Optimizer sharding. A temporary plan that may be deprecated.
optional bool _dp_as_optimizer_sharding = 13 [ default = false ];
}

message HybridConfig {
6 changes: 5 additions & 1 deletion paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -235,13 +235,15 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task) {

timeline.Start();
std::vector<std::vector<std::pair<uint64_t, char*>>> pass_values;
uint16_t pass_id = 0;

bool record_status = false;
#ifdef PADDLE_WITH_PSLIB
uint16_t pass_id = 0;
if (multi_node_) {
record_status = fleet_ptr->pslib_ptr_->_worker_ptr->take_sparse_record(
table_id_, pass_id, pass_values);
}
#endif
auto build_func = [device_num, record_status, &pass_values, &local_keys,
&local_ptr, &device_keys, &device_vals,
&device_mutex](int i) {
@@ -260,6 +262,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task) {
task_keys[shard].push_back(local_keys[i][j]);
task_ptrs[shard].push_back(local_ptr[i][j]);
}
#ifdef PADDLE_WITH_PSLIB
if (record_status) {
size_t local_keys_size = local_keys.size();
size_t pass_values_size = pass_values.size();
@@ -275,6 +278,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task) {
}
}
}
#endif
for (int dev = 0; dev < device_num; dev++) {
device_mutex[dev]->lock();

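The net effect in this file is that every PSLIB-only path is fenced with PADDLE_WITH_PSLIB, and pass_id now lives inside the guard: it is consumed only by take_sparse_record, so declaring it unconditionally would leave non-PSLIB builds with an unused variable (an error under -Werror). The shape of the change, sketched with an illustrative stand-in call:

bool record_status = false;
#ifdef PADDLE_WITH_PSLIB
uint16_t pass_id = 0;  // exists only when its sole consumer is compiled in
record_status = TakeSparseRecord(pass_id);  // hypothetical stand-in for the PSLIB call
#endif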
5 changes: 4 additions & 1 deletion paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
@@ -17,7 +17,7 @@
#include <string>

#include "paddle/fluid/framework/op_version_registry.h"

#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
class Scope;
@@ -335,6 +335,9 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
graph, name_scope_, param_scope(), true /*with_fc_bias*/);

AddStatis(fusion_count);

string::PrettyLogDetail("--- fused %d pairs of fc gru patterns",
fusion_count);
}

} // namespace ir
4 changes: 4 additions & 0 deletions paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -16,6 +16,7 @@
#include <string>

#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/string/pretty_log.h"

namespace paddle {
namespace framework {
@@ -348,6 +349,9 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);

AddStatis(fusion_count);

string::PrettyLogDetail("--- fused %d pairs of fc lstm patterns",
fusion_count);
}

} // namespace ir
85 changes: 85 additions & 0 deletions paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -97,6 +97,19 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
op->SetInput("WeightX", {inputs[2]});
op->SetInput("WeightH", {inputs[3]});
op->SetOutput("Hidden", {outputs[0]});
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
op->SetAttr("Scale_data", 1.0f);
op->SetAttr("Shift_data", 0.0f);
op->SetAttr("Weight_scale", std::vector<float>{1.0f});
} else if (type == "fusion_lstm") {
op->SetInput("X", {inputs[0]});
op->SetInput("Bias", {inputs[1]});
op->SetInput("WeightX", {inputs[2]});
op->SetInput("WeightH", {inputs[3]});

op->SetOutput("Hidden", {outputs[0]});
op->SetOutput("Cell", {outputs[1]});

op->SetAttr("mkldnn_data_type", mkldnn_data_type);
op->SetAttr("Scale_data", 1.0f);
op->SetAttr("Shift_data", 0.0f);
@@ -418,6 +431,25 @@ ProgramDesc BuildProgramDescFusionGru() {
return prog;
}

static const std::initializer_list<std::string> variable_names_fusion_lstm = {
"x", "wx", "wh", "b", "h", "c"};

// (x, wx, wh, b)->Fusion_lstm_1->(h, c)
ProgramDesc BuildProgramDescFusionLSTM() {
ProgramDesc prog;
for (auto& v : variable_names_fusion_lstm) {
auto* var = prog.MutableBlock(0)->Var(v);
if (v.find("wx") == 0 || v.find("wh") == 0 || v.find("b") == 0) {
var->SetPersistable(true);
}
}

SetOp(&prog, "fusion_lstm", "Fusion_lstm_1", {"x", "wx", "wh", "b"},
{"h", "c"}, true, "int8");

return prog;
}
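The persistable check relies on std::string::find returning position 0 exactly when the pattern is a prefix; every other outcome, including npos on a miss, is nonzero and therefore truthy, so each clause must compare against 0. A minimal standalone illustration:

#include <cassert>
#include <string>

int main() {
  assert(std::string("wh0").find("wh") == 0);  // prefix match -> position 0
  assert(std::string("xwh").find("wh") == 1);  // found, but not a prefix
  // A miss returns npos, which is nonzero and thus truthy in a bare condition:
  assert(std::string("b0").find("wh") == std::string::npos);
  return 0;
}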

void MainTestFusionGru(const ProgramDesc& prog, int gru_count, int quant_count,
int dequant_count, int added_nodes_count, float scale,
float shift) {
@@ -470,6 +502,59 @@ TEST(CpuQuantizePass, fusion_gru) {
dequant_count, added_nodes_count, 2. * 127, 128.);
}

void MainTestFusionLSTM(const ProgramDesc& prog, int expect_lstm_count,
int quant_count, int dequant_count,
int added_nodes_count, float scale, float shift) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int original_nodes_num, current_nodes_num;
PreparePass(&graph, prog, variable_names_fusion_lstm, &original_nodes_num,
&current_nodes_num);

int quantize_nodes_count = 0;
int dequantize_nodes_count = 0;
int lstm_nodes_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "fusion_lstm") {
lstm_nodes_count++;

auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name"));
EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_data")), scale)
<< "Scale_data for node '" + op_name + "'.";
EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Shift_data")), shift)
<< "Shift_data for node '" + op_name + "'.";
EXPECT_EQ(BOOST_GET_CONST(std::vector<float>,
op->GetAttr("Scale_weights"))[0],
scale)
<< "Scale_weights for node '" + op_name + "'.";
EXPECT_EQ(BOOST_GET_CONST(bool, op->GetAttr("force_fp32_output")), true)
<< "force_fp32_output for node '" + op_name + "'.";
} else if (op->Type() == "quantize") {
quantize_nodes_count++;
} else if (op->Type() == "dequantize") {
dequantize_nodes_count++;
}
}
}
EXPECT_EQ(lstm_nodes_count, expect_lstm_count);
EXPECT_EQ(quantize_nodes_count, quant_count);
EXPECT_EQ(dequantize_nodes_count, dequant_count);
EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
}

TEST(CpuQuantizePass, fusion_lstm) {
// (x, wx, wh, b)->Fusion_lstm->(h, c)
int expect_lstm_count = 1;
int expect_quant_count = 1;
int dequant_count = 0;
// 1 Quant + 1 IN + 0 DeQuant + 0 OUT
int added_nodes_count = 1 + 1 + 0 + 0;
MainTestFusionLSTM(BuildProgramDescFusionLSTM(), expect_lstm_count,
expect_quant_count, dequant_count, added_nodes_count,
2. * 127, 128.);
}

const std::vector<std::string> churn_out_vars(ProgramDesc* prog,
const std::string& prefix,
int number) {
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/op_compat_sensible_pass.cc
@@ -25,7 +25,7 @@ std::unordered_set<std::string> global_extra_attrs = {
"op_callstack", "op_device", "@ENABLE_CACHE_RUNTIME_CONTEXT@",
"is_test", "use_mkldnn", "mkldnn_data_type",
"use_quantizer", "mkldnn_data_type", "use_cudnn",
"name"};
"name", "with_quant_attr"};
}

namespace paddle {
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
@@ -157,7 +157,7 @@ bool SimplifyWithBasicOpsPass::SimplifyDropout(
float scale =
1.0f - BOOST_GET_CONST(float, dropout_op_desc->GetAttr("dropout_prob"));

framework::OpDesc new_op_desc;
framework::OpDesc new_op_desc(dropout_op_desc->Block());
new_op_desc.SetType("scale");
new_op_desc.SetInput("X", {dropout_x->Name()});
new_op_desc.SetOutput("Out", {dropout_out->Name()});
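For context, SimplifyDropout swaps a dropout op for a plain scale op at inference time: in dropout's default downgrade_in_infer mode, inference multiplies activations by 1 - dropout_prob, so dropout_prob = 0.1 becomes scale = 0.9. The one-line change above additionally constructs the replacement OpDesc with the dropout op's block, so the new scale op belongs to the same BlockDesc instead of none. A hedged sketch of the numeric equivalence (plain C++, not Paddle code):

#include <vector>

// Inference-time dropout (downgrade_in_infer) is just a uniform rescale.
std::vector<float> DropoutInfer(const std::vector<float>& x, float dropout_prob) {
  std::vector<float> out(x.size());
  const float scale = 1.0f - dropout_prob;  // e.g. p = 0.1 -> scale = 0.9
  for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] * scale;
  return out;  // identical to a "scale" op with this factor
}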
6 changes: 4 additions & 2 deletions paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -3,9 +3,11 @@ lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper l
graph_to_program_pass variable_helper timer monitor)

cc_library(workqueue SRCS workqueue.cc DEPS enforce)
cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS})
cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS})
cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util)

cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog)
cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context)
cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_garbage_collector stream_analyzer event_manager)
cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
58 changes: 58 additions & 0 deletions paddle/fluid/framework/new_executor/event_manager.cc
@@ -0,0 +1,58 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/event_manager.h"

namespace paddle {
namespace framework {

void EventManager::WaitEvent(const Instruction& instruction,
const platform::Place& place) {
// If InterpreterCore is on CPUPlace, do nothing.
if (platform::is_cpu_place(place)) return;

VLOG(3) << "Deal StreamWaitEventOrSync for "
<< instruction.kernel_func_.operator_base_->Type();
auto* dev_ctx = instruction.dev_ctx_;

WaitOrSync(instruction.intput_events_, dev_ctx);
}

void EventManager::RecordEvent(const Instruction& instruction,
const OpFuncNode& op_func_node,
const platform::Place& place) {
// If InterpreterCore is on CPUPlace, do nothing.
if (platform::is_cpu_place(place)) return;

for (auto& event : instruction.output_events_) {
VLOG(3) << "Record event in out_var_id: " << event.var_id_;
event.event_->Record(instruction.dev_ctx_);
}
}

void EventManager::WaitOrSync(const std::vector<EventInter>& events,
const platform::DeviceContext* dev_ctx) {
for (auto& event_iter : events) {
if (event_iter.is_sync_) {
VLOG(3) << "host sync wait in_var_id " << event_iter.var_id_;
event_iter.event_->Wait(platform::kCPU, dev_ctx);
} else {
VLOG(3) << "stream async wait in_var_id " << event_iter.var_id_;
event_iter.event_->Wait(platform::kCUDA, dev_ctx);
}
}
}

} // namespace framework
} // namespace paddle
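EventManager is the ordering piece of the new executor: WaitEvent blocks on (or stream-waits for) an instruction's input events before launch, and RecordEvent marks its output events afterwards so downstream instructions on other streams can wait on them. A hedged sketch of a call site, assuming an interpreter loop over Instruction objects as declared in event_manager.h; RunInstruction and the op_func_node_ member are illustrative names, not Paddle's actual loop:

EventManager event_manager;
for (auto& instr : instructions) {
  event_manager.WaitEvent(instr, place);   // sync or stream-wait input events
  RunInstruction(instr);                   // launch the op's kernel (stand-in)
  event_manager.RecordEvent(instr, instr.op_func_node_, place);  // mark outputs
}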