Skip to content

Commit

Permalink
Merge pull request #20 from PaddlePaddle/develop
Browse files Browse the repository at this point in the history
update
  • Loading branch information
AnnaTrainingG authored Jul 12, 2021
2 parents 3ce9983 + 0b20b76 commit 61842ed
Show file tree
Hide file tree
Showing 152 changed files with 6,550 additions and 1,535 deletions.
3 changes: 3 additions & 0 deletions AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,6 @@
| zhaopu7 | Pu Zhao |
| zhouxiao-coder | Xiao Zhou |
| Zrachel | Rui-Qing Zhang |
| jeng1220 | Bai-Cheng(Ryan) Jeng (NVIDIA) |
| mingxu1067 | Ming Huang (NVIDIA) |
| zlsh80826 | Reese Wang (NVIDIA) |
1 change: 1 addition & 0 deletions cmake/cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,4 @@ endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

include(thrust)
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ ELSE ()
ENDIF()

SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210625")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701")
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
Expand Down
2 changes: 2 additions & 0 deletions cmake/hip.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,5 @@ message(STATUS "HIP library name: ${hip_library_name}")
# set HIP link libs
find_library(ROCM_HIPRTC_LIB ${hip_library_name} HINTS ${HIP_PATH}/lib)
message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}")

include(thrust)
24 changes: 24 additions & 0 deletions cmake/thrust.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Detects whether the Thrust library shipped with the active CUDA toolkit is
# recent enough to provide thrust/shuffle.h. If the probe program fails to
# compile/run, the local patched Thrust headers are put on the include path.
function(add_thrust_patches_if_necessary)
# Write a tiny CUDA probe that includes thrust/shuffle.h and prints the
# Thrust version; compilation fails on toolkits whose Thrust lacks shuffle.h.
set(thrust_detect_file ${PROJECT_BINARY_DIR}/detect_thrust.cu)
file(WRITE ${thrust_detect_file} ""
"#include \"thrust/version.h\"\n"
"#include \"thrust/shuffle.h\"\n"
"#include \"stdio.h\"\n"
"int main() {\n"
" int version = THRUST_VERSION;\n"
" printf(\"%d\", version);\n"
" return 0;\n"
"}\n")

# "nvcc --run" compiles and executes the probe in one step; a nonzero
# result means the toolkit's Thrust cannot build it.
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
"--run" "${thrust_detect_file}"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res ERROR_QUIET)
if(NOT nvcc_res EQUAL 0)
# Fall back to the bundled header patches under patches/thrust.
set(thrust_patches "${PADDLE_SOURCE_DIR}/patches/thrust")
message(STATUS "Add thrust patches: ${thrust_patches}")
include_directories(${thrust_patches})
endif()
endfunction()

add_thrust_patches_if_necessary()
4 changes: 2 additions & 2 deletions paddle/fluid/extension/include/ext_all.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ limitations under the License. */

#pragma once

#if !defined(_MSC_VER) && __cplusplus < 199711L
#error C++11 or later compatible compiler is required to use Paddle.
#if !defined(_MSC_VER) && __cplusplus < 201402L
#error C++14 or later compatible compiler is required to use Paddle.
#endif

#ifdef _WIN32
Expand Down
68 changes: 68 additions & 0 deletions paddle/fluid/framework/data_feed.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ USE_INT_STAT(STAT_total_feasign_num_in_mem);
namespace paddle {
namespace framework {

// Returns the process-wide DLManager used to load and cache custom-parser
// shared libraries. Function-local static: constructed on first use with
// thread-safe initialization (guaranteed since C++11).
DLManager& global_dlmanager_pool() {
static DLManager manager;
return manager;
}

void RecordCandidateList::ReSize(size_t length) {
mutex_.lock();
capacity_ = length;
Expand Down Expand Up @@ -366,6 +371,10 @@ void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
template <typename T>
void InMemoryDataFeed<T>::LoadIntoMemory() {
#ifdef _LINUX
if (!so_parser_name_.empty()) {
LoadIntoMemoryFromSo();
return;
}
VLOG(3) << "LoadIntoMemory() begin, thread_id=" << thread_id_;
std::string filename;
while (this->PickOneFile(&filename)) {
Expand Down Expand Up @@ -408,6 +417,51 @@ void InMemoryDataFeed<T>::LoadIntoMemory() {
#endif
}

// Reads every picked file line by line and converts each line into a T
// record via the user-supplied parser plugin (so_parser_name_), pushing the
// records into input_channel_. Linux-only: compiled to a no-op elsewhere.
template <typename T>
void InMemoryDataFeed<T>::LoadIntoMemoryFromSo() {
#ifdef _LINUX
VLOG(3) << "LoadIntoMemoryFromSo() begin, thread_id=" << thread_id_;

string::LineFileReader reader;
// Load (or fetch the cached) parser from the shared library; the parser is
// owned by the global DLManager pool, not by this feed.
paddle::framework::CustomParser* parser =
global_dlmanager_pool().Load(so_parser_name_, slot_conf_);

std::string filename;
while (this->PickOneFile(&filename)) {
VLOG(3) << "PickOneFile, filename=" << filename
<< ", thread_id=" << thread_id_;
int err_no = 0;
this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_);
CHECK(this->fp_ != nullptr);
// Caller-managed stream locking: this thread is the only reader of fp_.
__fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER);

paddle::framework::ChannelWriter<T> writer(input_channel_);
T instance;
platform::Timer timeline;
timeline.Start();

while (1) {
if (!reader.getline(&*(fp_.get()))) {
break;
} else {
const char* str = reader.get();
ParseOneInstanceFromSo(str, &instance, parser);
}

// Hand the parsed record to the channel, then reset `instance` since its
// state is unspecified after the move.
writer << std::move(instance);
instance = T();
}

writer.Flush();
timeline.Pause();
VLOG(3) << "LoadIntoMemoryFromSo() read all lines, file=" << filename
<< ", cost time=" << timeline.ElapsedSec()
<< " seconds, thread_id=" << thread_id_;
}
VLOG(3) << "LoadIntoMemoryFromSo() end, thread_id=" << thread_id_;
#endif
}

// explicit instantiation
template class InMemoryDataFeed<Record>;

Expand Down Expand Up @@ -827,16 +881,23 @@ void MultiSlotInMemoryDataFeed::Init(
inductive_shape_index_.resize(all_slot_num);
use_slots_.clear();
use_slots_is_dense_.clear();
slot_conf_.resize(all_slot_num);
for (size_t i = 0; i < all_slot_num; ++i) {
const auto& slot = multi_slot_desc.slots(i);
all_slots_[i] = slot.name();
all_slots_type_[i] = slot.type();
use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;

slot_conf_[i].name = slot.name();
slot_conf_[i].type = slot.type();
slot_conf_[i].use_slots_index = use_slots_index_[i];

total_dims_without_inductive_[i] = 1;
inductive_shape_index_[i] = -1;
if (slot.is_used()) {
use_slots_.push_back(all_slots_[i]);
use_slots_is_dense_.push_back(slot.is_dense());
slot_conf_[i].use_slots_is_dense = slot.is_dense();
std::vector<int> local_shape;
if (slot.is_dense()) {
for (int j = 0; j < slot.shape_size(); ++j) {
Expand Down Expand Up @@ -869,6 +930,7 @@ void MultiSlotInMemoryDataFeed::Init(
}
visit_.resize(all_slot_num, false);
pipe_command_ = data_feed_desc.pipe_command();
so_parser_name_ = data_feed_desc.so_parser_name();
finish_init_ = true;
input_type_ = data_feed_desc.input_type();
}
Expand All @@ -887,6 +949,12 @@ void MultiSlotInMemoryDataFeed::GetMsgFromLogKey(const std::string& log_key,
*rank = (uint32_t)strtoul(rank_str.c_str(), NULL, 16);
}

// Delegates parsing of one raw input line to the user-supplied parser
// plugin (see LoadIntoMemoryFromSo, which obtains `parser` from the global
// DLManager pool).
void MultiSlotInMemoryDataFeed::ParseOneInstanceFromSo(const char* str,
Record* instance,
CustomParser* parser) {
parser->ParseOneInstance(str, instance);
}

bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
#ifdef _LINUX
thread_local string::LineFileReader reader;
Expand Down
95 changes: 95 additions & 0 deletions paddle/fluid/framework/data_feed.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,94 @@ using PvInstance = PvInstanceObject*;

inline PvInstance make_pv_instance() { return new PvInstanceObject(); }

// Per-slot configuration handed to a CustomParser via Init(); filled in
// MultiSlotInMemoryDataFeed::Init() from the MultiSlotDesc proto.
struct SlotConf {
// Slot name from the proto description.
std::string name;
// Slot data type string (e.g. as given by slot.type() in the proto).
std::string type;
// Index into use_slots_ when the slot is used, -1 otherwise.
int use_slots_index;
// Nonzero when the slot is dense (int rather than bool to keep the
// plugin-facing ABI plain).
int use_slots_is_dense;
};

// Interface for user-provided record parsers loaded from a shared library
// (see DLManager). Implementations are created through the library's
// exported "CreateParserObject" factory and driven by
// InMemoryDataFeed::LoadIntoMemoryFromSo().
class CustomParser {
 public:
  CustomParser() = default;
  // Virtual destructor: DLManager deletes instances through CustomParser*,
  // so the derived destructor must run (`= default` instead of an empty
  // user-provided body).
  virtual ~CustomParser() = default;
  // Receives the slot configuration once, before any parsing happens.
  virtual void Init(const std::vector<SlotConf>& slots) = 0;
  // Parses one textual input line into *instance.
  virtual void ParseOneInstance(const char* str, Record* instance) = 0;
};

// Signature of the factory each parser library must export under the name
// "CreateParserObject" (resolved with dlsym in DLManager::Load).
using CreateParserObjectFunc = paddle::framework::CustomParser* (*)();

// Owns dynamically loaded parser plugins. Maps a shared-library path to its
// dlopen() handle plus the CustomParser instance created by the library's
// exported "CreateParserObject" factory. All map accesses are serialized by
// mutex_. Linux-only; on other platforms the methods log and fail.
class DLManager {
  struct DLHandle {
    void* module;                               // dlopen() handle
    paddle::framework::CustomParser* parser;    // owned by DLManager
  };

 public:
  DLManager() {}

  ~DLManager() {
#ifdef _LINUX
    std::lock_guard<std::mutex> lock(mutex_);
    for (auto it = handle_map_.begin(); it != handle_map_.end(); ++it) {
      delete it->second.parser;
      dlclose(it->second.module);
    }
#endif
  }

  // Unloads the library registered under `name`. Returns true when the
  // entry is absent or was successfully removed.
  bool Close(const std::string& name) {
#ifdef _LINUX
    // Fix: take the lock and erase the map entry. The previous version
    // left a dangling entry behind after dlclose(), so a later Load() (and
    // therefore ReLoad()) returned a freed parser; it also fell through to
    // the "windows" path and returned false even on success.
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = handle_map_.find(name);
    if (it == handle_map_.end()) {
      return true;
    }
    delete it->second.parser;
    dlclose(it->second.module);
    handle_map_.erase(it);
    return true;
#endif
    VLOG(0) << "Not implement in windows";
    return false;
  }

  // Returns the parser registered under `name`, loading the shared library
  // on first use and caching it. Returns nullptr when the library cannot
  // be opened or does not export the factory symbol.
  paddle::framework::CustomParser* Load(const std::string& name,
                                        std::vector<SlotConf>& conf) {
#ifdef _LINUX
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = handle_map_.find(name);
    if (it != handle_map_.end()) {
      return it->second.parser;
    }

    DLHandle handle;
    handle.module = dlopen(name.c_str(), RTLD_NOW);
    if (handle.module == nullptr) {
      VLOG(0) << "Create so of " << name << " fail";
      return nullptr;
    }

    CreateParserObjectFunc create_parser_func =
        reinterpret_cast<CreateParserObjectFunc>(
            dlsym(handle.module, "CreateParserObject"));
    // Fix: guard against a library that does not export the factory; the
    // previous version dereferenced a null function pointer here.
    if (create_parser_func == nullptr) {
      VLOG(0) << "Create so of " << name << " fail";
      dlclose(handle.module);
      return nullptr;
    }
    handle.parser = create_parser_func();
    handle.parser->Init(conf);
    handle_map_.insert({name, handle});

    return handle.parser;
#endif
    VLOG(0) << "Not implement in windows";
    return nullptr;
  }

  // Drops any cached instance of `name` and loads it afresh.
  paddle::framework::CustomParser* ReLoad(const std::string& name,
                                          std::vector<SlotConf>& conf) {
    Close(name);
    return Load(name, conf);
  }

 private:
  std::mutex mutex_;  // guards handle_map_
  std::map<std::string, DLHandle> handle_map_;
};

class DataFeed {
public:
DataFeed() {
Expand Down Expand Up @@ -252,6 +340,8 @@ class DataFeed {
bool finish_set_filelist_;
bool finish_start_;
std::string pipe_command_;
std::string so_parser_name_;
std::vector<SlotConf> slot_conf_;
std::vector<std::string> ins_id_vec_;
std::vector<std::string> ins_content_vec_;
platform::Place place_;
Expand Down Expand Up @@ -324,10 +414,13 @@ class InMemoryDataFeed : public DataFeed {
virtual void SetEnablePvMerge(bool enable_pv_merge);
virtual void SetCurrentPhase(int current_phase);
virtual void LoadIntoMemory();
virtual void LoadIntoMemoryFromSo();

protected:
virtual bool ParseOneInstance(T* instance) = 0;
virtual bool ParseOneInstanceFromPipe(T* instance) = 0;
virtual void ParseOneInstanceFromSo(const char* str, T* instance,
CustomParser* parser) {}
virtual void PutToFeedVec(const std::vector<T>& ins_vec) = 0;

int thread_id_;
Expand Down Expand Up @@ -688,6 +781,8 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed<Record> {
protected:
virtual bool ParseOneInstance(Record* instance);
virtual bool ParseOneInstanceFromPipe(Record* instance);
virtual void ParseOneInstanceFromSo(const char* str, Record* instance,
CustomParser* parser);
virtual void PutToFeedVec(const std::vector<Record>& ins_vec);
virtual void GetMsgFromLogKey(const std::string& log_key, uint64_t* search_id,
uint32_t* cmatch, uint32_t* rank);
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/data_feed.proto
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,5 @@ message DataFeedDesc {
optional string rank_offset = 6;
optional int32 pv_batch_size = 7 [ default = 32 ];
optional int32 input_type = 8 [ default = 0 ];
optional string so_parser_name = 9;
}
3 changes: 3 additions & 0 deletions paddle/fluid/framework/device_worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,7 @@ class SectionWorker : public DeviceWorker {
void RunUpdate(
std::unique_ptr<GarbageCollector>&,
std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
void PrepareUnusedVar();

protected:
int section_id_;
Expand All @@ -595,6 +596,8 @@ class SectionWorker : public DeviceWorker {

std::vector<std::unique_ptr<OperatorBase>> ops_;
std::shared_ptr<framework::ProgramDesc> program_;
std::unordered_map<const OperatorBase*, std::vector<std::string>>
unused_vars_;
static uint64_t batch_id_;

platform::DeviceContext* dev_ctx_ = nullptr;
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/framework/distributed_strategy.proto
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -189,6 +190,7 @@ message DistributedStrategy {
optional bool without_graph_optimization = 30 [ default = false ];
optional int32 fuse_grad_size_in_num = 31 [ default = 1 ];
optional bool calc_comm_same_stream = 32 [ default = false ];
optional bool asp = 33 [ default = false ];

optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ ConvElementwiseAdd2ActFusePass::ConvElementwiseAdd2ActFusePass() {
.End()
.AddAttr("axis")
// the first elementwise_add-axis needs to be 1, the second has to be -1
.IsIntIn({1, -1})
// or 0
.IsIntIn({1, -1, 0})
.End();

AddOpCompat(OpCompat("relu"))
Expand Down
25 changes: 20 additions & 5 deletions paddle/fluid/framework/ir/graph_pattern_detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2262,11 +2262,26 @@ PDNode *patterns::QuantizePlacement::operator()(
PDNode *patterns::Bfloat16Placement::operator()(
const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
std::unordered_set<std::string> supported_op_types =
std::unordered_set<std::string>(
{"concat", "conv2d", "conv2d_transpose", "elementwise_add",
"elementwise_mul", "fc", "fusion_gru", "fusion_lstm", "gelu",
"layer_norm", "matmul", "matmul_v2", "pool2d", "relu", "reshape2",
"softmax", "split", "sum", "transpose2"});
std::unordered_set<std::string>({"concat",
"conv2d",
"conv2d_transpose",
"elementwise_add",
"elementwise_mul",
"fc",
"fusion_gru",
"fusion_lstm",
"gelu",
"layer_norm",
"matmul",
"matmul_v2",
"pool2d",
"prelu",
"relu",
"reshape2",
"softmax",
"split",
"sum",
"transpose2"});
if (!bfloat16_enabled_op_types.empty()) {
supported_op_types = bfloat16_enabled_op_types;
}
Expand Down
Loading

0 comments on commit 61842ed

Please sign in to comment.