diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 0a20856c0fd05..f8a4d09924435 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -354,10 +354,10 @@ cc_library(executor_cache SRCS executor_cache.cc DEPS parallel_executor) if(WITH_PSCORE) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS - conditional_block_op executor ${RPC_DEPS}) + conditional_block_op executor gloo_wrapper ${RPC_DEPS}) else() cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS - conditional_block_op executor) + conditional_block_op executor gloo_wrapper) endif() cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 87afda459624f..fdb24ee18eca7 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -257,6 +257,11 @@ bool InMemoryDataFeed::Start() { output_channel_->Write(std::move(data)); } #endif + if (batch_offsets_.size() > 0) { + VLOG(3) << "batch_size offsets: " << batch_offsets_.size(); + enable_heterps_ = true; + this->offset_index_ = 0; + } this->finish_start_ = true; return true; } @@ -265,34 +270,64 @@ template int InMemoryDataFeed::Next() { #ifdef _LINUX this->CheckStart(); - CHECK(output_channel_ != nullptr); - CHECK(consume_channel_ != nullptr); - VLOG(3) << "output_channel_ size=" << output_channel_->Size() - << ", consume_channel_ size=" << consume_channel_->Size() - << ", thread_id=" << thread_id_; - int index = 0; - T instance; - std::vector ins_vec; - ins_vec.reserve(this->default_batch_size_); - while (index < this->default_batch_size_) { - if (output_channel_->Size() == 0) { - break; + if (!enable_heterps_) { + CHECK(output_channel_ != nullptr); + CHECK(consume_channel_ != nullptr); + VLOG(3) << "output_channel_ size=" << output_channel_->Size() + << ", consume_channel_ size=" << consume_channel_->Size() + << ", thread_id=" << thread_id_; + int index = 0; + T instance; + std::vector ins_vec; + ins_vec.reserve(this->default_batch_size_); + while (index < this->default_batch_size_) { + if (output_channel_->Size() == 0) { + break; + } + output_channel_->Get(instance); + ins_vec.push_back(instance); + ++index; + consume_channel_->Put(std::move(instance)); + } + this->batch_size_ = index; + VLOG(3) << "batch_size_=" << this->batch_size_ + << ", thread_id=" << thread_id_; + if (this->batch_size_ != 0) { + PutToFeedVec(ins_vec); + } else { + VLOG(3) << "finish reading, output_channel_ size=" + << output_channel_->Size() + << ", consume_channel_ size=" << consume_channel_->Size() + << ", thread_id=" << thread_id_; } - output_channel_->Get(instance); - ins_vec.push_back(instance); - ++index; - consume_channel_->Put(std::move(instance)); - } - this->batch_size_ = index; - VLOG(3) << "batch_size_=" << this->batch_size_ - << ", thread_id=" << thread_id_; - if (this->batch_size_ != 0) { - PutToFeedVec(ins_vec); } else { - VLOG(3) << "finish reading, output_channel_ size=" - << output_channel_->Size() - << ", consume_channel_ size=" << consume_channel_->Size() + VLOG(3) << "enable heter NEXT: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + if (offset_index_ >= batch_offsets_.size()) { + VLOG(3) << "offset_index: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + return 0; + } + 
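For readers new to this code path: when `enable_heterps_` is set, `Next()` no longer drains the output/consume channels and instead walks a precomputed table of `(start_index, batch_size)` pairs over one flat record array, returning the batch size (0 once all batches are consumed). The following is a minimal standalone sketch of that pattern; `Record`, `NextBatch`, and the sample offsets are illustrative only, not the actual Paddle `DataFeed` API.
```
// Standalone sketch (hypothetical names) of the offset-driven batching used
// when enable_heterps_ is set: batches are described as
// (start_index, batch_size) pairs over one flat record array.
#include <cstdio>
#include <utility>
#include <vector>

struct Record {
  int id = 0;
};

// Mirrors the heterps branch of Next(): returns the batch size, or 0 once
// every precomputed batch has been handed out.
int NextBatch(const std::vector<Record>& records,
              const std::vector<std::pair<int, int>>& batch_offsets,
              size_t* offset_index) {
  if (*offset_index >= batch_offsets.size()) {
    return 0;
  }
  const auto& batch = batch_offsets[(*offset_index)++];
  // The real reader would pass &records[batch.first] and batch.second to
  // PutToFeedVec(const Record*, int) here.
  std::printf("feed batch starting at %d with %d of %zu records\n",
              batch.first, batch.second, records.size());
  return batch.second;
}

int main() {
  std::vector<Record> records(10);
  std::vector<std::pair<int, int>> batch_offsets = {{0, 4}, {4, 4}, {8, 2}};
  size_t offset_index = 0;
  while (NextBatch(records, batch_offsets, &offset_index) > 0) {
  }
  return 0;
}
```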
auto& batch = batch_offsets_[offset_index_++]; + this->batch_size_ = batch.second; + VLOG(3) << "batch_size_=" << this->batch_size_ << ", thread_id=" << thread_id_; + if (this->batch_size_ != 0) { + PutToFeedVec(&records_[batch.first], this->batch_size_); + } else { + VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" + << thread_id_; + } + /* + if (offset_index_ == batch_offsets_.size() - 1) { + std::vector data; + output_channel_->ReadAll(data); + consume_channel_->Write(std::move(data)); + } + */ + VLOG(3) << "#15 enable heter NEXT: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size() + << " baych_size: " << this->batch_size_; } return this->batch_size_; #else @@ -1141,6 +1176,103 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) { return false; } +void MultiSlotInMemoryDataFeed::PutToFeedVec(const Record* ins_vec, int num) { +#ifdef _LINUX + for (size_t i = 0; i < batch_float_feasigns_.size(); ++i) { + batch_float_feasigns_[i].clear(); + batch_uint64_feasigns_[i].clear(); + offset_[i].clear(); + offset_[i].push_back(0); + } + ins_content_vec_.clear(); + ins_content_vec_.reserve(num); + ins_id_vec_.clear(); + ins_id_vec_.reserve(num); + for (int i = 0; i < num; ++i) { + auto& r = ins_vec[i]; + ins_id_vec_.push_back(r.ins_id_); + ins_content_vec_.push_back(r.content_); + for (auto& item : r.float_feasigns_) { + batch_float_feasigns_[item.slot()].push_back(item.sign().float_feasign_); + visit_[item.slot()] = true; + } + for (auto& item : r.uint64_feasigns_) { + batch_uint64_feasigns_[item.slot()].push_back( + item.sign().uint64_feasign_); + visit_[item.slot()] = true; + } + for (size_t j = 0; j < use_slots_.size(); ++j) { + const auto& type = all_slots_type_[j]; + if (visit_[j]) { + visit_[j] = false; + } else { + // fill slot value with default value 0 + if (type[0] == 'f') { // float + batch_float_feasigns_[j].push_back(0.0); + } else if (type[0] == 'u') { // uint64 + batch_uint64_feasigns_[j].push_back(0); + } + } + // get offset of this ins in this slot + if (type[0] == 'f') { // float + offset_[j].push_back(batch_float_feasigns_[j].size()); + } else if (type[0] == 'u') { // uint64 + offset_[j].push_back(batch_uint64_feasigns_[j].size()); + } + } + } + + for (size_t i = 0; i < use_slots_.size(); ++i) { + if (feed_vec_[i] == nullptr) { + continue; + } + int total_instance = offset_[i].back(); + const auto& type = all_slots_type_[i]; + if (type[0] == 'f') { // float + float* feasign = batch_float_feasigns_[i].data(); + float* tensor_ptr = + feed_vec_[i]->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float)); + } else if (type[0] == 'u') { // uint64 + // no uint64_t type in paddlepaddle + uint64_t* feasign = batch_uint64_feasigns_[i].data(); + int64_t* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t)); + } + auto& slot_offset = offset_[i]; + if (this->input_type_ == 0) { + LoD data_lod{slot_offset}; + feed_vec_[i]->set_lod(data_lod); + } else if (this->input_type_ == 1) { + if (!use_slots_is_dense_[i]) { + std::vector tmp_offset; + PADDLE_ENFORCE_EQ(slot_offset.size(), 2, + platform::errors::InvalidArgument( + "In batch reader, the sparse tensor lod size " + "must be 2, but received %d.", + slot_offset.size())); + const auto& max_size = slot_offset[1]; + tmp_offset.reserve(max_size + 1); + for (unsigned int k = 0; k <= max_size; k++) { + tmp_offset.emplace_back(k); + } 
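The `offset_` vectors built in the new `PutToFeedVec(const Record*, int)` are cumulative per-slot feasign counts, which is exactly the level-of-detail (LoD) layout later handed to `set_lod()`. A toy illustration of that bookkeeping follows; the values and variable names are made up for the example and are not Paddle code.
```
// Toy illustration of the cumulative per-slot offsets built above. With
// three instances contributing 2, 1 and 3 feasigns to one slot (an unvisited
// slot gets a single default value), the offset vector becomes {0, 2, 3, 6};
// instance i owns rows [offset[i], offset[i + 1]) of the flattened tensor.
#include <cstdio>
#include <vector>

int main() {
  std::vector<size_t> feasigns_per_instance = {2, 1, 3};
  std::vector<size_t> offset = {0};  // same role as offset_[j]
  for (size_t n : feasigns_per_instance) {
    offset.push_back(offset.back() + n);
  }
  for (size_t i = 0; i + 1 < offset.size(); ++i) {
    std::printf("instance %zu -> rows [%zu, %zu)\n", i, offset[i],
                offset[i + 1]);
  }
  return 0;
}
```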
+ slot_offset = tmp_offset; + LoD data_lod{slot_offset}; + feed_vec_[i]->set_lod(data_lod); + } + } + if (use_slots_is_dense_[i]) { + if (inductive_shape_index_[i] != -1) { + use_slots_shape_[i][inductive_shape_index_[i]] = + total_instance / total_dims_without_inductive_[i]; + } + feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); + } + } +#endif +} + void MultiSlotInMemoryDataFeed::PutToFeedVec( const std::vector& ins_vec) { #ifdef _LINUX diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 04a5b9b4d3ada..198bc51463af3 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -167,7 +167,7 @@ class DLManager { } paddle::framework::CustomParser* Load(const std::string& name, - std::vector& conf) { + const std::vector& conf) { #ifdef _LINUX std::lock_guard lock(mutex_); DLHandle handle; @@ -195,7 +195,7 @@ class DLManager { } paddle::framework::CustomParser* ReLoad(const std::string& name, - std::vector& conf) { + const std::vector& conf) { Close(name); return Load(name, conf); } @@ -422,6 +422,7 @@ class InMemoryDataFeed : public DataFeed { virtual void ParseOneInstanceFromSo(const char* str, T* instance, CustomParser* parser) {} virtual void PutToFeedVec(const std::vector& ins_vec) = 0; + virtual void PutToFeedVec(const T* ins_vec, int num) = 0; int thread_id_; int thread_num_; @@ -439,6 +440,11 @@ class InMemoryDataFeed : public DataFeed { paddle::framework::ChannelObject* input_pv_channel_; paddle::framework::ChannelObject* output_pv_channel_; paddle::framework::ChannelObject* consume_pv_channel_; + + std::vector> batch_offsets_; + uint64_t offset_index_ = 0; + bool enable_heterps_ = false; + T* records_ = nullptr; }; // This class define the data type of instance(ins_vec) in MultiSlotDataFeed @@ -601,7 +607,7 @@ paddle::framework::Archive& operator>>(paddle::framework::Archive& ar, for (size_t& x : offset) { uint64_t t; ar >> t; - x = (size_t)t; + x = static_cast(t); } #endif ar >> ins.MutableFloatData(); @@ -777,6 +783,11 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { MultiSlotInMemoryDataFeed() {} virtual ~MultiSlotInMemoryDataFeed() {} virtual void Init(const DataFeedDesc& data_feed_desc); + void SetRecord(Record* records) { records_ = records; } + int GetDefaultBatchSize() { return default_batch_size_; } + void AddBatchOffset(const std::pair& offset) { + batch_offsets_.push_back(offset); + } protected: virtual bool ParseOneInstance(Record* instance); @@ -786,6 +797,7 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { virtual void PutToFeedVec(const std::vector& ins_vec); virtual void GetMsgFromLogKey(const std::string& log_key, uint64_t* search_id, uint32_t* cmatch, uint32_t* rank); + virtual void PutToFeedVec(const Record* ins_vec, int num); std::vector> batch_float_feasigns_; std::vector> batch_uint64_feasigns_; std::vector> offset_; diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index a9903f164bda3..08c42a93d1fcb 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -216,6 +216,180 @@ void DatasetImpl::RegisterClientToClientMsgHandler() { }); VLOG(3) << "RegisterClientToClientMsgHandler done"; } +static void compute_left_batch_num(const int ins_num, const int thread_num, + std::vector>* offset, + const int start_pos) { + int cur_pos = start_pos; + int batch_size = ins_num / thread_num; + int left_num = ins_num % thread_num; + for (int i = 0; i < thread_num; ++i) { + int batch_num_size = 
batch_size; + if (i == 0) { + batch_num_size = batch_num_size + left_num; + } + offset->push_back(std::make_pair(cur_pos, batch_num_size)); + cur_pos += batch_num_size; + } +} + +static void compute_batch_num(const int64_t ins_num, const int batch_size, + const int thread_num, + std::vector>* offset) { + int thread_batch_num = batch_size * thread_num; + // less data + if (static_cast(thread_batch_num) > ins_num) { + compute_left_batch_num(ins_num, thread_num, offset, 0); + return; + } + + int cur_pos = 0; + int offset_num = static_cast(ins_num / thread_batch_num) * thread_num; + int left_ins_num = static_cast(ins_num % thread_batch_num); + if (left_ins_num > 0 && left_ins_num < thread_num) { + offset_num = offset_num - thread_num; + left_ins_num = left_ins_num + thread_batch_num; + for (int i = 0; i < offset_num; ++i) { + offset->push_back(std::make_pair(cur_pos, batch_size)); + cur_pos += batch_size; + } + // split data to thread avg two rounds + compute_left_batch_num(left_ins_num, thread_num * 2, offset, cur_pos); + } else { + for (int i = 0; i < offset_num; ++i) { + offset->push_back(std::make_pair(cur_pos, batch_size)); + cur_pos += batch_size; + } + if (left_ins_num > 0) { + compute_left_batch_num(left_ins_num, thread_num, offset, cur_pos); + } + } +} + +static int compute_thread_batch_nccl( + const int thr_num, const int64_t total_instance_num, + const int minibatch_size, std::vector>* nccl_offsets) { + int thread_avg_batch_num = 0; + if (total_instance_num < static_cast(thr_num)) { + LOG(WARNING) << "compute_thread_batch_nccl total ins num:[" + << total_instance_num << "], less thread num:[" << thr_num + << "]"; + return thread_avg_batch_num; + } + + auto& offset = (*nccl_offsets); + // split data avg by thread num + compute_batch_num(total_instance_num, minibatch_size, thr_num, &offset); + thread_avg_batch_num = static_cast(offset.size() / thr_num); +#ifdef PADDLE_WITH_GLOO + auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); + if (!gloo_wrapper->IsInitialized()) { + VLOG(0) << "GLOO is not inited"; + gloo_wrapper->Init(); + } + + if (gloo_wrapper->Size() > 1) { + // adjust batch num per thread for NCCL + std::vector thread_avg_batch_num_vec(1, thread_avg_batch_num); + std::vector total_instance_num_vec(1, total_instance_num); + auto thread_max_batch_num_vec = + gloo_wrapper->AllReduce(thread_avg_batch_num_vec, "max"); + auto sum_total_ins_num_vec = + gloo_wrapper->AllReduce(total_instance_num_vec, "sum"); + int thread_max_batch_num = thread_max_batch_num_vec[0]; + int64_t sum_total_ins_num = sum_total_ins_num_vec[0]; + int diff_batch_num = thread_max_batch_num - thread_avg_batch_num; + VLOG(3) << "diff batch num: " << diff_batch_num + << " thread max batch num: " << thread_max_batch_num + << " thread avg batch num: " << thread_avg_batch_num; + if (diff_batch_num == 0) { + LOG(WARNING) << "total sum ins " << sum_total_ins_num << ", thread_num " + << thr_num << ", ins num " << total_instance_num + << ", batch num " << offset.size() + << ", thread avg batch num " << thread_avg_batch_num; + return thread_avg_batch_num; + } + + int need_ins_num = thread_max_batch_num * thr_num; + // data is too less + if ((int64_t)need_ins_num > total_instance_num) { + PADDLE_THROW(platform::errors::InvalidArgument( + "error instance num:[%d] less need ins num:[%d]", total_instance_num, + need_ins_num)); + return thread_avg_batch_num; + } + + int need_batch_num = (diff_batch_num + 1) * thr_num; + int offset_split_index = static_cast(offset.size() - thr_num); + int split_left_num = 
total_instance_num - offset[offset_split_index].first; + while (split_left_num < need_batch_num) { + need_batch_num += thr_num; + offset_split_index -= thr_num; + split_left_num = total_instance_num - offset[offset_split_index].first; + } + int split_start = offset[offset_split_index].first; + offset.resize(offset_split_index); + compute_left_batch_num(split_left_num, need_batch_num, &offset, + split_start); + LOG(WARNING) << "total sum ins " << sum_total_ins_num << ", thread_num " + << thr_num << ", ins num " << total_instance_num + << ", batch num " << offset.size() << ", thread avg batch num " + << thread_avg_batch_num << ", thread max batch num " + << thread_max_batch_num + << ", need batch num: " << (need_batch_num / thr_num) + << "split begin (" << split_start << ")" << split_start + << ", num " << split_left_num; + thread_avg_batch_num = thread_max_batch_num; + } else { + LOG(WARNING) << "thread_num " << thr_num << ", ins num " + << total_instance_num << ", batch num " << offset.size() + << ", thread avg batch num " << thread_avg_batch_num; + } +#else + PADDLE_THROW(platform::errors::Unavailable( + "dataset compute nccl batch number need compile with GLOO")); +#endif + return thread_avg_batch_num; +} + +template +void DatasetImpl::SetHeterPs(bool enable_heterps) { +#ifdef PADDLE_WITH_GLOO + enable_heterps_ = enable_heterps; + if (enable_heterps_) { + if (input_records_.size() == 0 && input_channel_ != nullptr && + input_channel_->Size() != 0) { + input_channel_->ReadAll(input_records_); + VLOG(3) << "read from channel to records with records size: " + << input_records_.size(); + } + VLOG(3) << "input records size: " << input_records_.size(); + int64_t total_ins_num = input_records_.size(); + std::vector> offset; + int default_batch_size = + reinterpret_cast(readers_[0].get()) + ->GetDefaultBatchSize(); + VLOG(3) << "thread_num: " << thread_num_ + << " memory size: " << total_ins_num + << " default batch_size: " << default_batch_size; + compute_thread_batch_nccl(thread_num_, total_ins_num, default_batch_size, + &offset); + VLOG(3) << "offset size: " << offset.size(); + for (int i = 0; i < thread_num_; i++) { + reinterpret_cast(readers_[i].get()) + ->SetRecord(&input_records_[0]); + } + for (size_t i = 0; i < offset.size(); i++) { + reinterpret_cast( + readers_[i % thread_num_].get()) + ->AddBatchOffset(offset[i]); + } + } +#else + PADDLE_THROW(platform::errors::Unavailable( + "dataset set heterps need compile with GLOO")); +#endif + return; +} // load data into memory, Dataset hold this memory, // which will later be fed into readers' channel @@ -319,6 +493,13 @@ void DatasetImpl::ReleaseMemory() { multi_pv_consume_[i]->Clear(); multi_pv_consume_[i] = nullptr; } + if (enable_heterps_) { + input_records_.clear(); + input_records_.shrink_to_fit(); + std::vector().swap(input_records_); + VLOG(3) << "release heterps input records records size: " + << input_records_.size(); + } std::vector>().swap(multi_pv_consume_); std::vector>().swap(readers_); @@ -654,6 +835,9 @@ void DatasetImpl::CreateReaders() { channel_idx = 0; } } + if (enable_heterps_) { + SetHeterPs(true); + } VLOG(3) << "readers size: " << readers_.size(); } diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 1c9869fa5afe2..f3ee96fab8297 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -24,6 +24,10 @@ #include #include #include +#ifdef PADDLE_WITH_GLOO +#include +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif #include 
"paddle/fluid/framework/data_feed.h" @@ -145,6 +149,7 @@ class Dataset { virtual void DynamicAdjustReadersNum(int thread_num) = 0; // set fleet send sleep seconds virtual void SetFleetSendSleepSeconds(int seconds) = 0; + virtual void SetHeterPs(bool enable_heterps) = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, @@ -228,6 +233,7 @@ class DatasetImpl : public Dataset { bool discard_remaining_ins = false); virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); + virtual void SetHeterPs(bool enable_heterps); std::vector>& GetMultiOutputChannel() { return multi_output_channel_; @@ -292,6 +298,7 @@ class DatasetImpl : public Dataset { int64_t global_index_ = 0; std::vector> consume_task_pool_; std::vector input_records_; // only for paddleboxdatafeed + bool enable_heterps_ = false; }; // use std::vector or Record as data type diff --git a/paddle/fluid/framework/details/multi_devices_helper.cc b/paddle/fluid/framework/details/multi_devices_helper.cc index 01ef83518af5d..4587c6d3e4f2a 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.cc +++ b/paddle/fluid/framework/details/multi_devices_helper.cc @@ -40,6 +40,7 @@ static std::unordered_set kMultiDeviceOps{ "c_broadcast", "c_comm_init", "c_comm_init_all", + "c_comm_init_multitrainer", "c_gen_nccl_id", "c_sync_comm_stream", "send", diff --git a/paddle/fluid/framework/dist_multi_trainer_test.cc b/paddle/fluid/framework/dist_multi_trainer_test.cc index 0e3292df3cf79..06d84bca1273d 100644 --- a/paddle/fluid/framework/dist_multi_trainer_test.cc +++ b/paddle/fluid/framework/dist_multi_trainer_test.cc @@ -14,7 +14,9 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/trainer.h" - +#ifdef PADDLE_WITH_GLOO +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif #if defined _WIN32 || defined __APPLE__ #else #define _LINUX diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index a9e4691dd0a01..36c5b13701361 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -12,15 +12,15 @@ endif(WITH_PSLIB) if(WITH_HETERPS) if(WITH_NCCL) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps ${BRPC_DEPS}) + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) add_subdirectory(heter_ps) elseif(WITH_RCCL) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps ${BRPC_DEPS}) + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) add_subdirectory(heter_ps) endif(WITH_NCCL) else() - cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) + cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc DEPS gloo_wrapper) endif(WITH_HETERPS) if(WITH_NCCL OR WITH_RCCL) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index f8dfccf58ff96..7b3131003da79 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -123,7 +123,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { } timeline.Pause(); - VLOG(1) << "GpuPs task unique11111 cost " << timeline.ElapsedSec() + VLOG(1) << "GpuPs task add keys cost " << timeline.ElapsedSec() << " seconds."; timeline.Start(); gpu_task->UniqueKeys(); @@ -138,19 +138,74 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { timeline.Start(); auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); + int32_t status = -1; #ifdef 
PADDLE_WITH_PSLIB - auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, - local_keys[i].data(), key_size); + // auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( + // reinterpret_cast(local_ptr[i].data()), this->table_id_, + // local_keys[i].data(), key_size); + int32_t cnt = 0; + while (true) { + auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( + reinterpret_cast(local_ptr[i].data()), this->table_id_, + local_keys[i].data(), key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + + if (flag) { + break; + } + } #endif #ifdef PADDLE_WITH_PSCORE - auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, - local_keys[i].data(), key_size); + int32_t cnt = 0; + while (true) { + auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr( + reinterpret_cast(local_ptr[i].data()), this->table_id_, + local_keys[i].data(), key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + + if (flag) { + break; + } + } #endif - tt.wait(); - auto status = tt.get(); - // auto status = 0; if (status != 0) { LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; sleep(300); @@ -169,10 +224,27 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { timeline.Pause(); VLOG(1) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; + if (multi_node_) { + auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); + if (!gloo_wrapper->IsInitialized()) { + VLOG(0) << "GLOO is not inited"; + gloo_wrapper->Init(); + } + gloo_wrapper->Barrier(); + } timeline.Start(); - auto build_func = [device_num, &local_keys, &local_ptr, &device_keys, - &device_vals, &device_mutex](int i) { + std::vector>> pass_values; + uint16_t pass_id = 0; + + bool record_status = false; + if (multi_node_) { + record_status = fleet_ptr->pslib_ptr_->_worker_ptr->take_sparse_record( + table_id_, pass_id, pass_values); + } + auto build_func = [device_num, record_status, &pass_values, &local_keys, + &local_ptr, &device_keys, &device_vals, + &device_mutex](int i) { std::vector> task_keys(device_num); #ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( @@ -188,7 +260,21 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { task_keys[shard].push_back(local_keys[i][j]); task_ptrs[shard].push_back(local_ptr[i][j]); } - + if (record_status) { + size_t local_keys_size = local_keys.size(); + size_t pass_values_size = pass_values.size(); + for (size_t j = 0; j < pass_values_size; j += local_keys_size) { + auto& shard_values = pass_values[j]; + for (size_t pair_idx = 0; pair_idx < pass_values[j].size(); + pair_idx++) { + auto& cur_pair = shard_values[pair_idx]; + int shard = 
cur_pair.first % device_num; + task_keys[shard].push_back(cur_pair.first); + task_ptrs[shard].push_back( + (paddle::ps::DownpourFixedFeatureValue*)cur_pair.second); + } + } + } for (int dev = 0; dev < device_num; dev++) { device_mutex[dev]->lock(); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 5c23e826a2dec..a8147fd466b52 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -903,8 +903,6 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, float, softmax_qk_op_desc->GetAttr("out_threshold")); multihead_op_desc.SetAttr("dp_probs", qkv_plugin_scale); } - } else { - multihead_op_desc.SetAttr("qkv2context_plugin_int8", false); } auto* multihead = graph->CreateOpNode(&multihead_op_desc); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 2a9b015ce982c..a073acc96c0d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -42,8 +42,7 @@ class MultiheadMatMulOpConverter : public OpConverter { float* weight_data = nullptr; bool enable_int8 = op_desc.HasAttr("enable_int8"); - bool qkv2context_plugin_int8 = - BOOST_GET_CONST(bool, op_desc.GetAttr("qkv2context_plugin_int8")); + bool qkv2context_plugin_int8 = op_desc.HasAttr("qkv2context_plugin_int8"); float in_scale = 0.; if (enable_int8) { diff --git a/paddle/fluid/inference/tests/infer_ut/README.md b/paddle/fluid/inference/tests/infer_ut/README.md new file mode 100644 index 0000000000000..886c9f1eb1484 --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/README.md @@ -0,0 +1,37 @@ +# Inference Model UT + +There are several model tests currently: +- test_ernie_text_cls.cc +- test_LeViT.cc +- test_ppyolo_mbv3.cc +- test_ppyolov2_r50vd.cc +- test_resnet50.cc +- test_resnet50_quant.cc +- test_yolov3.cc + +To build and execute tests on Linux, simply run +``` +./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU $DATA_DIR +``` +To build on Windows, run the command with busybox +``` +busybox bash ./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU $DATA_DIR +``` + +- After running the command, it will build and execute the tests and download the test data to ${DATA_DIR} automatically. +- `$PADDLE_ROOT`: Paddle library path +- `$TURN_ON_MKL`: use MKL or OpenBLAS +- `$TEST_GPU_CPU`: test both GPU/CPU mode or only CPU mode +- `$DATA_DIR`: download data path + +Currently only 4 kinds of tests are supported, controlled by the `--gtest_filter` argument; test suite names must match one of the following patterns. +- `TEST(gpu_tester_*, test_name)` +- `TEST(cpu_tester_*, test_name)` +- `TEST(mkldnn_tester_*, test_name)` +- `TEST(tensorrt_tester_*, test_name)` + +Skipped test suite names:
+- `TEST(DISABLED_gpu_tester_*, test_name)` +- `TEST(DISABLED_cpu_tester_*, test_name)` +- `TEST(DISABLED_mkldnn_tester_*, test_name)` +- `TEST(DISABLED_tensorrt_tester_*, test_name)` diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index ec744b358d130..1547071e75d49 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -24,6 +24,7 @@ MSVC_STATIC_CRT=$6 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir EXIT_CODE=0 # init default exit code WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform +test_suite_list="cpu_tester*" # init test suite list, pass to --gtest_filter export RED='\033[0;31m' # red color export NC='\033[0m' # no color @@ -33,23 +34,30 @@ cd `dirname $0` current_dir=`pwd` build_dir=${current_dir}/build log_dir=${current_dir}/log + +# check mkldnn installation if [ $2 == ON ]; then # You can export yourself if move the install path MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB} + test_suite_list="${test_suite_list}:mkldnn_tester*" fi + if [ $3 == ON ]; then use_gpu_list='true false' + test_suite_list="${test_suite_list}:gpu_tester*" else use_gpu_list='false' fi +# check tensorrt installation +TENSORRT_COMPILED=$(cat "${inference_install_dir}/version.txt" | grep "WITH_TENSORRT") USE_TENSORRT=OFF -if [ -d "$TENSORRT_ROOT_DIR" ]; then +if [ -d "$TENSORRT_ROOT_DIR" ] && [ ! -z "$TENSORRT_COMPILED" ] ; then USE_TENSORRT=ON + test_suite_list="${test_suite_list}:tensorrt_tester*" fi - function download() { url_prefix=$1 model_name=$2 @@ -146,104 +154,119 @@ mkdir -p ${log_dir} cd ${build_dir} rm -rf * -# ---------tensorrt gpu tests on linux--------- -if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then - rm -rf * +if [ $WIN_DETECT != "" ]; then + exe_dir=${build_dir}/Release +else + exe_dir=${build_dir} +fi; - if [ $WIN_DETECT != "" ]; then - exe_dir=${build_dir}/Release - else - exe_dir=${build_dir} - fi; +printf "${YELLOW} start test_resnet50 ${NC} \n"; +compile_test "test_resnet50" +${exe_dir}/test_resnet50 \ + --modeldir=$DATA_DIR/resnet50/resnet50 \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_resnet50.xml +if [ $? -ne 0 ]; then + echo "${RED} test_resnet50 runs failed ${NC}" >> ${exe_dir}/test_summary.txt + EXIT_CODE=8 +fi - printf "${YELLOW} start test_resnet50 ${NC} \n"; - compile_test "test_resnet50" - ${exe_dir}/test_resnet50 \ - --modeldir=$DATA_DIR/resnet50/resnet50 \ - --gtest_output=xml:${log_dir}/test_resnet50.xml - if [ $? -ne 0 ]; then - echo "${RED} test_resnet50 runs failed ${NC}" >> ${exe_dir}/test_summary.txt - EXIT_CODE=8 - fi +printf "${YELLOW} start test_det_mv3_db ${NC} \n"; +compile_test "test_det_mv3_db" +${exe_dir}/test_det_mv3_db \ + --modeldir=$DATA_DIR/ocr_det_mv3_db/ocr_det_mv3_db \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_det_mv3_db.xml +if [ $? -ne 0 ]; then + echo "${RED} test_det_mv3_db runs failed ${NC}" >> ${exe_dir}/test_summary.txt + EXIT_CODE=8 +fi - printf "${YELLOW} start test_det_mv3_db ${NC} \n"; - compile_test "test_det_mv3_db" - ${exe_dir}/test_det_mv3_db \ - --modeldir=$DATA_DIR/ocr_det_mv3_db/ocr_det_mv3_db \ - --gtest_output=xml:${log_dir}/test_det_mv3_db.xml - if [ $? 
-ne 0 ]; then - echo "${RED} test_det_mv3_db runs failed ${NC}" >> ${exe_dir}/test_summary.txt - EXIT_CODE=8 - fi +printf "${YELLOW} start test_LeViT ${NC} \n"; +compile_test "test_LeViT" +${exe_dir}/test_LeViT \ + --modeldir=$DATA_DIR/LeViT/LeViT \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_LeViT.xml +if [ $? -ne 0 ]; then + echo "${RED} test_LeViT runs failed ${NC}" >> ${exe_dir}/test_summary.txt + EXIT_CODE=8 +fi - printf "${YELLOW} start test_LeViT ${NC} \n"; - compile_test "test_LeViT" - ${exe_dir}/test_LeViT \ - --modeldir=$DATA_DIR/LeViT/LeViT \ - --gtest_output=xml:${log_dir}/test_LeViT.xml +if [ $WIN_DETECT != "" ]; then + #TODO(OliverLPH): enable test_ernie_text_cls on windows after fix compile issue + echo " skip test_ernie_text_cls " +else + printf "${YELLOW} start test_ernie_text_cls ${NC} \n"; + compile_test "test_ernie_text_cls" + ${exe_dir}/test_ernie_text_cls \ + --modeldir=$DATA_DIR/ernie_text_cls/ernie_text_cls \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_ernie_text_cls.xml if [ $? -ne 0 ]; then - echo "${RED} test_LeViT runs failed ${NC}" >> ${exe_dir}/test_summary.txt + echo "${RED} test_ernie_text_cls runs failed ${NC}" >> ${exe_dir}/test_summary.txt EXIT_CODE=8 fi +fi; - if [ $WIN_DETECT != "" ]; then - echo " skip test_ernie_text_cls " - else - printf "${YELLOW} start test_ernie_text_cls ${NC} \n"; - compile_test "test_ernie_text_cls" - ${exe_dir}/test_ernie_text_cls \ - --modeldir=$DATA_DIR/ernie_text_cls/ernie_text_cls \ - --gtest_output=xml:${log_dir}/test_ernie_text_cls.xml - if [ $? -ne 0 ]; then - echo "${RED} test_ernie_text_cls runs failed ${NC}" >> ${exe_dir}/test_summary.txt - EXIT_CODE=8 - fi - fi; - - printf "${YELLOW} start test_yolov3 ${NC} \n"; - compile_test "test_yolov3" - ${exe_dir}/test_yolov3 \ - --modeldir=$DATA_DIR/yolov3/yolov3 \ - --gtest_output=xml:${log_dir}/test_yolov3.xml - if [ $? -ne 0 ]; then - echo "${RED} test_yolov3 runs failed ${NC}" >> ${exe_dir}/test_summary.txt - EXIT_CODE=8 - fi +printf "${YELLOW} start test_yolov3 ${NC} \n"; +compile_test "test_yolov3" +${exe_dir}/test_yolov3 \ + --modeldir=$DATA_DIR/yolov3/yolov3 \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_yolov3.xml +if [ $? -ne 0 ]; then + echo "${RED} test_yolov3 runs failed ${NC}" >> ${exe_dir}/test_summary.txt + EXIT_CODE=8 +fi - printf "${YELLOW} start test_ppyolo_mbv3 ${NC} \n"; - compile_test "test_ppyolo_mbv3" - ${exe_dir}/test_ppyolo_mbv3 \ - --modeldir=$DATA_DIR/ppyolo_mbv3/ppyolo_mbv3 \ - --gtest_output=xml:${log_dir}/test_ppyolo_mbv3.xml - if [ $? -ne 0 ]; then - echo "${RED} test_ppyolo_mbv3 runs failed ${NC}" >> ${exe_dir}/test_summary.txt - EXIT_CODE=8 - fi +printf "${YELLOW} start test_ppyolo_mbv3 ${NC} \n"; +compile_test "test_ppyolo_mbv3" +${exe_dir}/test_ppyolo_mbv3 \ + --modeldir=$DATA_DIR/ppyolo_mbv3/ppyolo_mbv3 \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_ppyolo_mbv3.xml +if [ $? -ne 0 ]; then + echo "${RED} test_ppyolo_mbv3 runs failed ${NC}" >> ${exe_dir}/test_summary.txt + EXIT_CODE=8 +fi - printf "${YELLOW} start test_ppyolov2_r50vd ${NC} \n"; - compile_test "test_ppyolov2_r50vd" - ${exe_dir}/test_ppyolov2_r50vd \ - --modeldir=$DATA_DIR/ppyolov2_r50vd/ppyolov2_r50vd \ - --gtest_output=xml:${log_dir}/test_ppyolov2_r50vd.xml - if [ $? 
-ne 0 ]; then - echo "${RED} test_ppyolov2_r50vd runs failed ${NC}" >> ${exe_dir}/test_summary.txt - EXIT_CODE=8 - fi +printf "${YELLOW} start test_ppyolov2_r50vd ${NC} \n"; +compile_test "test_ppyolov2_r50vd" +${exe_dir}/test_ppyolov2_r50vd \ + --modeldir=$DATA_DIR/ppyolov2_r50vd/ppyolov2_r50vd \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_ppyolov2_r50vd.xml +if [ $? -ne 0 ]; then + echo "${RED} test_ppyolov2_r50vd runs failed ${NC}" >> ${exe_dir}/test_summary.txt + EXIT_CODE=8 +fi - printf "${YELLOW} start test_resnet50_quant ${NC} \n"; - compile_test "test_resnet50_quant" - ${exe_dir}/test_resnet50_quant \ - --int8dir=$DATA_DIR/resnet50_quant/resnet50_quant/resnet50_quant \ - --modeldir=$DATA_DIR/resnet50/resnet50 \ - --datadir=$DATA_DIR/resnet50_quant/resnet50_quant/imagenet-eval-binary/9.data \ - --gtest_output=xml:${log_dir}/test_resnet50_quant.xml - if [ $? -ne 0 ]; then - echo "${RED} test_resnet50_quant runs failed ${NC}" >> ${exe_dir}/test_summary.txt - EXIT_CODE=8 - fi +printf "${YELLOW} start test_resnet50_quant ${NC} \n"; +compile_test "test_resnet50_quant" +${exe_dir}/test_resnet50_quant \ + --int8dir=$DATA_DIR/resnet50_quant/resnet50_quant/resnet50_quant \ + --modeldir=$DATA_DIR/resnet50/resnet50 \ + --datadir=$DATA_DIR/resnet50_quant/resnet50_quant/imagenet-eval-binary/9.data \ + --gtest_filter=${test_suite_list} \ + --gtest_output=xml:${log_dir}/test_resnet50_quant.xml +if [ $? -ne 0 ]; then + echo "${RED} test_resnet50_quant runs failed ${NC}" >> ${exe_dir}/test_summary.txt + EXIT_CODE=8 fi set +x + +test_suites=$(echo ${test_suite_list} | sed 's/:/ /g') +echo " " +echo "CI Tested Following Patterns: " +echo "=====================test patterns======================" +for test_suite in ${test_suites}; do + echo " ${test_suite}" +done +echo "========================================================" +echo " " + if [[ -f ${exe_dir}/test_summary.txt ]];then echo " " echo "Summary Failed Tests ..." 
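The renamed test suites below (and the `--gtest_filter` plumbing added to run.sh above) rely only on googletest's name matching: a pattern such as `cpu_tester*` selects suites by prefix of the full `Suite.Test` name, and a `DISABLED_` prefix keeps a suite from running unless explicitly enabled. A toy example of the convention follows; the suite names (`cpu_tester_demo`, etc.) are illustrative, not one of the real model tests.
```
// Toy googletest file showing the suite-name convention the filter relies on.
// Running with --gtest_filter=cpu_tester*:tensorrt_tester* picks tests whose
// full name (suite.test) starts with one of those prefixes; DISABLED_ suites
// are skipped unless --gtest_also_run_disabled_tests is passed.
#include "gtest/gtest.h"

TEST(cpu_tester_demo, always_in_default_filter) { EXPECT_EQ(2 + 2, 4); }

TEST(tensorrt_tester_demo, only_when_trt_suites_are_filtered_in) {
  EXPECT_TRUE(true);
}

TEST(DISABLED_gpu_tester_demo, skipped_by_default) { EXPECT_TRUE(true); }

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
```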
diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index f115d1f898c94..a7ff5af1bdc24 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -32,7 +32,7 @@ paddle::test::Record PrepareInput(int batch_size) { return image_Record; } -TEST(test_LeViT, analysis_gpu_bz1) { +TEST(gpu_tester_LeViT, analysis_gpu_bz1) { // init input data std::map my_input_data_map; my_input_data_map["x"] = PrepareInput(1); @@ -60,7 +60,7 @@ TEST(test_LeViT, analysis_gpu_bz1) { std::cout << "finish test" << std::endl; } -TEST(test_LeViT, trt_fp32_bz2) { +TEST(tensorrt_tester_LeViT, trt_fp32_bz2) { // init input data std::map my_input_data_map; my_input_data_map["x"] = PrepareInput(2); @@ -91,7 +91,7 @@ TEST(test_LeViT, trt_fp32_bz2) { std::cout << "finish test" << std::endl; } -TEST(test_LeViT, serial_diff_batch_trt_fp32) { +TEST(tensorrt_tester_LeViT, serial_diff_batch_trt_fp32) { int max_batch_size = 5; // prepare groudtruth config paddle_infer::Config config, config_no_ir; @@ -127,7 +127,7 @@ TEST(test_LeViT, serial_diff_batch_trt_fp32) { std::cout << "finish test" << std::endl; } -TEST(test_LeViT, multi_thread4_trt_fp32_bz2) { +TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { int thread_num = 4; // init input data std::map my_input_data_map; diff --git a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc index ce7b8ce463727..67c2eeb0be5f9 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc @@ -77,7 +77,7 @@ void PrepareDynamicShape(paddle_infer::Config* config, int max_batch_size = 4) { opt_input_shape); } -TEST(test_det_mv3_db, analysis_gpu_bz4) { +TEST(gpu_tester_det_mv3_db, analysis_gpu_bz4) { // init input data std::map my_input_data_map; my_input_data_map["x"] = PrepareInput(4, 640); @@ -105,7 +105,7 @@ TEST(test_det_mv3_db, analysis_gpu_bz4) { std::cout << "finish test" << std::endl; } -TEST(test_det_mv3_db, multi_thread2_trt_fp32_dynamic_shape_bz2) { +TEST(tensorrt_tester_det_mv3_db, multi_thread2_trt_fp32_dynamic_shape_bz2) { int thread_num = 2; // thread > 2 may OOM // init input data std::map my_input_data_map; @@ -149,7 +149,7 @@ TEST(test_det_mv3_db, multi_thread2_trt_fp32_dynamic_shape_bz2) { std::cout << "finish multi-thread test" << std::endl; } -TEST(test_det_mv3_db, multi_thread2_mkl_fp32_bz2) { +TEST(mkldnn_tester_det_mv3_db, multi_thread2_mkl_fp32_bz2) { int thread_num = 2; // thread > 2 may OOM // init input data std::map my_input_data_map; diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc b/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc index f73803fe59335..6ef894cc3d1d6 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc @@ -52,7 +52,7 @@ std::map PrepareInput(int batch_size) { return my_input_data_map; } -TEST(test_ernie_text_cls, analysis_gpu_bz2_buffer) { +TEST(gpu_tester_ernie_text_cls, analysis_gpu_bz2_buffer) { // init input data auto my_input_data_map = PrepareInput(2); // init output data @@ -84,7 +84,7 @@ TEST(test_ernie_text_cls, analysis_gpu_bz2_buffer) { std::cout << "finish test" << std::endl; } -TEST(test_ernie_text_cls, multi_thread4_mkl_fp32_bz2) { +TEST(mkldnn_tester_ernie_text_cls, multi_thread4_mkl_fp32_bz2) { int thread_num = 4; // init input data auto 
my_input_data_map = PrepareInput(2); diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc index d845e5da15dd4..0a24975d62ff4 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc @@ -55,7 +55,7 @@ std::map PrepareInput(int batch_size) { return input_data_map; } -TEST(test_ppyolo_mbv3, multi_thread4_trt_fp32_bz2) { +TEST(tensorrt_tester_ppyolo_mbv3, multi_thread4_trt_fp32_bz2) { int thread_num = 4; // init input data auto input_data_map = PrepareInput(2); @@ -101,7 +101,7 @@ TEST(test_ppyolo_mbv3, multi_thread4_trt_fp32_bz2) { std::cout << "finish multi-thread test" << std::endl; } -TEST(test_ppyolo_mbv3, multi_thread4_mkl_bz2) { +TEST(mkldnn_tester_ppyolo_mbv3, multi_thread4_mkl_bz2) { // TODO(OliverLPH): mkldnn multi thread will fail int thread_num = 4; // init input data diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc index b2cb4ca32238c..d74a333232473 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc @@ -55,7 +55,7 @@ std::map PrepareInput(int batch_size) { return input_data_map; } -TEST(test_ppyolov2_r50vd, multi_thread2_trt_fp32_bz1) { +TEST(tensorrt_tester_ppyolov2_r50vd, multi_thread2_trt_fp32_bz1) { int thread_num = 2; // thread > 2 may OOM // init input data auto input_data_map = PrepareInput(1); @@ -100,7 +100,7 @@ TEST(test_ppyolov2_r50vd, multi_thread2_trt_fp32_bz1) { std::cout << "finish multi-thread test" << std::endl; } -TEST(test_ppyolov2_r50vd, multi_thread2_mkl_bz2) { +TEST(mkldnn_tester_ppyolov2_r50vd, multi_thread2_mkl_bz2) { int thread_num = 2; // init input data auto input_data_map = PrepareInput(2); diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc index 035bc3f34f3e4..6157fdbdb108a 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc @@ -32,7 +32,7 @@ paddle::test::Record PrepareInput(int batch_size) { return image_Record; } -TEST(test_resnet50, analysis_gpu_bz1) { +TEST(gpu_tester_resnet50, analysis_gpu_bz1) { // init input data std::map my_input_data_map; my_input_data_map["inputs"] = PrepareInput(1); @@ -60,7 +60,7 @@ TEST(test_resnet50, analysis_gpu_bz1) { std::cout << "finish test" << std::endl; } -TEST(test_resnet50, trt_fp32_bz2) { +TEST(tensorrt_tester_resnet50, trt_fp32_bz2) { // init input data std::map my_input_data_map; my_input_data_map["inputs"] = PrepareInput(2); @@ -91,7 +91,7 @@ TEST(test_resnet50, trt_fp32_bz2) { std::cout << "finish test" << std::endl; } -TEST(test_resnet50, serial_diff_batch_trt_fp32) { +TEST(tensorrt_tester_resnet50, serial_diff_batch_trt_fp32) { int max_batch_size = 5; // prepare groudtruth config paddle_infer::Config config, config_no_ir; @@ -127,7 +127,7 @@ TEST(test_resnet50, serial_diff_batch_trt_fp32) { std::cout << "finish test" << std::endl; } -TEST(test_resnet50, multi_thread4_trt_fp32_bz2) { +TEST(tensorrt_tester_resnet50, multi_thread4_trt_fp32_bz2) { int thread_num = 4; // init input data std::map my_input_data_map; @@ -170,7 +170,7 @@ TEST(test_resnet50, multi_thread4_trt_fp32_bz2) { std::cout << "finish multi-thread test" << std::endl; } -TEST(test_resnet50, trt_int8_bz2) { +TEST(tensorrt_tester_resnet50, trt_int8_bz2) { // init input 
data std::map my_input_data_map; my_input_data_map["inputs"] = PrepareInput(2); @@ -199,6 +199,39 @@ TEST(test_resnet50, trt_int8_bz2) { std::cout << "finish test" << std::endl; } +TEST(DISABLED_tensorrt_tester_resnet50, profile_multi_thread_trt_fp32) { + int batch_size = 2; + int thread_num = 4; + int repeat_time = 1000; + // init input data + std::map my_input_data_map; + my_input_data_map["inputs"] = PrepareInput(batch_size); + // init output data + std::map infer_output_data; + // prepare inference config + paddle_infer::Config config; + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); + // get infer results from multi threads + services::PredictorPool pred_pool(config, thread_num); + std::vector> calcs; + for (int i = 0; i < thread_num; ++i) { + calcs.push_back(std::async(&paddle::test::SingleThreadProfile, + pred_pool.Retrive(i), &my_input_data_map, + repeat_time)); + } + double total_time_ = 0.0; + for (auto&& fut : calcs) { + total_time_ += fut.get(); + } + std::cout << total_time_ << std::endl; + + std::cout << "finish multi-thread profile" << std::endl; +} + } // namespace paddle_infer int main(int argc, char** argv) { diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc index bc33c817b3c10..ed7ab7b5eee7b 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc @@ -52,7 +52,7 @@ paddle::test::Record PrepareInput(int batch_size) { return image_Record; } -TEST(DISABLED_test_resnet50_quant, multi_thread4_trt_int8_bz1) { +TEST(DISABLED_tensorrt_tester_resnet50_quant, multi_thread4_trt_int8_bz1) { int thread_num = 4; // init input data std::map input_data_map; @@ -94,7 +94,7 @@ TEST(DISABLED_test_resnet50_quant, multi_thread4_trt_int8_bz1) { std::cout << "finish test" << std::endl; } -TEST(DISABLED_test_resnet50_quant, multi_thread_multi_instance) { +TEST(DISABLED_tensorrt_tester_resnet50_quant, multi_thread_multi_instance) { int thread_num = 4; // init input data std::map input_data_fp32, input_data_quant; diff --git a/paddle/fluid/inference/tests/infer_ut/test_suite.h b/paddle/fluid/inference/tests/infer_ut/test_suite.h index b2546b180b976..0b580cd7c7e86 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_suite.h +++ b/paddle/fluid/inference/tests/infer_ut/test_suite.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -148,5 +149,97 @@ void CompareRecord(std::map *truth_output_data, } } +// Timer, count in ms +class Timer { + public: + Timer() { reset(); } + void start() { start_t = std::chrono::high_resolution_clock::now(); } + void stop() { + auto end_t = std::chrono::high_resolution_clock::now(); + typedef std::chrono::microseconds ms; + auto diff = end_t - start_t; + ms counter = std::chrono::duration_cast(diff); + total_time += counter.count(); + } + void reset() { total_time = 0.; } + double report() { return total_time / 1000.0; } + + private: + double total_time; + std::chrono::high_resolution_clock::time_point start_t; +}; + +// single thread inference benchmark, return double time in ms +double SingleThreadProfile(paddle_infer::Predictor *predictor, + std::map *input_data_map, + int repeat_times = 2) { + // prepare input tensor + auto input_names = predictor->GetInputNames(); + for (const 
auto & [ key, value ] : *input_data_map) { + switch (value.type) { + case paddle::PaddleDType::INT64: { + std::vector input_value = + std::vector(value.data.begin(), value.data.end()); + auto input_tensor = predictor->GetInputHandle(key); + input_tensor->Reshape(value.shape); + input_tensor->CopyFromCpu(input_value.data()); + break; + } + case paddle::PaddleDType::INT32: { + std::vector input_value = + std::vector(value.data.begin(), value.data.end()); + auto input_tensor = predictor->GetInputHandle(key); + input_tensor->Reshape(value.shape); + input_tensor->CopyFromCpu(input_value.data()); + break; + } + case paddle::PaddleDType::FLOAT32: { + std::vector input_value = + std::vector(value.data.begin(), value.data.end()); + auto input_tensor = predictor->GetInputHandle(key); + input_tensor->Reshape(value.shape); + input_tensor->CopyFromCpu(input_value.data()); + break; + } + } + } + + Timer timer; // init prediction timer + timer.start(); + // inference + for (size_t i = 0; i < repeat_times; ++i) { + CHECK(predictor->Run()); + auto output_names = predictor->GetOutputNames(); + for (auto &output_name : output_names) { + auto output_tensor = predictor->GetOutputHandle(output_name); + std::vector output_shape = output_tensor->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + switch (output_tensor->type()) { + case paddle::PaddleDType::INT64: { + std::vector out_data; + out_data.resize(out_num); + output_tensor->CopyToCpu(out_data.data()); + break; + } + case paddle::PaddleDType::FLOAT32: { + std::vector out_data; + out_data.resize(out_num); + output_tensor->CopyToCpu(out_data.data()); + break; + } + case paddle::PaddleDType::INT32: { + std::vector out_data; + out_data.resize(out_num); + output_tensor->CopyToCpu(out_data.data()); + break; + } + } + } + } + timer.stop(); + return timer.report(); +} + } // namespace test } // namespace paddle diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 7b98da04bc35f..60b3dbb25c45e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -415,7 +415,8 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0.02f); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( LeakyRelu Activation Operator. @@ -439,11 +440,13 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(20.0f); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel.") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn.") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( :strong:`Softplus Activation Operator` diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index b4dc10777c651..dfb620a4e96bd 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -11,25 +11,30 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { +using NPUDeviceContext = platform::NPUDeviceContext; + template class NPUBatchNormOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); const float epsilon = ctx.Attr("epsilon"); float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); const bool use_global_stats = ctx.Attr("use_global_stats"); const bool trainable_stats = ctx.Attr("trainable_statistics"); - const bool test_mode = is_test && (!trainable_stats); - const std::string data_layout = ctx.Attr("data_layout"); + + bool test_mode = is_test && (!trainable_stats); + bool training = !test_mode && !use_global_stats; + + const std::string data_layout_str = ctx.Attr("data_layout"); + DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); @@ -38,48 +43,30 @@ class NPUBatchNormOpKernel : public framework::OpKernel { "The input tensor X's dimension must equal to 4. But " "received X's shape = [%s], X's dimension = [%d].", x_dims, x_dims.size())); + const auto *running_mean = ctx.Input("Mean"); + const auto *running_var = ctx.Input("Variance"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); auto *y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - Tensor x_tensor, y_tesnor; + Tensor x_tensor(x->type()); + Tensor y_tesnor(y->type()); x_tensor.ShareDataWith(*x); y_tesnor.ShareDataWith(*y); - if (data_layout == "NHWC") { + if (data_layout == DataLayout::kNHWC) { x_tensor.set_layout(DataLayout::kNHWC); y_tesnor.set_layout(DataLayout::kNHWC); } - bool training = !test_mode && !use_global_stats; + auto stream = ctx.template device_context().stream(); if (!training) { - const auto *est_mean = ctx.Input("Mean"); - const auto *est_var = ctx.Input("Variance"); - framework::Tensor reserve_space1, reserve_space2; - reserve_space1.mutable_data(est_mean->dims(), ctx.GetPlace()); - reserve_space2.mutable_data(est_var->dims(), ctx.GetPlace()); - - const auto &runner = NpuOpRunner( - "BatchNorm", {x_tensor, *scale, *bias, *est_mean, *est_var}, - {y_tesnor, reserve_space1, reserve_space2, reserve_space1, - reserve_space2}, - {{"epsilon", epsilon}, - {"is_training", training}, - {"data_format", data_layout}}); - auto stream = dev_ctx.stream(); - runner.Run(stream); + const auto &runner_infer = NpuOpRunner( + "BNInfer", {x_tensor, *scale, *bias, *running_mean, *running_var}, + {y_tesnor}, {{"epsilon", epsilon}}); + runner_infer.Run(stream); } else { - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); - momentum = mom_cpu.data()[0]; - } - auto *mean_out = ctx.Output("MeanOut"); auto *variance_out = ctx.Output("VarianceOut"); auto *saved_mean = ctx.Output("SavedMean"); @@ -89,45 +76,30 @@ class NPUBatchNormOpKernel : public framework::OpKernel { saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - framework::Tensor mean_tmp, variance_tmp; - 
mean_tmp.mutable_data(mean_out->dims(), ctx.GetPlace()); - variance_tmp.mutable_data(variance_out->dims(), ctx.GetPlace()); - - const auto &runner = NpuOpRunner( - "BatchNorm", {x_tensor, *scale, *bias}, - {y_tesnor, mean_tmp, variance_tmp, *saved_mean, *saved_variance}, - {{"epsilon", epsilon}, - {"is_training", training}, - {"data_format", data_layout}}); - auto stream = dev_ctx.stream(); - runner.Run(stream); - // Ascend can't output the estimated mean and variance - framework::Tensor this_factor_tensor; - this_factor_tensor.mutable_data(framework::make_ddim({1}), - ctx.GetPlace()); - framework::TensorFromVector({static_cast(1. - momentum)}, - dev_ctx, &this_factor_tensor); - framework::Tensor momentum_tensor; - momentum_tensor.mutable_data(framework::make_ddim({1}), - ctx.GetPlace()); - framework::TensorFromVector({static_cast(momentum)}, - dev_ctx, &momentum_tensor); - framework::Tensor ones_tensor; - ones_tensor.mutable_data(mean_out->dims(), ctx.GetPlace()); - framework::TensorFromVector( - std::vector(framework::product(mean_out->dims()), 1.0f), - dev_ctx, &ones_tensor); - - const auto &runner1 = NpuOpRunner("AddMatMatElements", - {*mean_out, *saved_mean, ones_tensor, - momentum_tensor, this_factor_tensor}, - {*mean_out}, {}); - runner1.Run(stream); - const auto &runner2 = NpuOpRunner( - "AddMatMatElements", {*variance_out, *saved_variance, ones_tensor, - momentum_tensor, this_factor_tensor}, - {*variance_out}, {}); - runner2.Run(stream); + // if MomentumTensor is set, use MomentumTensor value, momentum + // is only used in this training branch + if (ctx.HasInput("MomentumTensor")) { + const auto *mom_tensor = ctx.Input("MomentumTensor"); + Tensor mom_cpu; + TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + momentum = mom_cpu.data()[0]; + } + + framework::Tensor sum, square_sum; + sum.mutable_data(running_mean->dims(), ctx.GetPlace()); + square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); + + const auto &runner_reduce = + NpuOpRunner("BNTrainingReduce", {x_tensor}, {sum, square_sum}, + {{"epsilon", epsilon}}); + runner_reduce.Run(stream); + + const auto &runner_update = NpuOpRunner( + "BNTrainingUpdate", {x_tensor, sum, square_sum, *scale, *bias, + *running_mean, *running_var}, + {y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance}, + {{"factor", momentum}, {"epsilon", epsilon}}); + runner_update.Run(stream); } } }; @@ -136,85 +108,82 @@ template class NPUBatchNormGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - const float epsilon = ctx.Attr("epsilon"); - const std::string data_layout = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - - const auto *y_grad = ctx.Input(framework::GradVarName("Y")); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); - auto *saved_mean = ctx.Input("SavedMean"); - auto *saved_variance = ctx.Input("SavedVariance"); + const auto *saved_mean = ctx.Input("SavedMean"); + // SavedVariance have been reverted in forward operator + const auto *saved_inv_variance = ctx.Input("SavedVariance"); + const std::string data_layout_str = ctx.Attr("data_layout"); + bool use_global_stats = ctx.Attr("use_global_stats"); + const bool is_test = ctx.Attr("is_test"); + const float epsilon = ctx.Attr("epsilon"); + DataLayout data_layout = 
framework::StringToDataLayout(data_layout_str); - auto *x_grad = ctx.Output(framework::GradVarName("X")); - auto *scale_grad = ctx.Output(framework::GradVarName("Scale")); - auto *bias_grad = ctx.Output(framework::GradVarName("Bias")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - const bool is_test = ctx.Attr("is_test"); use_global_stats = is_test || use_global_stats; - const Tensor *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); - - // init output - Tensor scale_grad_tmp, bias_grad_tmp, x_grad_tmp; - if (scale_grad && bias_grad) { - scale_grad->mutable_data(ctx.GetPlace()); - bias_grad->mutable_data(ctx.GetPlace()); - scale_grad_tmp.ShareDataWith(*scale_grad); - bias_grad_tmp.ShareDataWith(*bias_grad); - } else { - scale_grad_tmp.mutable_data(scale->dims(), ctx.GetPlace()); - bias_grad_tmp.mutable_data(bias->dims(), ctx.GetPlace()); + Tensor x_tensor(x->type()); + Tensor dy_tensor(d_y->type()); + x_tensor.ShareDataWith(*x); + dy_tensor.ShareDataWith(*d_y); + if (data_layout == DataLayout::kNHWC) { + x_tensor.set_layout(DataLayout::kNHWC); + dy_tensor.set_layout(DataLayout::kNHWC); } - Tensor x_tensor, y_grad_tensor, x_grad_tensor; - x_tensor.ShareDataWith(*x); - y_grad_tensor.ShareDataWith(*y_grad); - if (x_grad) { - x_grad->mutable_data(ctx.GetPlace()); - x_grad_tensor.ShareDataWith(*x_grad); - } else { - x_grad_tensor.mutable_data(x->dims(), ctx.GetPlace()); + Tensor scale_grad_tmp(scale->type()); + Tensor bias_grad_tmp(bias->type()); + if (d_scale == nullptr) { + scale_grad_tmp.Resize(scale->dims()); + d_scale = &scale_grad_tmp; } - if (data_layout == "NHWC") { - x_tensor.set_layout(DataLayout::kNHWC); - y_grad_tensor.set_layout(DataLayout::kNHWC); - x_grad_tensor.set_layout(DataLayout::kNHWC); + if (d_bias == nullptr) { + bias_grad_tmp.Resize(bias->dims()); + d_bias = &bias_grad_tmp; } - if (!use_global_stats) { - const auto &runner = NpuOpRunner( - "BatchNormGrad", - {y_grad_tensor, x_tensor, *scale, *saved_mean, *saved_variance}, - {x_grad_tensor, scale_grad_tmp, bias_grad_tmp, *saved_mean, - *saved_variance}, // segment fault if no reserve_space_3 and - // reserve_space_4 - {{"epsilon", epsilon}, - {"is_training", true}, - {"data_format", data_layout}}); - auto stream = dev_ctx.stream(); - runner.Run(stream); - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - - const auto &runner = NpuOpRunner( - "BatchNormGrad", - {y_grad_tensor, x_tensor, *scale, *running_mean, *running_var}, - {x_grad_tensor, scale_grad_tmp, bias_grad_tmp, *running_mean, - *running_var}, // segment fault if no reserve_space_3 and - // reserve_space_4 - {{"epsilon", epsilon}, - {"is_training", true}, - {"data_format", data_layout}}); - auto stream = dev_ctx.stream(); - runner.Run(stream); + + auto stream = ctx.template device_context().stream(); + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); + const auto *running_variance = ctx.Input("Variance"); + const auto &runner_update = + NpuOpRunner("BNTrainingUpdateGrad", + {dy_tensor, x_tensor, *running_mean, *running_variance}, 
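// Reference math for the d_scale / d_bias outputs of BNTrainingUpdateGrad in
// this gradient kernel: standard batch-norm parameter gradients for a single
// channel, given only as an illustration (mean/var are the statistics used by
// the forward pass).
//   x_hat   = (x - mean) / sqrt(var + epsilon)
//   d_scale = sum_i(dy_i * x_hat_i),   d_bias = sum_i(dy_i)
#include <cmath>
static void BatchNormParamGradRef(const float* x, const float* dy, int n,
                                  float mean, float var, float epsilon,
                                  float* d_scale, float* d_bias) {
  const float inv_std = 1.0f / std::sqrt(var + epsilon);
  *d_scale = 0.0f;
  *d_bias = 0.0f;
  for (int i = 0; i < n; ++i) {
    *d_scale += dy[i] * (x[i] - mean) * inv_std;
    *d_bias += dy[i];
  }
}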
+ {*d_scale, *d_bias}, {{"epsilon", epsilon}}); + runner_update.Run(stream); + } else { + const auto &runner_update = + NpuOpRunner("BNTrainingUpdateGrad", + {dy_tensor, x_tensor, *saved_mean, *saved_inv_variance}, + {*d_scale, *d_bias}, {{"epsilon", epsilon}}); + runner_update.Run(stream); + } + } + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + Tensor dx_tensor(d_x->type()); + dx_tensor.ShareDataWith(*d_x); + if (use_global_stats) { + const auto *running_var = ctx.Input("Variance"); + const auto &runner_infer = + NpuOpRunner("BNInferGrad", {dy_tensor, *scale, *running_var}, + {dx_tensor}, {{"epsilon", epsilon}}); + runner_infer.Run(stream); + } else { + const auto &runner_reduce = NpuOpRunner( + "BNTrainingReduceGrad", {dy_tensor, x_tensor, *d_scale, *d_bias, + *scale, *saved_mean, *saved_inv_variance}, + {dx_tensor}, {{"epsilon", epsilon}}); + runner_reduce.Run(stream); + } } } }; diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc new file mode 100644 index 0000000000000..aee10dcdc2732 --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#if defined(PADDLE_WITH_NCCL) +#include +#endif +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/threadpool.h" +// #include "paddle/fluid/operators/distributed/distributed.h" +// #include "paddle/fluid/operators/distributed/request_handler_impl.h" +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CCommInitMultiTrainerInferShape : public framework::InferShapeBase { + public: + ~CCommInitMultiTrainerInferShape() {} + void operator()(framework::InferShapeContext* ctx) const override{}; +}; + +class CCommInitMultiTrainerOp : public framework::OperatorBase { + public: + CCommInitMultiTrainerOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + auto var = scope.FindVar(Input("X")); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input X must be provided.")); +#if defined(PADDLE_WITH_NCCL) + ncclUniqueId* nccl_id = var->GetMutable(); + + int ntrainers = Attr("ntrainers"); + int train_id = Attr("trainer_id"); + int rid = Attr("ring_id"); + + std::vector devices = Attr>("devices"); + + if (devices.empty()) { + devices = platform::GetSelectedDevices(); + } + platform::NCCLCommContext::Instance().CreateNCCLCommMultiTrainer( + devices, nccl_id, ntrainers, train_id, rid); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + +class CCommInitMultiTrainerOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Raw variable contains a NCCL UniqueId instaces."); + AddComment(R"DOC( +CCommInitMultiTrainer operator + +Initialize collective communicatoin context within this trainer +)DOC"); + AddAttr("ntrainers", + "(int) The number of trainers of distributed trainers"); + AddAttr("trainer_id", + "(int) The id of the trainer in distributed training."); + AddAttr>("devices", + "(std::vector) which devices does the nccl " + "comm initialized on in each trainer") + .SetDefault({}); + AddAttr("ring_id", "(int default 0) user specified ring id") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_comm_init_multitrainer, ops::CCommInitMultiTrainerOp, + ops::CCommInitMultiTrainerInferShape, + ops::CCommInitMultiTrainerOpMaker); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index bef3826e728fe..a28f32b6abbc3 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -116,6 +116,10 @@ std::vector ConvOp::ComputeOutputShape( "the output channels is %d, the filter's shape is [%s], " "the groups is %d.", filter_dims[0], filter_dims, groups)); + PADDLE_ENFORCE_GT( + filter_dims[0], 0, + platform::errors::InvalidArgument( + "the size of filter at axis 0 should be greater than 0")); framework::DDim in_data_dims; if (channel_last) { diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 91e8c04a3d3d8..a89d2949a2bb4 100644 --- 
a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -232,7 +232,8 @@ class DataNormOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddInput("X", "The input tensor"); AddInput("BatchSize", "BatchSize is a 1-dimensional tensor of size C " diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 0a771627e060f..67f5ee332eeb2 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -126,7 +126,8 @@ __global__ void ModulatedDeformableCol2imGpuKernel( DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); - atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, + weight * cur_top_grad); } } } @@ -748,6 +749,8 @@ namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(deformable_conv, - ops::DeformableConvCUDAKernel); + ops::DeformableConvCUDAKernel, + ops::DeformableConvCUDAKernel); REGISTER_OP_CUDA_KERNEL(deformable_conv_grad, - ops::DeformableConvGradCUDAKernel); + ops::DeformableConvGradCUDAKernel, + ops::DeformableConvGradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cc b/paddle/fluid/operators/deformable_conv_v1_op.cc index dfba2070aac77..090d8a1fab0b9 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cc +++ b/paddle/fluid/operators/deformable_conv_v1_op.cc @@ -307,6 +307,8 @@ REGISTER_OPERATOR(deformable_conv_v1, ops::DeformableConvV1Op, REGISTER_OPERATOR(deformable_conv_v1_grad, ops::DeformableConvV1GradOp); REGISTER_OP_CPU_KERNEL(deformable_conv_v1, - ops::DeformableConvV1CPUKernel); + ops::DeformableConvV1CPUKernel, + ops::DeformableConvV1CPUKernel); REGISTER_OP_CPU_KERNEL(deformable_conv_v1_grad, - ops::DeformableConvV1GradCPUKernel); + ops::DeformableConvV1GradCPUKernel, + ops::DeformableConvV1GradCPUKernel); diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cu b/paddle/fluid/operators/deformable_conv_v1_op.cu index a865766f9adbb..e399a1fafdb71 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cu +++ b/paddle/fluid/operators/deformable_conv_v1_op.cu @@ -99,7 +99,8 @@ __global__ void DeformableCol2imCUDAKernel( DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); - atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, + weight * cur_top_grad); } } } @@ -604,6 +605,8 @@ class DeformableConvV1GradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(deformable_conv_v1, - ops::DeformableConvV1CUDAKernel); + ops::DeformableConvV1CUDAKernel, + ops::DeformableConvV1CUDAKernel); REGISTER_OP_CUDA_KERNEL(deformable_conv_v1_grad, - ops::DeformableConvV1GradCUDAKernel); + ops::DeformableConvV1GradCUDAKernel, + ops::DeformableConvV1GradCUDAKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index d6cf58f7a157f..3614602156f4d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -165,31 +165,39 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { "for broadcasting Y onto X. 
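// One reason the deformable-conv gradient kernels above switch from raw
// atomicAdd to the platform::CudaAtomicAdd wrapper while double kernels are
// being registered: native atomicAdd(double*, double) only exists on compute
// capability 6.0+, so older architectures need a compare-and-swap emulation.
// A sketch of the usual fallback (the CUDA programming guide pattern):
__device__ double AtomicAddDoubleFallback(double* address, double val) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}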
") .SetDefault(-1); AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddAttr("x_data_format", "This parameter is no longer used.") - .SetDefault(""); + .SetDefault("") + .AsExtra(); AddAttr("y_data_format", "This parameter is no longer used.") - .SetDefault(""); + .SetDefault("") + .AsExtra(); AddAttr( "use_quantizer", "(bool, default false) " "This parameter is no longer used. Use 'mkldnn_data_type' instead.") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddAttr( "mkldnn_data_type", "(string, default \"float32\"). Data type of mkldnn kernel") .SetDefault("float32") - .InEnum({"float32", "int8", "bfloat16"}); + .InEnum({"float32", "int8", "bfloat16"}) + .AsExtra(); /* int8 parameters */ AddAttr("Scale_x", "(float, default 1.0f), The quantize scale of X tensor") - .SetDefault(1.0f); + .SetDefault(1.0f) + .AsExtra(); AddAttr("Scale_y", "(float, default 1.0f), The quantize scale of Y tensor") - .SetDefault(1.0f); + .SetDefault(1.0f) + .AsExtra(); AddAttr("Scale_out", "(float, default 1.0f), The quantize scale of output data") - .SetDefault(1.0f); + .SetDefault(1.0f) + .AsExtra(); AddOpComment(); } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 17cf7c762def2..129c90a22be6b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -15,10 +15,14 @@ #pragma once #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" - +#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" namespace paddle { namespace operators { +#define MAX_INPUT_NUM 3 // the max num of ET for BroadcacstConfig + +namespace kps = paddle::operators::kernel_primitives; + struct DimensionsTransform { using DimVector = std::vector; typedef void (*MergeFunctor)(bool &, std::vector &, DimVector &, @@ -161,202 +165,113 @@ struct DimensionsTransform { } }; -struct StridesCalculation { - std::vector> strides; - std::vector divmoders; - - private: - // To calculate the strides of each input_tensor. - __inline__ void CalculateStrides( - int N, int dim_size, const std::vector> &in_dims) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < dim_size; ++i) { - strides[j][i] = in_dims[j][i] == 1 ? 0 : strides[j][i]; - strides[j][i] = - (i != 0 && strides[j][i] != 0) - ? 
std::accumulate(in_dims[j].begin(), in_dims[j].begin() + i, 1, - std::multiplies()) - : strides[j][i]; - } - } - } - - public: - explicit StridesCalculation(const int64_t &dim_size, - const std::vector> &in_dims, - const std::vector &out_dims) { - const auto N = in_dims.size(); - divmoders.resize(dim_size); - strides.resize(N, std::vector(dim_size, 1)); - - for (int i = 0; i < dim_size; ++i) { - divmoders[i] = platform::FastDivMod(out_dims[i]); - } - CalculateStrides(N, dim_size, in_dims); - } -}; - -template -struct BroadcastArgsWrapper { - using InVecType = platform::AlignedVector; - using OutVecType = platform::AlignedVector; - - OutT *out_data; - OutVecType *vec_out_data; - const InT *__restrict__ in_data[ET]; - const InVecType *__restrict__ vec_in_data[ET]; - bool no_broadcast[ET]; - platform::FastDivMod divmoders[kDims]; - uint32_t strides[ET][framework::DDim::kMaxRank]; - uint32_t scalar_cal_offset; - Functor func; - - HOSTDEVICE BroadcastArgsWrapper( - const std::vector &ins, framework::Tensor *out, - int scalar_cal_offset, Functor func, - const StridesCalculation &offset_calculator) - : scalar_cal_offset(scalar_cal_offset), func(func) { - for (int j = 0; j < ET; ++j) { - in_data[j] = ins[j]->data(); - vec_in_data[j] = reinterpret_cast(in_data[j]); - no_broadcast[j] = ins[j]->dims() == out->dims() ? true : false; - memcpy(strides[j], offset_calculator.strides[j].data(), - kDims * sizeof(uint32_t)); - } - out_data = out->data(); - vec_out_data = reinterpret_cast(out_data); - memcpy(divmoders, offset_calculator.divmoders.data(), - kDims * sizeof(platform::FastDivMod)); - } - - __device__ __forceinline__ uint32_t GetOffsetByDivmod(int idx, int in_idx) { - uint32_t offset = 0; - -#pragma unroll(kDims) - for (int i = 0; i < kDims; ++i) { - auto fast_divmoder = divmoders[i].Divmod(idx); - idx = fast_divmoder.val[0]; - offset += fast_divmoder.val[1] * strides[in_idx][i]; - } - return offset; - } - - __device__ __forceinline__ void LoadVectorizedDataCommon( - InVecType *vector_args, int tid, int idx) { - *vector_args = vec_in_data[idx][tid]; - } - - __device__ __forceinline__ void LoadVectorizedDataByDivmod(InT *scalar_args, - int tid, int idx) { - int index = tid * VecSize; -#pragma unroll(VecSize) - for (int i = 0; i < VecSize; ++i) { - uint32_t offset = GetOffsetByDivmod(index + i, idx); - scalar_args[i] = in_data[idx][offset]; - } - } - - __device__ __forceinline__ void LoadScalarizedDataCommon(InT args[], int tid, - int idx) { - args[idx] = in_data[idx][tid + scalar_cal_offset]; - } - - __device__ __forceinline__ void LoadScalarizedDataByDivmod(InT args[], - int tid, int idx) { - auto offset = GetOffsetByDivmod(tid + scalar_cal_offset, idx); - args[idx] = in_data[idx][offset]; - } - - __device__ __forceinline__ void LoadVectorizedData(InT (*args)[VecSize], - int tid) { -#pragma unroll(ET) - for (int j = 0; j < ET; ++j) { - if (no_broadcast[j]) { - InVecType *vector_args = reinterpret_cast(args[j]); - LoadVectorizedDataCommon(vector_args, tid, j); - } else { - LoadVectorizedDataByDivmod(args[j], tid, j); - } - } +template +__device__ __forceinline__ void LoadData( + T *dst, const T *__restrict__ src, uint32_t block_offset, + const kps::details::BroadcastConfig &config, int numel, int num, + bool need_broadcast) { + // numel : whole num of output + // num: how many data will be deal with in this time + if (need_broadcast) { + kps::ReadDataBc( + dst, src, block_offset, config, numel, 1, 1); + } else { + kps::ReadData(dst, src + block_offset, num); } +} - __device__ __forceinline__ void 
LoadScalarizedData(InT args[], int tid) { -#pragma unroll(ET) - for (int j = 0; j < ET; ++j) { - if (no_broadcast[j]) { - LoadScalarizedDataCommon(args, tid, j); - } else { - LoadScalarizedDataByDivmod(args, tid, j); - } - } +template +__device__ void DealSegment( + const framework::Array &in, OutT *out, + const framework::Array &use_broadcast, uint32_t numel, + const framework::Array, + MAX_INPUT_NUM> &configlists, + int num, Functor func) { + InT args[ET][VecSize]; + OutT result[VecSize]; + int block_offset = blockIdx.x * blockDim.x * VecSize; +// load +#pragma unroll + for (int i = 0; i < ET; i++) { + kps::Init(args[i], static_cast(1.0f)); + LoadData(args[i], in[i], block_offset, + configlists[i], numel, num, + use_broadcast[i]); } - - __device__ __forceinline__ void StoreVectorizedData(OutVecType vec_args_out, - int tid) { - vec_out_data[tid] = vec_args_out; + // compute + if (ET == kUnary) { + kps::ElementwiseUnary(result, args[0], + func); + } else if (ET == kBinary) { + kps::ElementwiseBinary(result, args[0], + args[1], func); + } else { + kps::ElementwiseTernary( + result, args[0], args[1], args[2], func); } + // compute + kps::WriteData(out + block_offset, result, + num); +} - __device__ __forceinline__ void StoreScalarizedData(OutT args_out, int tid) { - out_data[scalar_cal_offset + tid] = args_out; +template +__global__ void BroadcastKernel( + framework::Array in, OutT *out, + framework::Array use_broadcast, uint32_t numel, + framework::Array, MAX_INPUT_NUM> + configlists, + int main_tid, int tail_tid, Functor func) { + int block_offset = blockIdx.x * blockDim.x * VecSize; + // data offset of this block + if (blockIdx.x < main_tid) { + int num = blockDim.x * VecSize; // blockIdx.x < main_tid + DealSegment( + in, out, use_broadcast, numel, configlists, num, func); + } else { // reminder + int num = tail_tid; + DealSegment( + in, out, use_broadcast, numel, configlists, num, func); } -}; - -template -__device__ inline void ScalarizedBroadcastKernelImpl( - BroadcastArgsWrapper broadcast_wrapper, int tid) { - InT args[ET]; - OutT args_out; - broadcast_wrapper.LoadScalarizedData(args, tid); - - // Calcualtion of the in_tensor data. - args_out = broadcast_wrapper.func(args); - - broadcast_wrapper.StoreScalarizedData(args_out, tid); } -template -__device__ inline void VectorizedBroadcastKernelImpl( - BroadcastArgsWrapper broadcast_wrapper, int tid) { - using OutVecType = platform::AlignedVector; - OutVecType args_out; - InT ins[ET]; - InT args[ET][VecSize]; - broadcast_wrapper.LoadVectorizedData(args, tid); +template +void LaunchKernel(const platform::CUDADeviceContext &ctx, + const std::vector &ins, + framework::Tensor *out, Functor func, + DimensionsTransform merge_dims) { + int numel = out->numel(); + const int threads = 256; + int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; -#pragma unroll(VecSize) - for (int i = 0; i < VecSize; ++i) { -#pragma unroll(ET) - for (int j = 0; j < ET; ++j) { - ins[j] = args[j][i]; + int main_tid = numel / (VecSize * threads); + int tail_tid = numel % (VecSize * threads); + auto stream = ctx.stream(); + OutT *out_data = out->data(); + + framework::Array, MAX_INPUT_NUM> + configlists; + framework::Array use_broadcast; + framework::Array ins_data; + + for (int i = 0; i < ET; i++) { + use_broadcast[i] = (ins[i]->numel() != numel); + ins_data[i] = ins[i]->data(); + if (use_broadcast[i]) { + // get the broadcast config, + // if data shape is[m, n], then you should set data_dim = {n, m} + // eg: out's shape [3, 45, 1]. 
then out_dims = {1, 45, 3} + configlists[i] = kps::details::BroadcastConfig( + merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); } - args_out.val[i] = broadcast_wrapper.func(ins); } - broadcast_wrapper.StoreVectorizedData(args_out, tid); -} -template -__global__ void ElementwiseBroadcastKernel( - BroadcastArgsWrapper broadcast_wrapper, int main_tid, int tail_tid) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - // Vectorized calculation of major data whose length is the max multipler of - // VecSize, - // eg: Calcualting the front 1024-length data in total 1027 data once VecSize - // is 4. - if (tid < main_tid) { - VectorizedBroadcastKernelImpl( - broadcast_wrapper, tid); - } - // Scalarzed calculation of rest data whose lenght cannot fulfill VecSize. - // eg: Calcualting the rest 3-length data in total 1027 data once VecSize is - // 4. - if (tid < tail_tid) { - ScalarizedBroadcastKernelImpl( - broadcast_wrapper, tid); - } + BroadcastKernel<<>>( + ins_data, out_data, use_broadcast, numel, configlists, main_tid, tail_tid, + func); } template &ins, framework::Tensor *out, int axis, Functor func) { - int numel = out->numel(); - int threads = GetThreadsConfig(ctx, numel, VecSize); - int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; - int main_tid = numel / VecSize; - int tail_tid = numel % VecSize; - int vec_len = main_tid * VecSize; - auto stream = ctx.stream(); - const auto merge_dims = DimensionsTransform(ins, out->dims(), axis); - const auto offset_calculator = StridesCalculation( - merge_dims.dim_size, merge_dims.in_dims, merge_dims.out_dims); +#define DIM_SIZE(size) \ + case size: { \ + LaunchKernel(ctx, ins, out, func, \ + merge_dims); \ + } break; switch (merge_dims.dim_size) { - case 1: { - auto broadcast_wrapper = - BroadcastArgsWrapper( - ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_wrapper, main_tid, tail_tid); - break; - } - case 2: { - auto broadcast_wrapper = - BroadcastArgsWrapper( - ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_wrapper, main_tid, tail_tid); - break; - } - case 3: { - auto broadcast_wrapper = - BroadcastArgsWrapper( - ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_wrapper, main_tid, tail_tid); - break; - } - case 4: { - auto broadcast_wrapper = - BroadcastArgsWrapper( - ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_wrapper, main_tid, tail_tid); - break; - } - case 5: { - auto broadcast_wrapper = - BroadcastArgsWrapper( - ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_wrapper, main_tid, tail_tid); - break; - } - case 6: { - auto broadcast_wrapper = - BroadcastArgsWrapper( - ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_wrapper, main_tid, tail_tid); - break; - } - case 7: { - auto broadcast_wrapper = - BroadcastArgsWrapper( - ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_wrapper, main_tid, tail_tid); - break; - } - case 8: { - auto broadcast_wrapper = - BroadcastArgsWrapper( - ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_wrapper, main_tid, tail_tid); - break; - } - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "The maximum dimension of input tensor is expected to be less than " - "%d, but recieved %d.\n", - merge_dims.dim_size, 
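// Both the removed StridesCalculation/GetOffsetByDivmod path and the new
// kps::details::BroadcastConfig rely on the same index mapping: input
// dimensions of size 1 get stride 0, and the linear output index is
// decomposed dimension by dimension to accumulate the input offset. A
// host-side sketch of that mapping (plain division in place of FastDivMod;
// dims ordered fastest-varying first, matching the reversed-dims comment
// above):
#include <cstddef>
#include <vector>
static int BroadcastInputOffset(int out_idx, const std::vector<int>& out_dims,
                                const std::vector<int>& in_strides) {
  int offset = 0;
  for (std::size_t i = 0; i < out_dims.size(); ++i) {
    const int coord = out_idx % out_dims[i];
    out_idx /= out_dims[i];
    offset += coord * in_strides[i];  // stride is 0 on broadcast dimensions
  }
  return offset;
}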
framework::DDim::kMaxRank)); - } + DIM_SIZE(1); + DIM_SIZE(2); + DIM_SIZE(3); + DIM_SIZE(4); + DIM_SIZE(5); + DIM_SIZE(6); + DIM_SIZE(7); + DIM_SIZE(8); } +#undef DIM_SIZE } template @@ -528,5 +369,7 @@ void LaunchElementwiseCudaKernel( } } +#undef MAX_INPUT_NUM + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 1b680cfc995a5..e591b145d2388 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/fast_divmod.h" @@ -26,6 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { +namespace kps = paddle::operators::kernel_primitives; enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3 }; /* @@ -67,121 +69,74 @@ int GetVectorizedSizeForIO(const std::vector &ins, return vec_size; } -template -struct ElementwiseDataWrapper { - using InVecType = platform::AlignedVector; - using OutVecType = platform::AlignedVector; - - const InT *__restrict__ in_data[ET]; - OutT *out_data; - uint32_t scalar_cal_offset; - - HOSTDEVICE ElementwiseDataWrapper( - const std::vector &ins, - std::vector *outs, uint32_t scalar_cal_offset) - : scalar_cal_offset(scalar_cal_offset) { -#pragma unroll - for (int i = 0; i < ET; ++i) { - in_data[i] = ins[i]->data(); - } - out_data = (*outs)[0]->data(); - } - - inline __device__ void LoadVectorizedData(InVecType vec_args[], int tid) { -#pragma unroll - for (int i = 0; i < ET; ++i) { - const InVecType *in_vec_data = - reinterpret_cast(in_data[i]); - vec_args[i] = in_vec_data[tid]; - } - } - - inline __device__ void LoadScalarizedData(InT args[], int tid) { +template +__device__ void DealSegment( + const framework::Array &in, OutT *out, int num, + Functor func) { + int data_offset = VecSize * blockIdx.x * blockDim.x; + InT args[ET][VecSize]; + OutT result[VecSize]; +// load data #pragma unroll - for (int i = 0; i < ET; ++i) { - args[i] = in_data[i][tid + scalar_cal_offset]; - } - } - - inline __device__ void StoreVectorizedData(OutVecType res, int tid) { - OutVecType *out_vec = reinterpret_cast(out_data); - out_vec[tid] = res; - } - - inline __device__ void StoreScalarizedData(OutT res, int tid) { - out_data[tid + scalar_cal_offset] = res; + for (int i = 0; i < ET; i++) { + kps::Init(args[i], static_cast(1.0f)); + kps::ReadData(args[i], in[i] + data_offset, + num); } -}; - -template -__device__ inline void VectorizedKernelImpl(ElementwiseWrapper data, - Functor func, int tid) { - using InVecType = platform::AlignedVector; - using OutVecType = platform::AlignedVector; - InVecType ins_vec[ET]; - OutVecType out_vec; - InT *ins_ptr[ET]; - InT ins[ET]; -#pragma unroll - for (int i = 0; i < ET; ++i) { - ins_ptr[i] = reinterpret_cast(&(ins_vec[i])); - } - // load - data.LoadVectorizedData(ins_vec, tid); -// compute -#pragma unroll - for (int i = 0; i < VecSize; ++i) { -#pragma unroll - for (int j = 0; j < ET; ++j) { - ins[j] = ins_ptr[j][i]; - } - out_vec.val[i] = func(ins); + // compute + if (ET == kUnary) { + kps::ElementwiseUnary(result, args[0], + func); + } else if (ET == kBinary) { + kps::ElementwiseBinary(result, args[0], + args[1], func); + } else { + 
kps::ElementwiseTernary( + result, args[0], args[1], args[2], func); } - // store - data.StoreVectorizedData(out_vec, tid); -} -template -__device__ inline void ScalarKernelImpl(ElementwiseWrapper data, Functor func, - int tid) { - InT ins[ET]; - OutT out; - - // load - data.LoadScalarizedData(ins, tid); - // compute - out = func(ins); // store - data.StoreScalarizedData(out, tid); + kps::WriteData(out + data_offset, result, + num); } -template -__global__ void VectorizedKernel(ElementwiseWrapper data, int main_tid, - int tail_tid, Functor func) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (tid < main_tid) { - VectorizedKernelImpl( - data, func, tid); - } - if (tid < tail_tid) { - ScalarKernelImpl(data, func, - tid); +template +__global__ void ElementVectorizeKernel( + framework::Array in, OutT *out, int size, + Functor func) { + int data_offset = VecSize * blockIdx.x * blockDim.x; + int num = size - data_offset; + // the num this time have to deal with + if (VecSize * blockDim.x > num) { // reminder segment + DealSegment(in, out, num, func); + } else { // complete segment + DealSegment(in, out, num, func); } } -template -__global__ void ScalarKernel(ElementwiseWrapper data, int numel, Functor func) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < numel) { - ScalarKernelImpl(data, func, - tid); +template +void ElementwiseCudaKernel(const platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { + auto numel = ins[0]->numel(); + int block_size = GetThreadsConfig(ctx, numel, VecSize); + int grid_size = + ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; + + auto stream = ctx.stream(); + OutT *out = (*outs)[0]->data(); + framework::Array in; + for (int i = 0; i < ET; i++) { + in[i] = ins[i]->data(); } + ElementVectorizeKernel<<>>( + in, out, numel, func); } template @@ -190,43 +145,17 @@ void LaunchSameDimsElementwiseCudaKernel( const std::vector &ins, std::vector *outs, Functor func) { // calculate the max vec_size for all ins and outs - auto numel = ins[0]->numel(); int vec_size = GetVectorizedSizeForIO(ins, *outs); - int block_size = GetThreadsConfig(ctx, numel, vec_size); - int grid_size = - ((numel + vec_size - 1) / vec_size + block_size - 1) / block_size; - int main_tid = numel / vec_size; - int tail_tid = numel % vec_size; - uint32_t vec_len = main_tid * vec_size; - - // cuda kernel - auto stream = ctx.stream(); - switch (vec_size) { - case 4: { - auto data_wrapper = - ElementwiseDataWrapper(ins, outs, vec_len); - VectorizedKernel<<>>( - data_wrapper, main_tid, tail_tid, func); + case 4: + ElementwiseCudaKernel(ctx, ins, outs, func); break; - } - case 2: { - auto data_wrapper = - ElementwiseDataWrapper(ins, outs, vec_len); - VectorizedKernel<<>>( - data_wrapper, main_tid, tail_tid, func); + case 2: + ElementwiseCudaKernel(ctx, ins, outs, func); break; - } - case 1: { - auto data_wrapper = - ElementwiseDataWrapper(ins, outs, 0); - ScalarKernel<<>>(data_wrapper, - numel, func); + case 1: + ElementwiseCudaKernel(ctx, ins, outs, func); break; - } default: { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported vectorized size: %d !", vec_size)); diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 778bab9f4dd26..0858a43838b96 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -188,8 +188,8 @@ class Flatten2Op : public framework::OperatorWithKernel { // are the same. 
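// Launch-configuration arithmetic used by ElementwiseCudaKernel above, as a
// host-side sketch. Each thread handles VecSize elements, so the grid size is
// a double ceil-division; a block whose remaining element count is smaller
// than blockDim.x * VecSize takes the bounds-checked remainder branch in
// ElementVectorizeKernel.
static inline int ElementwiseGridSize(long long numel, int block_size,
                                      int vec_size) {
  const long long vectorized = (numel + vec_size - 1) / vec_size;  // ceil
  return static_cast<int>((vectorized + block_size - 1) / block_size);
}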
ctx->ShareLoD("X", "Out"); } - - OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2"); + if (!ctx->HasOutput("XShape")) return; + // OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2"); std::vector xshape_dims(in_dims.size() + 1); xshape_dims[0] = 0; for (int i = 0; i < in_dims.size(); ++i) { @@ -207,7 +207,8 @@ class Flatten2OpMaker : public FlattenOpMaker { AddOutput("XShape", "XShape is just used to store the shape and lod of X, which will " "be used in FlattenGradOp.") - .AsIntermediate(); + .AsIntermediate() + .AsExtra(); } }; @@ -281,8 +282,8 @@ class FlattenContiguousRangeOp : public framework::OperatorWithKernel { // are the same. ctx->ShareLoD("X", "Out"); } - - OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2"); + if (!ctx->HasOutput("XShape")) return; + // OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2"); std::vector xshape_dims(in_dims.size() + 1); xshape_dims[0] = 0; for (int i = 0; i < in_dims.size(); ++i) { @@ -361,7 +362,8 @@ Case 2: AddOutput("XShape", "XShape is just used to store the shape and lod of X, which will " "be used in FlattenGradOp.") - .AsIntermediate(); + .AsIntermediate() + .AsExtra(); } }; diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc b/paddle/fluid/operators/log_softmax_op_npu.cc index d955bef6ce2ac..a2c3a1b323af9 100644 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ b/paddle/fluid/operators/log_softmax_op_npu.cc @@ -14,9 +14,13 @@ #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/operators/npu_op_runner.h" + namespace paddle { namespace operators { -template + +using NPUDeviceContext = platform::NPUDeviceContext; + +template class LogSoftmaxNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -24,22 +28,47 @@ class LogSoftmaxNPUKernel : public framework::OpKernel { auto* Out = ctx.Output("Out"); const int rank = X->dims().size(); const int axis = CanonicalAxis(ctx.Attr("axis"), rank); - std::vector axes; - axes.push_back(axis); - framework::NPUAttributeMap attr_input = {{"axes", axes}}; Out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("LogSoftmaxV2", {*X}, {*Out}, attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + + if (X->numel() != 0) { + auto stream = ctx.template device_context().stream(); + const auto& runner = NpuOpRunner("LogSoftmaxV2", {*X}, {*Out}, + {{"axes", std::vector{axis}}}); + runner.Run(stream); + } } }; + +template +class LogSoftmaxGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* Out = ctx.Input("Out"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + const int rank = dOut->dims().size(); + const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + + // allocate memory on device. 
+ dX->mutable_data(ctx.GetPlace()); + + if (dOut->numel() != 0) { + auto stream = ctx.template device_context().stream(); + const auto& runner = NpuOpRunner("LogSoftmaxGrad", {*dOut, *Out}, {*dX}, + {{"axis", std::vector{axis}}}); + runner.Run(stream); + } + } +}; + } // namespace operators } // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - log_softmax, - ops::LogSoftmaxNPUKernel); +REGISTER_OP_NPU_KERNEL(log_softmax, ops::LogSoftmaxNPUKernel, + ops::LogSoftmaxNPUKernel); + +REGISTER_OP_NPU_KERNEL(log_softmax_grad, ops::LogSoftmaxGradNPUKernel, + ops::LogSoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index c65fa6340708a..387cd92b69f92 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -29,11 +29,6 @@ class LookupTableV2NPUKernel : public framework::OpKernel { auto *output_t = ctx.Output("Out"); // float tensor auto *table_t = ctx.Input("W"); - // It seems cann 20.1 accepts int64, but cann 20.2+ not. - PADDLE_ENFORCE_EQ(ids_t->type(), framework::proto::VarType::INT32, - platform::errors::Unimplemented( - "The index of LookupTableV2 should be int32.")); - auto *table_var = ctx.InputVar("W"); PADDLE_ENFORCE_EQ( table_var->IsType(), true, diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 78747108d44f5..c0d813ccc215e 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -715,53 +715,66 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr( "use_mkldnn", "(bool, default false) Indicates if MKL-DNN kernel will be used") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddAttr>("fused_reshape_X", R"DOC(Shape of fused reshape of `X` input.)DOC") - .SetDefault({}); + .SetDefault({}) + .AsExtra(); AddAttr>("fused_reshape_Y", R"DOC(Shape of fused reshape of `Y` input.)DOC") - .SetDefault({}); + .SetDefault({}) + .AsExtra(); AddAttr>("fused_transpose_X", R"DOC(Axis of fused transpose of `X` input.)DOC") - .SetDefault({}); + .SetDefault({}) + .AsExtra(); AddAttr>("fused_transpose_Y", R"DOC(Axis of fused transpose of `Y` input.)DOC") - .SetDefault({}); + .SetDefault({}) + .AsExtra(); AddAttr>( "fused_reshape_Out", R"DOC(When MKLDNN MatMul_transpose_reshape fuse activated, " "it's a shape atribute of fused reshape for `Out` output.)DOC") - .SetDefault({}); + .SetDefault({}) + .AsExtra(); AddAttr>( "fused_transpose_Out", R"DOC(When MKLDNN MatMul_transpose_reshape fuse activated, " "it's a axis atribute of fused transpose for `Out` output.)DOC") - .SetDefault({}); + .SetDefault({}) + .AsExtra(); AddAttr( "use_quantizer", "(bool, default false) " "This parameter is no longer used. Use 'mkldnn_data_type' instead.") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddAttr( "mkldnn_data_type", "(string, default \"float32\"). 
Data type of mkldnn kernel") .SetDefault("float32") - .InEnum({"float32", "int8", "bfloat16"}); + .InEnum({"float32", "int8", "bfloat16"}) + .AsExtra(); /* int8 parameters */ AddAttr("Scale_x", "(float, default 1.0f), The quantize scale of X tensor") - .SetDefault(1.0f); + .SetDefault(1.0f) + .AsExtra(); AddAttr("Scale_y", "(float, default 1.0f), The quantize scale of Y tensor") - .SetDefault(1.0f); + .SetDefault(1.0f) + .AsExtra(); AddAttr("Scale_out", "(float, default 1.0f), The quantize scale of output data") - .SetDefault(1.0f); + .SetDefault(1.0f) + .AsExtra(); AddAttr("force_fp32_output", "(bool, default false) Force INT8 kernel output FP32, only " "used in MKL-DNN INT8") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ !defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 4ec9a052bb2e5..b875149ec63c8 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -141,12 +141,14 @@ class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddAttr( "mkldnn_data_type", "(string, default \"float32\"). Data type of mkldnn kernel") .SetDefault("float32") - .InEnum({"float32", "bfloat16"}); + .InEnum({"float32", "bfloat16"}) + .AsExtra(); AddComment( R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index 41b8b367918f8..3158b0963a43a 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -131,7 +131,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel, plat::float16, ops::MemcpyD2HKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool, diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index e439be1620183..f100dc6f7a53e 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -131,7 +131,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel, plat::float16, ops::MemcpyH2DKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool, diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index ecd2d48dcbd10..56eee13cb060a 100644 --- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -141,7 +141,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, ops::MemcpyKernel, plat::float16, ops::MemcpyKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, 
ops::MemcpyKernel, int, ops::MemcpyKernel, int64_t, ops::MemcpyKernel, bool, diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 35f93eba690e8..723c3c8352d54 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h" +#include using dnnl::memory; using dnnl::primitive; @@ -20,6 +21,7 @@ using paddle::framework::DataLayout; using paddle::framework::ExecutionContext; using paddle::framework::vectorize; using paddle::platform::GetMKLDNNFormat; +using paddle::platform::MKLDNNFormatForSize; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; using paddle::platform::to_void_cast; @@ -82,15 +84,39 @@ static Tensor FoldFirstAndLastDims(const MKLDNNDeviceContext& dev_ctx, } template +constexpr bool IsInt8() { + return std::is_same::value || std::is_same::value; +} + +template +constexpr bool IsBfloat16() { + return std::is_same::value; +} + +// Get row matrix shape from a vector shape. If the rank of x_dim > 1, the +// original x_dim is returned. +static paddle::framework::DDim RowMatrixDimsFromVector( + const paddle::framework::DDim& x_dim) { + return x_dim.size() > 1 ? x_dim : paddle::framework::make_ddim({1, x_dim[0]}); +} + +// Get column matrix shape from a vector shape. If the ran of y_dim > 1, the +// original y_dim is returned. +static paddle::framework::DDim ColumnMatrixDimsFromVector( + const paddle::framework::DDim& y_dim) { + return y_dim.size() > 1 ? y_dim : paddle::framework::make_ddim({y_dim[0], 1}); +} + +template class MatMulMKLDNNHandler - : public paddle::platform::MKLDNNHandlerNoCachingT { + : public paddle::platform::MKLDNNHandlerNoCachingT { public: MatMulMKLDNNHandler(const mkldnn::engine engine, paddle::platform::Place cpu_place, Tensor* x, bool trans_x, Tensor* y, bool trans_y, Tensor* out, float scale) - : paddle::platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { + : paddle::platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { auto mat_dim_x = paddle::operators::math::CreateMatrixDescriptor(x->dims(), 0, trans_x); auto mat_dim_y = @@ -115,117 +141,98 @@ class MatMulMKLDNNHandler !trans_y ? 
memory::dims{N * K, N, 1} : memory::dims{N * K, 1, K}; memory::dims out_strides = memory::dims{M * N, N, 1}; - auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_dims, MKLDNNGetDataType(), out_strides); + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_dims, MKLDNNGetDataType(), out_strides); dnnl::primitive_attr attrs; if (scale != 1.0f) attrs.set_output_scales(0, {scale}); this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); } + // Constructor for FWD MatMul + MatMulMKLDNNHandler(const mkldnn::engine engine, const ExecutionContext& ctx, + float scale) + : paddle::platform::MKLDNNHandlerNoCachingT( + engine, ctx.GetPlace()), + matmul_dims_(GetMatmulDims(ctx)) { + dnnl::primitive_attr attr; + float scale_out = ComputeOutputScale(ctx); + if (scale_out != 1.0f) { + constexpr unsigned tensor_wide_scale = 0; + attr.set_output_scales(tensor_wide_scale, {scale_out}); + } + + auto x_md = memory::desc(matmul_dims_.x_dims, MKLDNNGetDataType(), + matmul_dims_.x_strides); + auto y_md = memory::desc(matmul_dims_.y_dims, MKLDNNGetDataType(), + matmul_dims_.y_strides); + auto out_md = memory::desc(matmul_dims_.out_dims, MKLDNNGetDataType(), + matmul_dims_.out_strides); + this->AcquireForwardPrimitiveDescriptor(attr, x_md, y_md, out_md); + } std::shared_ptr AcquireWeightsMemory(const Tensor* input) { - const T* input_data = input->data(); + const YT* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), - to_void_cast(input_data)); + to_void_cast(input_data)); } -}; -template -constexpr bool IsInt8() { - return std::is_same::value || std::is_same::value; -} - -template -constexpr bool IsBfloat16() { - return std::is_same::value; -} - -// Get row matrix shape from a vector shape. If the rank of x_dim > 1, the -// original x_dim is returned. -static paddle::framework::DDim RowMatrixDimsFromVector( - const paddle::framework::DDim& x_dim) { - return x_dim.size() > 1 ? x_dim : paddle::framework::make_ddim({1, x_dim[0]}); -} - -// Get column matrix shape from a vector shape. If the ran of y_dim > 1, the -// original y_dim is returned. -static paddle::framework::DDim ColumnMatrixDimsFromVector( - const paddle::framework::DDim& y_dim) { - return y_dim.size() > 1 ? y_dim : paddle::framework::make_ddim({y_dim[0], 1}); -} - -/** - * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. - * - * The shape would be [BatchSize, H, W] or [H, W]. - * If transposed, `H,W` will be swapped. 
- */ -static void ReshapeTensorToMatrixSequence( - Tensor* x, const paddle::operators::math::MatDescriptor& descriptor) { - int64_t h, w; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - if (descriptor.batch_size_) { - x->Resize({descriptor.batch_size_, h, w}); - } else { - x->Resize({h, w}); - } -} + public: + void Execute(const paddle::framework::Tensor* x, + const paddle::framework::Tensor* y, + paddle::framework::Tensor* out) { + const auto src_memory_p = this->AcquireSrcMemory(x); + const auto weights_memory_p = this->AcquireWeightsMemory(y); + const auto dst_memory_p = this->AcquireDstMemory(out); + + auto matmul_p = this->AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + + // Simulate batch matmul by processing in loop + void* x_ptr = src_memory_p->get_data_handle(); + void* y_ptr = weights_memory_p->get_data_handle(); + void* out_ptr = dst_memory_p->get_data_handle(); + auto offsets = this->GetOffsets(); + for (uint16_t i = 0; i < this->GetBatchSize(); ++i) { + src_memory_p->set_data_handle(x_ptr); + weights_memory_p->set_data_handle(y_ptr); + dst_memory_p->set_data_handle(out_ptr); + matmul_p->execute(astream, { + {MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}, + }); + x_ptr = static_cast(x_ptr) + std::get<0>(offsets); + y_ptr = static_cast(y_ptr) + std::get<1>(offsets); + out_ptr = static_cast(out_ptr) + std::get<2>(offsets); + } + astream.wait(); -/** - * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor - * Out = matmul(x, y) - * - * This method will first calculate X,Y matrix sequence, and then calculate - * the out shape. - * - * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] - * The out = [BatchSize, H1, W2] - * - * If there is no batch size in `X` and `Y`, the out will be [H1, W2] - * If any of `X` and `Y` has batch size BatchSize, the out will have the - * BatchSize. 
- */ -static void ReshapeXYOutToMatrixSequence(Tensor* x, Tensor* y, Tensor* out, - bool trans_x, bool trans_y) { - auto x_dim = RowMatrixDimsFromVector(x->dims()); - auto y_dim = ColumnMatrixDimsFromVector(y->dims()); - auto mat_dim_x = - paddle::operators::math::CreateMatrixDescriptor(x_dim, 0, trans_x); - auto mat_dim_y = - paddle::operators::math::CreateMatrixDescriptor(y_dim, 0, trans_y); - if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { - out->Resize({mat_dim_x.height_, mat_dim_y.width_}); - } else { - out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, mat_dim_y.width_}); + auto format = + MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw); + out->set_format(format); + out->set_layout(DataLayout::kMKLDNN); } - ReshapeTensorToMatrixSequence(x, mat_dim_x); - ReshapeTensorToMatrixSequence(y, mat_dim_y); -} - -template -class MatMulFactory { - public: - void CreateAndExecute(const ExecutionContext& ctx) { - SetDNNLEngine(ctx); - if (IsInitialized()) { - UpdateDataPointers(ctx); - Execute(); - SetOutputFormat(ctx); - return; - } - CreateMemories(ctx); - CreatePrimitive(ctx); - Execute(); - SetOutputFormat(ctx); - SetInitialized(); + std::shared_ptr AcquireDstMemory( + paddle::framework::Tensor* output) { + // We cannot use base AcquireDstMemory as it makes an allocation request + // base on DST memory primitive size. This is fine in general, but in MatMul + // we have primitive that covers only one batch of Data and then shift + // pointer for every new batch. Hence Tensor size is bigger that dst memory + // primitive size. So would we request less memory that is there and it + // triggers an + // assertion. So as there is no 'any' format here we can leave default size + // of Tensor as computed in ComputeInferShape + OT* ptr = output->mutable_data(this->place_); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } private: @@ -234,47 +241,6 @@ class MatMulFactory { out_strides; }; - void SetDNNLEngine(const ExecutionContext& ctx) { - auto& dev_ctx = ctx.template device_context(); - engine_ = dev_ctx.GetEngine(); - } - - template - dnnl::memory CreateMemory(const memory::dims& dims, - const memory::dims& strides, const T* data) { - auto md = memory::desc(dims, MKLDNNGetDataType(), strides); - return dnnl::memory(md, engine_, to_void_cast(data)); - } - - std::vector Transpose(const std::vector& x, - const std::vector& axis) { - size_t in_rank = x.size(); - size_t axis_size = axis.size(); - - auto axis_set = std::set(axis.begin(), axis.end()); - PADDLE_ENFORCE_EQ(axis_set.size(), axis_size, - paddle::platform::errors::InvalidArgument( - "In an axis array, elements must be unique.")); - - PADDLE_ENFORCE_EQ(in_rank, axis_size, - paddle::platform::errors::InvalidArgument( - "The input dimension's size " - "should be equal to the axis's size. 
" - "But received dimension is %d, " - "axis's size is %d", - in_rank, axis_size)); - - PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), axis_size, - paddle::platform::errors::InvalidArgument( - "Axis values must be ranging from 0 to (dims - 1).")); - - std::vector new_x(x.size()); - for (size_t i = 0; i < x.size(); i++) { - new_x[i] = x[axis[i]]; - } - return new_x; - } - std::pair GetInputDimsAndStrides(const ExecutionContext& ctx, std::string input_name) { auto shape = ctx.Attr>("fused_reshape_" + input_name); @@ -310,6 +276,15 @@ class MatMulFactory { return std::make_pair(mat_dim, strides); } + float ComputeOutputScale(const ExecutionContext& ctx) { + float scale_x = ctx.Attr("Scale_x"); + float scale_y = ctx.Attr("Scale_y"); + bool force_fp32_out = ctx.Attr("force_fp32_output"); + float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); + float alpha = ctx.Attr("alpha"); + return alpha * scale_out / (scale_x * scale_y); + } + bool IsInputFused(const ExecutionContext& ctx) const { return !(ctx.Attr>("fused_reshape_X").empty() && ctx.Attr>("fused_reshape_Y").empty()); @@ -322,14 +297,6 @@ class MatMulFactory { return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); } - void CorrectStridesWhenFloatOutputFused(const ExecutionContext& ctx, - const memory::dim N, memory::dim b, - memory::dims* out_strides) const { - if (!IsInt8() && !IsBfloat16() && IsOutputFused(ctx)) { - *out_strides = {N, b * N, 1}; - } - } - MatMulDims GetMatmulDims(const ExecutionContext& ctx) { paddle::operators::math::MatDescriptor mat_dim_x; memory::dims strides_x; @@ -381,125 +348,112 @@ class MatMulFactory { return {x_dims, y_dims, out_dims, strides_x, strides_y, out_strides}; } - void CreateMemories(const ExecutionContext& ctx) { - auto matmul_dims = GetMatmulDims(ctx); + std::vector Transpose(const std::vector& x, + const std::vector& axis) { + size_t in_rank = x.size(); + size_t axis_size = axis.size(); - x_mem_ = CreateMemory(matmul_dims.x_dims, matmul_dims.x_strides, - ctx.Input("X")->data()); - y_mem_ = CreateMemory(matmul_dims.y_dims, matmul_dims.y_strides, - ctx.Input("Y")->data()); - out_mem_ = CreateMemory( - matmul_dims.out_dims, matmul_dims.out_strides, - ctx.Output("Out")->mutable_data(ctx.GetPlace())); - } + auto axis_set = std::set(axis.begin(), axis.end()); + PADDLE_ENFORCE_EQ(axis_set.size(), axis_size, + paddle::platform::errors::InvalidArgument( + "In an axis array, elements must be unique.")); - float ComputeOutputScale(const ExecutionContext& ctx) { - float scale_x = ctx.Attr("Scale_x"); - float scale_y = ctx.Attr("Scale_y"); - bool force_fp32_out = ctx.Attr("force_fp32_output"); - float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); - float alpha = ctx.Attr("alpha"); - return alpha * scale_out / (scale_x * scale_y); - } + PADDLE_ENFORCE_EQ(in_rank, axis_size, + paddle::platform::errors::InvalidArgument( + "The input dimension's size " + "should be equal to the axis's size. 
" + "But received dimension is %d, " + "axis's size is %d", + in_rank, axis_size)); - void CreatePrimitive(const ExecutionContext& ctx) { - dnnl::primitive_attr attr; - float scale_out = ComputeOutputScale(ctx); - if (scale_out != 1.0f) { - constexpr unsigned tensor_wide_scale = 0; - attr.set_output_scales(tensor_wide_scale, {scale_out}); - } + PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), axis_size, + paddle::platform::errors::InvalidArgument( + "Axis values must be ranging from 0 to (dims - 1).")); - auto matmul_d = dnnl::matmul::desc(x_mem_.get_desc(), y_mem_.get_desc(), - out_mem_.get_desc()); - auto matmul_pd = dnnl::matmul::primitive_desc(matmul_d, attr, engine_); - matmul_prim_ = dnnl::matmul(matmul_pd); + std::vector new_x(x.size()); + for (size_t i = 0; i < x.size(); i++) { + new_x[i] = x[axis[i]]; + } + return new_x; } - void Execute() { - dnnl::stream stream(engine_); - - void* x_ptr = x_mem_.get_data_handle(); - void* y_ptr = y_mem_.get_data_handle(); - void* out_ptr = out_mem_.get_data_handle(); - for (uint16_t i = 0; i < batch_size_; i++) { - x_mem_.set_data_handle(x_ptr); - y_mem_.set_data_handle(y_ptr); - out_mem_.set_data_handle(out_ptr); - matmul_prim_.execute(stream, { - {MKLDNN_ARG_SRC, x_mem_}, - {MKLDNN_ARG_WEIGHTS, y_mem_}, - {MKLDNN_ARG_DST, out_mem_}, - }); - x_ptr = static_cast(x_ptr) + x_offset_; - y_ptr = static_cast(y_ptr) + y_offset_; - out_ptr = static_cast(out_ptr) + out_offset_; + void CorrectStridesWhenFloatOutputFused(const ExecutionContext& ctx, + const memory::dim N, memory::dim b, + memory::dims* out_strides) const { + if (!IsInt8() && !IsBfloat16() && IsOutputFused(ctx)) { + *out_strides = {N, b * N, 1}; } - stream.wait(); } - void SetOutputFormat(const ExecutionContext& ctx) { - using paddle::platform::MKLDNNFormatForSize; - auto* out = ctx.Output("Out"); - auto format = - MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw); - out->set_format(format); - out->set_layout(DataLayout::kMKLDNN); - } + uint16_t GetBatchSize(void) const { return batch_size_; } - void UpdateDataPointers(const ExecutionContext& ctx) { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - x_mem_.set_data_handle(to_void_cast(x->data())); - y_mem_.set_data_handle(to_void_cast(y->data())); - out_mem_.set_data_handle(out->mutable_data(ctx.GetPlace())); + std::tuple GetOffsets() const { + return std::make_tuple(x_offset_, y_offset_, out_offset_); } - // If initialized, x memory should've been already initialized - bool IsInitialized() { return initialized_; } - - void SetInitialized() { initialized_ = true; } - private: - struct memory_offsets { - size_t x_offset; - size_t y_offset; - size_t out_offset; - }; - - dnnl::engine engine_; - dnnl::memory x_mem_; - dnnl::memory y_mem_; - dnnl::memory out_mem_; - dnnl::matmul matmul_prim_; + MatMulDims matmul_dims_; uint32_t x_offset_; uint32_t y_offset_; uint32_t out_offset_; uint16_t batch_size_; - bool initialized_ = false; }; -template -static std::shared_ptr> GetPrimitiveFactory( - const ExecutionContext& ctx) { - const auto& out_name = ctx.OutputName("Out"); - const auto& dev_ctx = ctx.template device_context(); - const auto batch_size = ctx.Input("X")->dims()[0]; - std::string key = paddle::platform::CreateKey(dev_ctx, batch_size, out_name); - key = paddle::platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - auto factory = - std::static_pointer_cast>(dev_ctx.GetBlob(key)); - if (factory == nullptr) { - factory = std::make_shared>(); - 
dev_ctx.SetBlob(key, factory); +/** + * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. + * + * The shape would be [BatchSize, H, W] or [H, W]. + * If transposed, `H,W` will be swapped. + */ +static void ReshapeTensorToMatrixSequence( + Tensor* x, const paddle::operators::math::MatDescriptor& descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} + +/** + * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor + * Out = matmul(x, y) + * + * This method will first calculate X,Y matrix sequence, and then calculate + * the out shape. + * + * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] + * The out = [BatchSize, H1, W2] + * + * If there is no batch size in `X` and `Y`, the out will be [H1, W2] + * If any of `X` and `Y` has batch size BatchSize, the out will have the + * BatchSize. + */ +static void ReshapeXYOutToMatrixSequence(Tensor* x, Tensor* y, Tensor* out, + bool trans_x, bool trans_y) { + auto x_dim = RowMatrixDimsFromVector(x->dims()); + auto y_dim = ColumnMatrixDimsFromVector(y->dims()); + auto mat_dim_x = + paddle::operators::math::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = + paddle::operators::math::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, mat_dim_y.width_}); } - return factory; + ReshapeTensorToMatrixSequence(x, mat_dim_x); + ReshapeTensorToMatrixSequence(y, mat_dim_y); } -// Choose appropriate primitive factory implementation based on inferred +// Choose appropriate Handler instances based on inferred // output type (uint8, int8 or float). template static void ExecuteMatMul(const ExecutionContext& ctx) { @@ -507,31 +461,41 @@ static void ExecuteMatMul(const ExecutionContext& ctx) { constexpr bool is_bfloat16 = IsBfloat16(); const bool force_fp32_output = ctx.Attr("force_fp32_output"); constexpr bool fuse_relu = false; // TODO(intel): Enable eltwise fuses + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; + const auto& dev_ctx = + ctx.template device_context(); + if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { - GetPrimitiveFactory(ctx)->CreateAndExecute(ctx); + MatMulMKLDNNHandler(dev_ctx.GetEngine(), ctx, alpha) + .Execute(x, y, out); } else if (is_bfloat16) { - GetPrimitiveFactory(ctx) - ->CreateAndExecute(ctx); + MatMulMKLDNNHandler(dev_ctx.GetEngine(), + ctx, alpha) + .Execute(x, y, out); } else if (fuse_relu) { - GetPrimitiveFactory(ctx)->CreateAndExecute(ctx); + MatMulMKLDNNHandler(dev_ctx.GetEngine(), ctx, alpha) + .Execute(x, y, out); } else { - GetPrimitiveFactory(ctx)->CreateAndExecute(ctx); + MatMulMKLDNNHandler(dev_ctx.GetEngine(), ctx, alpha) + .Execute(x, y, out); } } template -class DNNLMatMulKernel : public paddle::framework::OpKernel { +class MatMulMKLDNNKernel : public paddle::framework::OpKernel { public: void Compute(const ExecutionContext& ctx) const override { if (ctx.HasAttr("head_number")) { PADDLE_ENFORCE_EQ( ctx.Attr("head_number"), 1, paddle::platform::errors::Unimplemented( - "DNNL matmul doesn't support multiple heads. 
Expected " + "oneDNN matmul doesn't support multiple heads. Expected " "head_number=1. But received `head_number` is %d", ctx.Attr("head_number"))); } - MKLDNNDeviceContext::tls().log_lib_version(); ExecuteMatMul(ctx); } }; @@ -547,7 +511,7 @@ void MatMulGradMKLDNNKernel::Compute(const ExecutionContext& ctx) const { PADDLE_ENFORCE_EQ( ctx.Attr("head_number"), 1, platform::errors::Unimplemented( - "DNNL matmul doesn't support multiple heads. Expected " + "oneDNN matmul doesn't support multiple heads. Expected " "head_number=1. But received `head_number` is %d", ctx.Attr("head_number"))); } @@ -577,8 +541,9 @@ void MatMulGradMKLDNNKernel::ExecuteMatMulGrad( float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; - MatMulMKLDNNHandler handler(engine, ctx.GetPlace(), &x_combined, trans_x, - &y_combined, trans_y, out, alpha); + MatMulMKLDNNHandler handler(engine, ctx.GetPlace(), &x_combined, + trans_x, &y_combined, trans_y, out, + alpha); const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); @@ -679,9 +644,9 @@ template class MatMulGradMKLDNNKernel; namespace ops = paddle::operators; REGISTER_OP_KERNEL(matmul, MKLDNN, ::paddle::platform::CPUPlace, - DNNLMatMulKernel, - DNNLMatMulKernel, - DNNLMatMulKernel, DNNLMatMulKernel); + MatMulMKLDNNKernel, + MatMulMKLDNNKernel, + MatMulMKLDNNKernel, MatMulMKLDNNKernel); REGISTER_OP_KERNEL(matmul_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::MatMulGradMKLDNNKernel, diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 244430e69f234..d5e428bd805fb 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -22,6 +22,25 @@ using paddle::framework::LoDTensor; using platform::to_void_cast; using platform::GetMKLDNNFormat; +static std::vector extract_shape( + const std::vector& list_new_shape_tensor) { + std::vector vec_new_shape; + vec_new_shape.reserve(list_new_shape_tensor.size()); + + for (const auto& tensor : list_new_shape_tensor) { + PADDLE_ENFORCE_EQ( + tensor->dims(), framework::make_ddim({1}), + platform::errors::InvalidArgument( + "If the element type of 'shape' in ReshapeOp is Tensor, " + "the element's shape must be [1]. But received the element's shape " + "is [%s]", + tensor->dims())); + vec_new_shape.emplace_back(*tensor->data()); + } + + return vec_new_shape; +} + template class ReshapeMKLDNNKernel : public framework::OpKernel { public: @@ -59,7 +78,11 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { } if (ctx.Type().find("reshape") != std::string::npos) { - if (ctx.HasInput("Shape")) { + auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); + if (list_new_shape_tensor.size() > 0) { + auto new_shape = extract_shape(list_new_shape_tensor); + out_dims = ValidateShape(new_shape, x_dims); + } else if (ctx.HasInput("Shape")) { auto* shape_tensor = ctx.Input("Shape"); auto* shape_data = shape_tensor->data(); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 5d1682889535f..14291f8458430 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -166,22 +166,26 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { "scale_x to be used for int8 mul input data x. scale_x has the" "same purpose as scale_in in OPs that support quantization." 
"Only to be used with MKL-DNN INT8") - .SetDefault(1.0f); + .SetDefault(1.0f) + .AsExtra(); AddAttr>( "scale_y", "scale_y to be used for int8 mul input data y. scale_y has the" "same purpose as scale_weights in OPs that support quantization." "Only to be used with MKL-DNN INT8") - .SetDefault({1.0f}); + .SetDefault({1.0f}) + .AsExtra(); AddAttr("scale_out", "scale_out to be used for int8 output data." "Only used with MKL-DNN INT8") - .SetDefault(1.0f); + .SetDefault(1.0f) + .AsExtra(); AddAttr( "force_fp32_output", "(bool, default false) Force quantize kernel output FP32, only " "used in quantized MKL-DNN.") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( Mul Operator. diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 1fc51e76e2540..5880141520fa1 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -88,7 +88,11 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { op->SetAttrMap(this->Attrs()); op->SetInput("X", this->Input("X")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); +#ifndef PADDLE_WITH_ASCEND_CL op->SetInput("Norm", this->Output("Norm")); +#else + op->SetInput("Out", this->Output("Out")); +#endif op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); } }; diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc index 17b0fca2bb097..ca2eac06c7247 100644 --- a/paddle/fluid/operators/norm_op_npu.cc +++ b/paddle/fluid/operators/norm_op_npu.cc @@ -15,24 +15,26 @@ limitations under the License. */ namespace paddle { namespace operators { +using DDim = framework::DDim; +using Tensor = framework::Tensor; + +void CheckAxis(int axis, int rank) { + // check the axis is in [-rank, rank-1] + if (axis <= rank - 1 && axis >= -rank) return; + PADDLE_THROW(platform::errors::InvalidArgument( + "axis in norm operator must between (%d) and (%d)" + "but got (%d).", + -rank, rank - 1, axis)); +} + template class NormNPUKernel : public framework::OpKernel { - private: - void CheckAxis(int axis, int rank) const { - // check the axis is in [-rank, rank-1] - if (axis <= rank - 1 && axis >= -rank) return; - PADDLE_THROW(platform::errors::InvalidArgument( - "axis in norm operator must between (%d) and (%d)" - "but got (%d).", - -rank, rank - 1, axis)); - } - public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext &ctx) const override { VLOG(4) << "Launch Norm Op Kernel on NPU." 
<< std::endl; - auto* in_x = ctx.Input("X"); - auto* out_y = ctx.Output("Out"); - auto* out_norm = ctx.Output("Norm"); + auto *in_x = ctx.Input("X"); + auto *out_y = ctx.Output("Out"); + auto *out_norm = ctx.Output("Norm"); out_y->mutable_data(ctx.GetPlace()); out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); @@ -46,7 +48,7 @@ class NormNPUKernel : public framework::OpKernel { attr_input_norm["p"] = 2; attr_input_norm["keepdim"] = true; attr_input_norm["epsilon"] = eps; - const auto& runner = + const auto &runner = NpuOpRunner("LpNorm", {*in_x}, {*out_norm}, attr_input_norm); auto stream = ctx.template device_context() @@ -56,12 +58,48 @@ class NormNPUKernel : public framework::OpKernel { } }; +template +class NormGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + float epsilon = ctx.Attr("epsilon"); + int axis = ctx.Attr("axis"); + + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Out"); + auto *dy = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + + auto xdim = x->dims(); + CheckAxis(axis, xdim.size()); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + framework::NPUAttributeMap attr_input_norm; + attr_input_norm["dim"] = std::vector({axis}); + attr_input_norm["eps"] = epsilon; + const auto &runner = + NpuOpRunner("L2NormalizeGrad", {*x, *y, *dy}, {*dx}, attr_input_norm); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; + REGISTER_OP_NPU_KERNEL( norm, ops::NormNPUKernel, ops::NormNPUKernel) + +REGISTER_OP_NPU_KERNEL( + norm_grad, ops::NormGradNPUKernel, + ops::NormGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 5e7244f4390d8..4b0179953030a 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -110,7 +110,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -120,16 +121,19 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("AnyLayout"); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddAttr( "mkldnn_data_type", "(string, default \"float32\"). Data type of mkldnn kernel") .SetDefault("float32") - .InEnum({"float32", "bfloat16"}); + .InEnum({"float32", "bfloat16"}) + .AsExtra(); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") - .SetDefault(false); + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( Softmax Operator. 
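Note on the norm_op_npu.cc changes above: the new NormGradNPUKernel simply dispatches to the CANN "L2NormalizeGrad" runner, so the gradient math is not spelled out in this patch. As a reading aid only, here is a minimal NumPy sketch; the forward mirrors the l2_norm reference helper that the norm unit tests further down in this patch import from test_norm_op, while the backward is a hand-derived approximation (it ignores the epsilon term) and is not taken from the kernel itself:

import numpy as np

def l2_norm_forward(x, axis, epsilon=1e-10):
    # Norm = sqrt(sum(x**2, axis)) + epsilon; Out = x / Norm (broadcast along axis)
    r = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True)) + epsilon
    return x / r, r

def l2_norm_backward(x, dy, axis, epsilon=1e-10):
    # Assumed gradient of y = x / ||x||: dx = dy / r - x * sum(x * dy, axis) / r**3
    r = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True)) + epsilon
    return dy / r - x * np.sum(x * dy, axis=axis, keepdims=True) / r**3

The forward pair (y, r) corresponds to the "Out" and "Norm" outputs produced by the NPU kernel's LpNorm/Div runners.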
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index cc9f2c75989db..a765f344daf8a 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -140,6 +140,50 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, }); } +void NCCLCommContext::CreateNCCLCommMultiTrainer( + const std::vector& dev_ids, ncclUniqueId* nccl_id, int ntrainers, + int train_id, int ring_id) { + PADDLE_ENFORCE_GT( + dev_ids.size(), 0, + paddle::platform::errors::InvalidArgument( + "dev ids = [%d], it should greater than 0.", dev_ids.size())); + const int kDevices = dev_ids.size(); + VLOG(3) << "Begin CreateNCCLCommMultiTrainer. device number: " << kDevices + << ", ntrainers: " << ntrainers << ", train_id: " << train_id + << ", rind_id: " << ring_id; + ncclComm_t comms[kDevices]; + { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); + for (int i = 0; i < kDevices; i++) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipSetDevice(i)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(i)); +#endif + platform::dynload::ncclCommInitRank(comms + i, kDevices * ntrainers, + *nccl_id, train_id * kDevices + i); + VLOG(3) << "ncclCommInitRank: " << i; + } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); + VLOG(3) << "nccl group end seccessss"; + } + PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "comm_map_ of ring_id: %s should be 0. %s is provided", + ring_id, comm_map_.count(ring_id))); + for (int i = 0; i < kDevices; ++i) { + AssignNCCLComm(comms[i], kDevices * ntrainers, train_id * kDevices + i, + dev_ids[i], ring_id); + VLOG(3) << "nccl communicator of train_id " << train_id * kDevices + i + << " in ring " << ring_id << " has been created on device " + << dev_ids[i]; + } + + std::call_once(once_flag_, []() { + std::atexit([]() { NCCLCommContext::Instance().ReleaseNCCLComms(); }); + }); +} + NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { std::unique_ptr dev_ctx( diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index b9be9dc8304e1..566121a08b880 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -77,6 +77,10 @@ class NCCLCommContext { void CreateAllNCCLComms(const std::vector& dev_ids, int ring_id = 0); + void CreateNCCLCommMultiTrainer(const std::vector& dev_ids, + ncclUniqueId* nccl_id, int nranks, int rank, + int ring_id); + // a latter comm with the same dev_id and the same ring_id // will override the former NCCLComm* AssignNCCLComm(ncclComm_t comm, int nranks, int rank, int dev_id, diff --git a/paddle/fluid/platform/device_event_cpu.h b/paddle/fluid/platform/device_event_cpu.h index b08323d7f1506..6e2bf4c7ad135 100644 --- a/paddle/fluid/platform/device_event_cpu.h +++ b/paddle/fluid/platform/device_event_cpu.h @@ -23,7 +23,8 @@ namespace platform { struct CPUDeviceEventWrapper { explicit CPUDeviceEventWrapper(const platform::Place& place, - unsigned int flag = 0) { + unsigned int flag = 0) + : status_(EventStatus::INITIALIZED) { PADDLE_ENFORCE_EQ( platform::is_cpu_place(place), true, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 7a32d8729fc6c..41cf0189d3d9d 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -309,6 +309,8 @@ void BindDataset(py::module *m) { 
&framework::Dataset::SetFleetSendSleepSeconds, py::call_guard()) .def("enable_pv_merge", &framework::Dataset::EnablePvMerge, + py::call_guard()) + .def("set_heter_ps", &framework::Dataset::SetHeterPs, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 729cc799b8342..a524b4291f627 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -948,7 +948,11 @@ function assert_file_diff_approvals() { function check_coverage() { - /bin/bash ${PADDLE_ROOT}/tools/coverage/paddle_coverage.sh + if [ ${WITH_COVERAGE:-ON} == "ON" ] ; then + /bin/bash ${PADDLE_ROOT}/tools/coverage/paddle_coverage.sh + else + echo "WARNING: check_coverage need to compile with WITH_COVERAGE=ON, but got WITH_COVERAGE=OFF" + fi } @@ -1170,13 +1174,13 @@ set -x fi if [ -a "$PADDLE_ROOT/added_ut" ];then added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$ - ctest -R "(${added_uts})" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$? - if [ "$added_ut_error" != 0 ];then - echo "========================================" - echo "Added UT should not exceed 15 seconds" - echo "========================================" - exit 8; - fi + #ctest -R "(${added_uts})" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$? + #if [ "$added_ut_error" != 0 ];then + # echo "========================================" + # echo "Added UT should not exceed 15 seconds" + # echo "========================================" + # exit 8; + #fi fi set +x EXIT_CODE=0; diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 8d20dd994475f..cf9d40d7b00c0 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -985,6 +985,13 @@ def get_shuffle_data_size(self, fleet=None): return global_data_size[0] return local_data_size[0] + def _set_heter_ps(self, enable_heter_ps=False): + """ + Set heter ps mode + user no need to call this function. + """ + self.dataset.set_heter_ps(enable_heter_ps) + class QueueDataset(DatasetBase): """ diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 102dcd436226c..83e7d0ae1e09b 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -390,7 +390,8 @@ def clear_grad(self): def item(self, *args): """ - Convert one element Tensor to a Python scalar. + Convert element at specific position in Tensor into Python scalars. If the position is not specified, the Tensor must be a + single-element Tensor. Args: *args(int): The input coordinates. If it's single int, the data in the corresponding order of flattened Tensor will be returned. @@ -426,8 +427,6 @@ def item(self, *args): print(x.item(2)) #3.3 print(x.item(0, 2)) #3.3 - x = paddle.to_tensor([1, 2]) - x.item() #ValueError: only one element tensor can be converted to Python scalar when no input coordinates. 
""" return self._getitem_from_offset(*args).item() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 49c262607498c..39cf3ebeb32a9 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -101,16 +101,17 @@ def init_worker(self): # barrier_all for init_worker self._role_maker._barrier_all() # prepare for client to client communication - if not self._opt_info["use_ps_gpu"]: - if self._role_maker.is_worker(): - info = self._fleet_ptr.get_clients_info() - all_info = self._role_maker._worker_gather(info[0]) - self._fleet_ptr.gather_clients(all_info) - self._fleet_ptr.set_client2client_config( - self._client2client_request_timeout_ms, - self._client2client_connect_timeout_ms, - self._client2client_max_retry) - self._fleet_ptr.create_client2client_connection() + if self._role_maker.is_worker(): + info = self._fleet_ptr.get_clients_info() + print("IIIIFO: {}".format(info)) + all_info = self._role_maker._worker_gather(info[0]) + print("ALL info: {}".format(all_info)) + self._fleet_ptr.gather_clients(all_info) + self._fleet_ptr.set_client2client_config( + self._client2client_request_timeout_ms, + self._client2client_connect_timeout_ms, + self._client2client_max_retry) + self._fleet_ptr.create_client2client_connection() # barrier for init model self._role_maker._barrier_worker() if self._role_maker.is_first_worker(): @@ -1120,14 +1121,14 @@ def minimize(self, fleet._main_programs = programs fleet._scopes = scopes if opt_info["use_ps_gpu"]: - from paddle.fluid.transpiler.collective import SingleProcessMultiThread + from paddle.fluid.transpiler.collective import MultiThread # check start program env = self.get_dist_env() if not isinstance(losses, list): startup_programs = [startup_programs] for i in range(0, len(startup_programs)): - t = SingleProcessMultiThread() + t = MultiThread() start_program = startup_programs[i] main_program = programs[i] t.transpile( diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 1962a4ed6a6ee..bf046b89322bb 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -84,7 +84,9 @@ def accuracy(input, label, k=1, correct=None, total=None): if total is None: total = _varbase_creator(dtype="int32") - topk_out, topk_indices = nn.topk(input, k=k) + _k = k.numpy().item(0) if isinstance(k, Variable) else k + topk_out, topk_indices = _C_ops.top_k_v2(input, 'k', _k, 'sorted', + False) _acc, _, _ = _C_ops.accuracy(topk_out, topk_indices, label, correct, total) return _acc @@ -92,7 +94,20 @@ def accuracy(input, label, k=1, correct=None, total=None): helper = LayerHelper("accuracy", **locals()) check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], 'accuracy') - topk_out, topk_indices = nn.topk(input, k=k) + topk_out = helper.create_variable_for_type_inference(dtype=input.dtype) + topk_indices = helper.create_variable_for_type_inference(dtype="int64") + inputs = {"X": [input]} + if isinstance(k, Variable): + inputs['K'] = [k] + else: + attrs = {'k': k} + attrs['sorted'] = False + helper.append_op( + type="top_k_v2", + inputs=inputs, + attrs=attrs, + outputs={"Out": [topk_out], + "Indices": [topk_indices]}) acc_out = helper.create_variable_for_type_inference(dtype="float32") if correct is None: correct = helper.create_variable_for_type_inference(dtype="int32") 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4216384b6f8b2..12a7ca2e44115 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1816,6 +1816,10 @@ def conv3d(input, "Attr(data_format): %s." % str(data_format)) channel_last = (data_format == "NDHWC") + if len(input.shape) != 5: + raise ValueError( + "Input should be 5D tensor, but received input with the shape of {}". + format(input.shape)) num_channels = input.shape[4] if channel_last else input.shape[1] if num_channels < 0: raise ValueError( @@ -1824,6 +1828,10 @@ def conv3d(input, if groups is None: num_filter_channels = num_channels + elif groups <= 0: + raise ValueError( + "the groups of conv3d should be greater than 0. Received groups: {}". + format(groups)) else: if num_channels % groups != 0: raise ValueError( @@ -3398,6 +3406,7 @@ def data_norm(input, } attrs = { "epsilon": epsilon, + "data_layout": data_layout, "sync_stats": sync_stats, "summary_decay_rate": summary_decay_rate, } @@ -4243,10 +4252,15 @@ def conv3d_transpose(input, raise ValueError( "Param(data_format) of Op(fluid.layers.conv3d_transpose) got wrong value: received " + data_format + " but only NCDHW or NDHWC supported.") + l_type = "conv3d_transpose" helper = LayerHelper(l_type, **locals()) if not isinstance(input, Variable): raise TypeError("Input of conv3d_transpose must be Variable") + if len(input.shape) != 5: + raise ValueError( + "Input should be 5D tensor, but received input with the shape of {}". + format(input.shape)) input_channel = input.shape[1] if data_format == 'NCDHW' else input.shape[ -1] @@ -4338,6 +4352,15 @@ def is_list_or_tuple(ele): raise ValueError("output_size should be int, list[int] or tuple[int]") groups = 1 if groups is None else groups + if groups <= 0: + raise ValueError( + "the groups of conv3d_transpose should be greater than 0. Received groups: {}". + format(groups)) + if num_filters % groups != 0: + raise ValueError("Attr(num_filters) must be divisible by groups," + "Received: Attr(num_filters) is {}, the groups is {}". 
+ format(num_filters, groups)) + filter_shape = [input_channel, num_filters // groups] + filter_size img_filter = helper.create_parameter( dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index 250ffcc48c302..9ac1c26c63483 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -10,6 +10,10 @@ if(NOT ON_INFER) set_tests_properties(test_lac PROPERTIES TIMEOUT 120) endif() +if(WIN32 AND NOT WITH_GPU) + list(REMOVE_ITEM TEST_OPS test_resnet_amp) # disable on Windows CPU CI for timeout +endif() + foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach(TEST_OP) @@ -32,6 +36,3 @@ if(NOT WIN32) set_tests_properties(test_tsm PROPERTIES TIMEOUT 900) #set_tests_properties(test_resnet PROPERTIES TIMEOUT 120) endif() -#if(WIN32) -# set_tests_properties(test_resnet PROPERTIES TIMEOUT 300) -#endif() diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt index c1ca62629e60a..7692f8befdf58 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt @@ -1,8 +1,6 @@ file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") -list(REMOVE_ITEM TEST_INTERP_CASES test_standalone_executor) - foreach(target ${TEST_INTERP_CASES}) py_test_modules(${target} MODULES ${target}) endforeach() diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index b59fcd8d02e2f..1f971ae1b2508 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -25,10 +25,12 @@ class LinearTestCase(unittest.TestCase): def setUp(self): - self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() + self.place = core.Place() + self.place.set_place(place) - def test_interp_base(self): + def build_program(self): a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') b = paddle.ones([2, 2]) * 2 t = paddle.static.nn.fc(a, 2) @@ -36,11 +38,15 @@ def test_interp_base(self): main_program = paddle.fluid.default_main_program() startup_program = paddle.fluid.default_startup_program() - p = core.Place() - p.set_place(self.place) - standaloneexecutor = StandaloneExecutor(p, startup_program.desc, - main_program.desc, core.Scope()) + return startup_program, main_program, c + + return standaloneexecutor, c + + def test_interp_base(self): + startup_program, main_program, c = self.build_program() + standaloneexecutor = StandaloneExecutor( + self.place, startup_program.desc, main_program.desc, core.Scope()) out = standaloneexecutor.run({ "a": np.ones( [2, 2], dtype="float32") * 2 @@ -55,24 +61,35 @@ def test_interp_base(self): out = standaloneexecutor.run({ "a": np.ones( [2, 2], dtype="float32") * i - }, [a.name, c.name]) + }, ['a', c.name]) + def test_dry_run(self): + startup_program, main_program, c = self.build_program() + standaloneexecutor = StandaloneExecutor( + 
self.place, startup_program.desc, main_program.desc, core.Scope()) # test for cost_info cost_info = standaloneexecutor.dry_run({ "a": np.ones( - [2, 2], dtype="float32") * i + [2, 2], dtype="float32") }) self.check_cost_info(cost_info) def check_cost_info(self, cost_info): + IS_WINDOWS = sys.platform.startswith('win') + if core.is_compiled_with_cuda(): - # self.assertEqual(cost_info.host_memory_bytes(), 16) - self.assertGreater(cost_info.device_memory_bytes(), 0) + # input `a` is on CPU, 16 bytes + self.assertEqual(cost_info.host_memory_bytes(), 16) + # # w,bias,b, out, memory block is at least 256 bytes on Linux + gt = 16 * 4 if IS_WINDOWS else 256 * 4 + self.assertGreater(cost_info.device_memory_bytes(), gt) self.assertGreaterEqual(cost_info.device_total_memory_bytes(), cost_info.device_memory_bytes()) else: - self.assertGreater(cost_info.host_memory_bytes(), 0) + # x(16 bytes), w(16 bytes), bias(8 bytes), b(16 bytes), out(16 bytes) + self.assertGreaterEqual(cost_info.host_memory_bytes(), 72) self.assertEqual(cost_info.device_memory_bytes(), 0) + self.assertGreaterEqual(cost_info.device_total_memory_bytes(), 0) class MultiStreamModelTestCase(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py index a28827207ee83..78e5af3311b99 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py @@ -72,9 +72,9 @@ def init_data(self): class TestReshape2OneDNNOpDimInfer2(TestReshape2OneDNNOp): def init_data(self): - self.ori_shape = (10, 2, 6) - self.new_shape = (10, 0, 3, -1) - self.infered_shape = (10, 2, 3, -1) + self.ori_shape = (6, 20) + self.new_shape = (0, -1, 20) + self.actual_shape = (2, 3, 20) def set_additional_inputs(self): self.inputs["Shape"] = np.array(self.actual_shape, dtype="int32") @@ -85,11 +85,6 @@ def set_outputs(self): 'XShape': np.random.random(self.ori_shape).astype("float32") } - def init_data1(self): - self.ori_shape = (6, 20) - self.new_shape = (0, -1, 20) - self.actual_shape = (2, 3, 20) - class TestReshape2OneDNNOp_attr_OnlyShape(TestReshape2OneDNNOp): def set_additional_inputs(self): @@ -119,6 +114,34 @@ def init_data(self): self.shape = (5, -1, -1) +class TestReshape2OneDNNOpDimInfer1_attr_ShapeTensor(TestReshape2OneDNNOp): + def set_additional_inputs(self): + shape_tensor = [] + for index, ele in enumerate(self.new_shape): + shape_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs["ShapeTensor"] = shape_tensor + + def init_data(self): + self.ori_shape = (5, 20) + self.new_shape = (5, -1, 10) + self.infered_shape = (5, -1, 10) + self.shape = (5, -1, -1) + + +class TestReshape2OneDNNOpDimInfer1_attr_ShapeTensorAndShape( + TestReshape2OneDNNOpDimInfer1_attr_ShapeTensor): + def set_additional_inputs(self): + shape_tensor = [] + for index, ele in enumerate(self.new_shape): + shape_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs["Shape"] = np.array((1, 2, 3, 4), dtype="int32") + self.inputs["ShapeTensor"] = shape_tensor + + class TestReshapeOneDNNOp(TestReshape2OneDNNOp): def set_op_type(self): self.op_type = "reshape" diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index 76c81d2d683a0..1b8b13a0d27ea 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ 
b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -421,7 +421,7 @@ def compute(x_np, is_test, trainable_statistics): x = np.random.randn(*shape).astype("float32") y1 = compute(x, False, False) y2 = compute(x, True, True) - self.assertTrue(np.allclose(y1, y2)) + self.assertTrue(np.allclose(y1, y2, atol=1e-5)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py index e8b680d1ddc1b..f6baefec7f29e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py @@ -22,9 +22,10 @@ import paddle.fluid as fluid from paddle.fluid import core import paddle.nn.functional as F + from test_log_softmax import ref_log_softmax, ref_log_softmax_grad + paddle.enable_static() -np.random.seed(10) class TestLogSoftmaxNPUOp(OpTest): @@ -55,10 +56,16 @@ def set_dtype(self): pass def test_check_output(self): - self.check_output_with_place(self.place) + if self.dtype == np.float16: + self.check_output_with_place(self.place, atol=1e-2) + else: + self.check_output_with_place(self.place) def test_check_grad(self): - pass + if self.dtype == np.float16: + return + self.check_grad_with_place( + self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) def test_class(op_type, typename): @@ -88,8 +95,73 @@ def set_dtype(self): globals()[cls_name] = TestLogSoftmaxAxis -for _typename in {'float32'}: +for _typename in {np.float32, np.float16}: test_class("logsoftmax", _typename) test_class2("logsoftmax", _typename) + + +class TestNNLogSoftmaxAPI(unittest.TestCase): + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1., 1., self.x_shape).astype(np.float32) + self.place = paddle.NPUPlace(0) \ + if paddle.fluid.core.is_compiled_with_npu() \ + else paddle.CPUPlace() + + def check_api(self, axis=-1): + ref_out = np.apply_along_axis(ref_log_softmax, axis, self.x) + + logsoftmax = paddle.nn.LogSoftmax(axis) + # test static api + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='x', shape=self.x_shape) + y = logsoftmax(x) + exe = paddle.static.Executor(self.place) + out = exe.run(feed={'x': self.x}, fetch_list=[y]) + self.assertTrue(np.allclose(out[0], ref_out)) + + # test dygrapg api + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x) + y = logsoftmax(x) + self.assertTrue(np.allclose(y.numpy(), ref_out)) + paddle.enable_static() + + def test_check_api(self): + for axis in [-1, 1]: + self.check_api(axis) + + +class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase): + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + self.place = paddle.NPUPlace(0) \ + if paddle.fluid.core.is_compiled_with_npu() \ + else paddle.CPUPlace() + + def check_api(self, axis=-1, dtype=None): + x = self.x.copy() + if dtype is not None: + x = x.astype(dtype) + ref_out = np.apply_along_axis(ref_log_softmax, axis, x) + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='x', shape=self.x_shape) + y = F.log_softmax(x, axis, dtype) + exe = paddle.static.Executor(self.place) + out = exe.run(feed={'x': self.x}, fetch_list=[y]) + self.assertTrue(np.allclose(out[0], ref_out)) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x) + y = F.log_softmax(x, axis, dtype) + self.assertTrue(np.allclose(y.numpy(), ref_out), True) + 
paddle.enable_static() + + def test_check_api(self): + for axis in [-1, 1]: + self.check_api(axis) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py index 2c946bb893127..2c41f09ff5148 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py @@ -20,26 +20,18 @@ import numpy as np import paddle import paddle.fluid as fluid -from op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.test_norm_op import l2_norm -SEED = 2021 - -def l2_norm(x, axis, epsilon): - x2 = x**2 - s = np.sum(x2, axis=axis, keepdims=True) - r = np.sqrt(s) + epsilon - y = x / np.broadcast_to(r, x.shape) - return y, r - - -class TestNorm(OpTest): +class TestNPUNormOp(OpTest): def setUp(self): paddle.enable_static() self.set_npu() self.place = paddle.NPUPlace(0) self.op_type = "norm" self.init_dtype() + self.init_test_case() x = np.random.random(self.shape).astype(self.dtype) y, norm = l2_norm(x, self.axis, self.epsilon) @@ -52,6 +44,8 @@ def set_npu(self): def init_dtype(self): self.dtype = np.float32 + + def init_test_case(self): self.axis = 1 self.epsilon = 1e-10 self.shape = (2, 3, 4, 5) @@ -59,29 +53,50 @@ def init_dtype(self): def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + if self.dtype == np.float16: + return -class TestNormOp2(TestNorm): + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + + +class TestNPUNormOp2(TestNPUNormOp): def init_test_case(self): self.shape = [5, 3, 9, 7] self.axis = 0 self.epsilon = 1e-8 - self.dtype = np.float32 -class TestNormOp3(TestNorm): +class TestNPUNormOp3(TestNPUNormOp): def init_test_case(self): self.shape = [5, 3, 2, 7] self.axis = -1 self.epsilon = 1e-8 - self.dtype = np.float32 -class TestNormOp4(TestNorm): +@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " + + "however it is desirable to cover the forward pass") +class TestNPUNormOp4(TestNPUNormOp): def init_test_case(self): self.shape = [128, 1024, 14, 14] self.axis = 2 self.epsilon = 1e-8 - self.dtype = np.float32 + + def test_check_grad(self): + pass + + +@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " + + "however it is desirable to cover the forward pass") +class TestNPUNormOp5(TestNPUNormOp): + def init_test_case(self): + self.shape = [2048, 2048] + self.axis = 1 + self.epsilon = 1e-8 + + def test_check_grad(self): + pass class API_NormTest(unittest.TestCase): @@ -96,13 +111,15 @@ def test_norm_x_type(): self.assertRaises(TypeError, test_norm_x_type) -class TestNormFP16(TestNorm): +class TestNPUNormOpFP16(TestNPUNormOp): def set_npu(self): self.__class__.use_npu = True self.__class__.no_need_check_grad = True def init_dtype(self): self.dtype = np.float16 + + def init_test_case(self): self.axis = -1 self.epsilon = 1e-10 self.shape = (2, 3, 100) diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index 00cf7d5e9877b..10ab76e4bfb15 100755 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -78,6 +78,42 @@ def test_errors(self): paddle.metric.accuracy(input=x3, label=label) +class TestAccuracyAPI1(unittest.TestCase): + def setUp(self): + 
self.predictions = paddle.static.data( + shape=[2, 5], name="predictions", dtype="float32") + self.label = paddle.static.data( + shape=[2, 1], name="labels", dtype="int64") + self.result = paddle.static.accuracy( + input=self.predictions, label=self.label, k=1) + self.input_predictions = np.array( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32") + self.input_labels = np.array([[2], [0]], dtype="int64") + self.expect_value = np.array([0.5], dtype='float32') + + def test_api(self): + exe = paddle.static.Executor() + result, = exe.run(feed={ + "predictions": self.input_predictions, + 'labels': self.input_labels + }, + fetch_list=[self.result.name]) + self.assertEqual((result == self.expect_value).all(), True) + + +class TestAccuracyAPI2(unittest.TestCase): + def test_api(self): + with fluid.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype='float32') + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.static.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype='float32') + self.assertEqual((result.numpy() == expect_value).all(), True) + + class TestAccuracyAPI(unittest.TestCase): def test_api(self): with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py index 13624d189f72b..45a23231945ec 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py @@ -111,7 +111,7 @@ def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param): class TestModulatedDeformableConvOp(OpTest): def setUp(self): self.op_type = "deformable_conv" - self.dtype = np.float32 + self.init_type() self.init_group() self.init_dilation() self.init_test_case() @@ -183,6 +183,9 @@ def init_dilation(self): def init_group(self): self.groups = 1 + def init_type(self): + self.dtype = np.float32 + class TestWithStride(TestModulatedDeformableConvOp): def init_test_case(self): @@ -258,6 +261,32 @@ def init_group(self): self.groups = 2 +class TestWithDouble(TestModulatedDeformableConvOp): + def init_type(self): + self.dtype = np.float64 + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 6, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = 2 * self.deformable_groups * self.filter_size[ + 2] * self.filter_size[3] + mask_c = self.deformable_groups * self.filter_size[ + 2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], offset_c, self.input_size[2], self.input_size[3] + ] + self.mask_size = [ + self.input_size[0], mask_c, self.input_size[2], self.input_size[3] + ] + + class TestModulatedDeformableConvInvalidInput(unittest.TestCase): def test_error(self): def test_invalid_input(): diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index 769f05b0fcd59..e8b18d601afae 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -108,7 +108,7 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param): class TestModulatedDeformableConvOp(OpTest): def 
setUp(self): self.op_type = "deformable_conv_v1" - self.dtype = np.float32 + self.init_type() self.init_group() self.init_dilation() self.init_test_case() @@ -177,6 +177,9 @@ def init_dilation(self): def init_group(self): self.groups = 1 + def init_type(self): + self.dtype = np.float32 + class TestWithStride(TestModulatedDeformableConvOp): def init_test_case(self): @@ -253,6 +256,11 @@ def init_group(self): self.groups = 2 +class TestWithDouble(TestModulatedDeformableConvOp): + def init_type(self): + self.dtype = np.float64 + + class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): def test_invalid_input(): diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv1d.py b/python/paddle/fluid/tests/unittests/test_functional_conv1d.py new file mode 100644 index 0000000000000..b803835d107d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_functional_conv1d.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +from paddle import fluid +import paddle.fluid.dygraph as dg +import paddle.fluid.initializer as I +import numpy as np +import unittest +from unittest import TestCase + + +class TestFunctionalConv1DError(TestCase): + def setUp(self): + self.input = [] + self.filter = [] + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 1 + self.data_format = "NCL" + + def dygraph_case(self): + with dg.guard(): + x = dg.to_variable(self.input, dtype=paddle.float32) + w = dg.to_variable(self.filter, dtype=paddle.float32) + b = None if self.bias is None else dg.to_variable( + self.bias, dtype=paddle.float32) + y = F.conv1d( + x, + w, + b, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format) + + def test_exception(self): + with self.assertRaises(ValueError): + self.dygraph_case() + + +class TestFunctionalConv1DErrorCase1(TestFunctionalConv1DError): + def setUp(self): + self.input = np.random.randn(1, 3, 3) + self.filter = np.random.randn(3, 3, 1) + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 0 + self.data_format = "NCL" + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py new file mode 100644 index 0000000000000..4284ab48827e0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +from paddle import fluid +import paddle.fluid.dygraph as dg +import paddle.fluid.initializer as I +import numpy as np +import unittest +from unittest import TestCase + + +class TestFunctionalConv1DError(TestCase): + def setUp(self): + self.input = [] + self.filter = [] + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 1 + self.data_format = "NCL" + + def dygraph_case(self): + with dg.guard(): + x = dg.to_variable(self.input, dtype=paddle.float32) + w = dg.to_variable(self.filter, dtype=paddle.float32) + b = None if self.bias is None else dg.to_variable( + self.bias, dtype=paddle.float32) + y = F.conv1d_transpose( + x, + w, + b, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format) + + def test_exception(self): + with self.assertRaises(ValueError): + self.dygraph_case() + + +class TestFunctionalConv1DErrorCase1(TestFunctionalConv1DError): + def setUp(self): + self.input = np.random.randn(1, 3, 3) + self.filter = np.random.randn(3, 3, 1) + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 0 + self.data_format = "NCL" + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py index 766e1bb1d34af..cec48724da2fe 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py @@ -457,5 +457,81 @@ def setUp(self): self.data_format = "NHCW" +class TestFunctionalConv2DErrorCase12(TestCase): + def setUp(self): + self.input = np.array([]) + self.filter = np.array([]) + self.num_filters = 0 + self.filter_size = 0 + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 1 + self.data_format = "NCHW" + + def static_graph_case(self): + main = fluid.Program() + start = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, start): + x = fluid.data("input", self.input.shape, dtype=paddle.float32) + y = fluid.layers.conv2d( + x, + self.num_filters, + self.filter_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + param_attr=I.NumpyArrayInitializer(self.filter), + bias_attr=False if self.bias is None else + I.NumpyArrayInitializer(self.bias), + act=None, + data_format=self.data_format) + exe = fluid.Executor() + exe.run(start) + out, = exe.run(main, feed={"input": self.input}, fetch_list=[y]) + return out + + def dygraph_case(self): + with dg.guard(): + x = dg.to_variable(self.input, dtype=paddle.float32) + w = dg.to_variable(self.filter, dtype=paddle.float32) + b = None if self.bias is None else dg.to_variable( + self.bias, dtype=paddle.float32) + y = F.conv2d( + x, + w, + b, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format) + + def test_dygraph_exception(self): + with self.assertRaises(ValueError): + 
self.dygraph_case() + + def test_static_exception(self): + with self.assertRaises(ValueError): + self.static_graph_case() + + +class TestFunctionalConv2DErrorCase13(TestFunctionalConv2DErrorCase12): + def setUp(self): + self.input = np.random.randn(1, 3, 3, 3) + self.filter = np.random.randn(3, 3, 1, 1) + self.num_filters = 3 + self.filter_size = 1 + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 0 + self.data_format = "NCHW" + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py index e3b821a07bffd..f25a15106c491 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py @@ -463,5 +463,81 @@ def setUp(self): self.data_format = "NCHW" +class TestFunctionalConv2DErrorCase10(TestCase): + def setUp(self): + self.input = np.array([]) + self.filter = np.array([]) + self.num_filters = 0 + self.filter_size = 0 + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 1 + self.data_format = "NCHW" + + def static_graph_case(self): + main = fluid.Program() + start = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, start): + x = fluid.data("input", self.input.shape, dtype=paddle.float32) + y = fluid.layers.conv2d( + x, + self.num_filters, + self.filter_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + param_attr=I.NumpyArrayInitializer(self.filter), + bias_attr=False if self.bias is None else + I.NumpyArrayInitializer(self.bias), + act=None, + data_format=self.data_format) + exe = fluid.Executor() + exe.run(start) + out, = exe.run(main, feed={"input": self.input}, fetch_list=[y]) + return out + + def dygraph_case(self): + with dg.guard(): + x = dg.to_variable(self.input, dtype=paddle.float32) + w = dg.to_variable(self.filter, dtype=paddle.float32) + b = None if self.bias is None else dg.to_variable( + self.bias, dtype=paddle.float32) + y = F.conv2d_transpose( + x, + w, + b, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format) + + def test_dygraph_exception(self): + with self.assertRaises(ValueError): + self.dygraph_case() + + def test_static_exception(self): + with self.assertRaises(ValueError): + self.static_graph_case() + + +class TestFunctionalConv2DErrorCase11(TestFunctionalConv2DErrorCase10): + def setUp(self): + self.input = np.random.randn(1, 3, 3, 3) + self.filter = np.random.randn(3, 3, 1, 1) + self.num_filters = 3 + self.filter_size = 1 + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 0 + self.data_format = "NCHW" + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py index b413a56c07a9c..8ccaf30cbdb34 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py @@ -432,5 +432,81 @@ def setUp(self): self.data_format = "NDHWC" +class TestFunctionalConv3DErrorCase11(TestCase): + def setUp(self): + self.input = np.array([]) + self.filter = np.array([]) + self.num_filters = 0 + self.filter_size = 0 + self.bias = None + self.padding = 0 + self.stride = 1 + 
self.dilation = 1 + self.groups = 1 + self.data_format = "NCDHW" + + def static_graph_case(self): + main = fluid.Program() + start = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, start): + x = fluid.data("input", self.input.shape, dtype=paddle.float32) + y = fluid.layers.conv3d( + x, + self.num_filters, + self.filter_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + param_attr=I.NumpyArrayInitializer(self.filter), + bias_attr=False if self.bias is None else + I.NumpyArrayInitializer(self.bias), + act=None, + data_format=self.data_format) + exe = fluid.Executor() + exe.run(start) + out, = exe.run(main, feed={"input": self.input}, fetch_list=[y]) + return out + + def dygraph_case(self): + with dg.guard(): + x = dg.to_variable(self.input, dtype=paddle.float32) + w = dg.to_variable(self.filter, dtype=paddle.float32) + b = None if self.bias is None else dg.to_variable( + self.bias, dtype=paddle.float32) + y = F.conv3d( + x, + w, + b, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format) + + def test_dygraph_exception(self): + with self.assertRaises(ValueError): + self.dygraph_case() + + def test_static_exception(self): + with self.assertRaises(ValueError): + self.static_graph_case() + + +class TestFunctionalConv3DErrorCase12(TestFunctionalConv3DErrorCase11): + def setUp(self): + self.input = np.random.randn(1, 3, 3, 3, 3) + self.filter = np.random.randn(3, 3, 1, 1, 1) + self.num_filters = 3 + self.filter_size = 1 + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 0 + self.data_format = "NCDHW" + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py index 910d28515b778..a003de6596822 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py @@ -483,5 +483,82 @@ def setUp(self): self.data_format = "NCDHW" +class TestFunctionalConv3DTransposeErrorCase10(TestCase): + def setUp(self): + self.input = np.array([]) + self.filter = np.array([]) + self.num_filters = 0 + self.filter_size = 0 + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 1 + self.data_format = "NCDHW" + + def static_graph_case(self): + main = fluid.Program() + start = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, start): + x = fluid.data("input", self.input.shape, dtype=paddle.float32) + y = fluid.layers.conv3d_transpose( + x, + self.num_filters, + self.filter_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + param_attr=I.NumpyArrayInitializer(self.filter), + bias_attr=False if self.bias is None else + I.NumpyArrayInitializer(self.bias), + act=None, + data_format=self.data_format) + exe = fluid.Executor() + exe.run(start) + out, = exe.run(main, feed={"input": self.input}, fetch_list=[y]) + return out + + def dygraph_case(self): + with dg.guard(): + x = dg.to_variable(self.input, dtype=paddle.float32) + w = dg.to_variable(self.filter, dtype=paddle.float32) + b = None if self.bias is None else dg.to_variable( + self.bias, dtype=paddle.float32) + y = F.conv3d_transpose( + x, + w, + b, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + 
groups=self.groups, + data_format=self.data_format) + + def test_dygraph_exception(self): + with self.assertRaises(ValueError): + self.dygraph_case() + + def test_static_exception(self): + with self.assertRaises(ValueError): + self.static_graph_case() + + +class TestFunctionalConv3DTransposeErrorCase11( + TestFunctionalConv3DTransposeErrorCase10): + def setUp(self): + self.input = np.random.randn(1, 3, 3, 3, 3) + self.filter = np.random.randn(3, 3, 1, 1, 1) + self.num_filters = 3 + self.filter_size = 1 + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 0 + self.data_format = "NCDHW" + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_memcpy_op.py b/python/paddle/fluid/tests/unittests/test_memcpy_op.py index 3fecef9397c63..d6efe4d471efd 100755 --- a/python/paddle/fluid/tests/unittests/test_memcpy_op.py +++ b/python/paddle/fluid/tests/unittests/test_memcpy_op.py @@ -64,7 +64,7 @@ def get_prog(self): }) return main_program, gpu_var, pinned_var - def test_gpu_cpoy_to_pinned(self): + def test_gpu_copy_to_pinned(self): main_program, gpu_var, pinned_var = self.get_prog() main_program.global_block().append_op( type='memcpy', @@ -79,7 +79,7 @@ def test_gpu_cpoy_to_pinned(self): self.assertTrue(np.allclose(gpu_, pinned_)) self.assertTrue(np.allclose(pinned_, np.ones((10, 10)))) - def test_pinned_cpoy_gpu(self): + def test_pinned_copy_gpu(self): main_program, gpu_var, pinned_var = self.get_prog() main_program.global_block().append_op( type='memcpy', @@ -94,6 +94,59 @@ def test_pinned_cpoy_gpu(self): self.assertTrue(np.allclose(gpu_, pinned_)) self.assertTrue(np.allclose(gpu_, np.zeros((10, 10)))) + def test_hip_copy_bool_value(self): + if core.is_compiled_with_rocm(): + paddle.enable_static() + main_program = Program() + with program_guard(main_program): + pinned_var_name = "tensor@Pinned" + gpu_var_name = "tensor@GPU" + pinned_var = main_program.global_block().create_var( + name=pinned_var_name, + shape=[1], + dtype='bool', + persistable=False, + stop_gradient=True) + gpu_var = main_program.global_block().create_var( + name=gpu_var_name, + shape=[1], + dtype='bool', + persistable=False, + stop_gradient=True) + main_program.global_block().append_op( + type="fill_constant", + outputs={"Out": gpu_var_name}, + attrs={ + "shape": [1], + "dtype": gpu_var.dtype, + "value": False, + "place_type": 1 + }) + main_program.global_block().append_op( + type="fill_constant", + outputs={"Out": pinned_var_name}, + attrs={ + "shape": [1], + "dtype": gpu_var.dtype, + "value": True, + "place_type": 2 + }) + + main_program.global_block().append_op( + type='memcpy', + inputs={'X': pinned_var}, + outputs={'Out': gpu_var}, + attrs={'dst_place_type': 1}) + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + gpu_, pinned_ = exe.run(main_program, + feed={}, + fetch_list=[gpu_var.name, pinned_var.name]) + expect_value = np.array([1]).astype('bool') + self.assertTrue(np.array_equal(gpu_, expect_value)) + else: + pass + class TestMemcpyOPError(unittest.TestCase): def get_prog(self): diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index 308a876977cf4..ec8602ec7e672 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -29,7 +29,7 @@ from ..framework import Program, default_main_program, default_startup_program from .details import wait_server_ready -__all__ = ['GradAllReduce', 'LocalSGD'] +__all__ = ['GradAllReduce', 
'LocalSGD', 'MultiThread'] OpRole = core.op_proto_and_checker_maker.OpRole @@ -97,8 +97,14 @@ def _transpile_startup_program(self): self.wait_port) self._broadcast_params() - def _init_communicator(self, program, current_endpoint, endpoints, rank, - ring_id, wait_port): + def _init_communicator(self, + program, + current_endpoint, + endpoints, + rank, + ring_id, + wait_port, + has_multitrainer=False): nranks = len(endpoints) other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) @@ -150,16 +156,28 @@ def _init_communicator(self, program, current_endpoint, endpoints, rank, 'other_endpoints': other_endpoints, self.op_role_key: OpRole.Forward }) - block.append_op( - type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': ring_id, - self.op_role_key: OpRole.Forward - }) + if not has_multitrainer: + block.append_op( + type='c_comm_init', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': ring_id, + self.op_role_key: OpRole.Forward + }) + else: + block.append_op( + type='c_comm_init_multitrainer', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'ntrainers': nranks, + 'trainer_id': rank, + 'ring_id': ring_id, + self.op_role_key: OpRole.Forward + }) def _broadcast_params(self): block = self.startup_program.global_block() @@ -425,7 +443,7 @@ class MultiThread(GradAllReduce): def __init__(self, nrings=1): GradAllReduce.__init__(self, nrings) - self.mode = "box" + self.mode = "single_process_multi_thread" def _transpile_startup_program(self): if len(self.endpoints) > 1: @@ -434,9 +452,9 @@ def _transpile_startup_program(self): print("total endpoints: ", self.endpoints) print("rank: %d, ring_id: %d" % (self.rank, self.nrings)) for ring_id in range(self.nrings): - self._init_communicator(self.startup_program, - self.current_endpoint, self.endpoints, - self.rank, ring_id, self.wait_port) + self._init_communicator( + self.startup_program, self.current_endpoint, self.endpoints, + self.rank, ring_id, self.wait_port, True) else: print("begin to _transpile_startup_program for single-node") diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 319248dfda2fa..fcf6f1cdac4b3 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -299,12 +299,20 @@ def conv1d(x, channel_last = (data_format == "NLC") channel_dim = -1 if channel_last else 1 conv2d_data_format = "NHWC" if channel_last else "NCHW" + if len(x.shape) != 3: + raise ValueError( + "Input x should be 3D tensor, but received x with the shape of {}". + format(x.shape)) num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) + if groups <= 0: + raise ValueError( + "The groups of conv1d should be greater than 0. Received groups: {}". + format(groups)) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," @@ -508,12 +516,20 @@ def conv2d(x, channel_last = (data_format == "NHWC") channel_dim = -1 if channel_last else 1 + if len(x.shape) != 4: + raise ValueError( + "Input x should be 4D tensor, but received x with the shape of {}". + format(x.shape)) num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: raise ValueError("The channel dimension of the input({}) " "should be defined. 
Received: {}.".format( x.shape, num_channels)) + if groups <= 0: + raise ValueError( + "The groups of conv2d should be greater than 0. Received groups: {}". + format(groups)) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," @@ -710,12 +726,20 @@ def conv1d_transpose(x, data_format)) channel_last = (data_format == "NLC") channel_dim = -1 if channel_last else 1 + if len(x.shape) != 3: + raise ValueError( + "Input x should be 3D tensor, but received x with the shape of {}". + format(x.shape)) num_channels = x.shape[channel_dim] if num_channels < 0: raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) + if groups <= 0: + raise ValueError( + "The groups of conv1d_transpose should be greater than 0. Received groups: {}". + format(groups)) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," @@ -964,11 +988,19 @@ def conv2d_transpose(x, data_format)) channel_last = (data_format == "NHWC") channel_dim = -1 if channel_last else 1 + if len(x.shape) != 4: + raise ValueError( + "Input x should be 4D tensor, but received x with the shape of {}". + format(x.shape)) num_channels = x.shape[channel_dim] if num_channels < 0: raise ValueError("The channel dimension of the input({}) " "should be defined. Received: {}.".format( x.shape, num_channels)) + if groups <= 0: + raise ValueError( + "The groups of conv2d_transpose should be greater than 0. Received groups: {}". + format(groups)) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," @@ -1167,12 +1199,20 @@ def conv3d(x, channel_last = (data_format == "NDHWC") channel_dim = -1 if channel_last else 1 + if len(x.shape) != 5: + raise ValueError( + "Input x should be 5D tensor, but received x with the shape of {}". + format(x.shape)) num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: raise ValueError( "The channel dimension of the input({}) should be defined. " "Received: {}.".format(x.shape, num_channels)) + if groups <= 0: + raise ValueError( + "The groups of conv3d should be greater than 0. Received groups: {}". + format(groups)) if num_channels % groups != 0: raise ValueError( "The number of input channels must be divisible by Attr(groups). " @@ -1358,12 +1398,20 @@ def conv3d_transpose(x, channel_last = (data_format == "NDHWC") channel_dim = -1 if channel_last else 1 + if len(x.shape) != 5: + raise ValueError( + "Input x should be 5D tensor, but received x with the shape of {}". + format(x.shape)) num_channels = x.shape[channel_dim] num_filters = weight.shape[1] if num_channels < 0: raise ValueError( "The channel dimension of the input({}) should be defined. " "Received: {}.".format(x.shape, num_channels)) + if groups <= 0: + raise ValueError( + "The groups of conv3d_transpose should be greater than 0. Received groups: {}". + format(groups)) if num_channels % groups != 0: raise ValueError( "The number of input channels must be divisible by Attr(groups). 
" diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 7e2880dbf64e8..29f3425cb7687 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1657,12 +1657,6 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None): data2 = paddle.trace(case2, offset=1, axis1=1, axis2=2) # data2.shape = [3] data3 = paddle.trace(case3, offset=-3, axis1=1, axis2=-1) # data2.shape = [3, 5] """ - if in_dygraph_mode(): - return _C_ops.trace(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2) - - inputs = {'Input': [x]} - attrs = {'offset': offset, 'axis1': axis1, 'axis2': axis2} - def __check_input(input, offset, dim1, dim2): check_dtype(x.dtype, 'Input', ['int32', 'int64', 'float16', 'float32', 'float64'], @@ -1677,11 +1671,11 @@ def __check_input(input, offset, dim1, dim2): axis1_ = axis1 if axis1 >= 0 else len(input_shape) + axis1 axis2_ = axis2 if axis2 >= 0 else len(input_shape) + axis2 - assert axis1_ < len(input_shape), \ + assert ((0 <= axis1_) and (axis1_ < len(input_shape))), \ "The argument axis1 is out of range (expected to be in range of [%d, %d], but got %d).\n" \ % (-(len(input_shape)), len(input_shape) - 1, axis1) - assert axis2_ < len(input_shape), \ + assert ((0 <= axis2_) and (axis2_ < len(input_shape))), \ "The argument axis2 is out of range (expected to be in range of [%d, %d], but got %d).\n" \ % (-(len(input_shape)), len(input_shape) - 1, axis2) @@ -1691,6 +1685,11 @@ def __check_input(input, offset, dim1, dim2): "But received axis1 = %d, axis2 = %d\n"%(axis1, axis2) __check_input(input, offset, axis1, axis2) + if in_dygraph_mode(): + return _C_ops.trace(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2) + + inputs = {'Input': [x]} + attrs = {'offset': offset, 'axis1': axis1, 'axis2': axis2} helper = LayerHelper('trace', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index dcaa1ca15e5dc..19fa84046ed2d 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -355,6 +355,8 @@ def __init__(self, *args, **kwargs): super(BuildExtension, self).__init__(*args, **kwargs) self.no_python_abi_suffix = kwargs.get("no_python_abi_suffix", True) self.output_dir = kwargs.get("output_dir", None) + # whether containing cuda source file in Extensions + self.contain_cuda_file = False def initialize_options(self): super(BuildExtension, self).initialize_options() @@ -432,6 +434,9 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, # shared library have same ABI suffix with core_(no)avx.so. 
# See https://stackoverflow.com/questions/34571583/understanding-gcc-5s-glibcxx-use-cxx11-abi-or-the-new-abi add_compile_flag(['-D_GLIBCXX_USE_CXX11_ABI=1'], cflags) + # Append this macor only when jointly compiling .cc with .cu + if not is_cuda_file(src) and self.contain_cuda_file: + cflags.append('-DPADDLE_WITH_CUDA') add_std_without_repeat( cflags, self.compiler.compiler_type, use_std14=True) @@ -506,6 +511,9 @@ def win_custom_spawn(cmd): elif isinstance(self.cflags, list): cflags = MSVC_COMPILE_FLAGS + self.cflags cmd += cflags + # Append this macor only when jointly compiling .cc with .cu + if not is_cuda_file(src) and self.contain_cuda_file: + cmd.append('-DPADDLE_WITH_CUDA') return original_spawn(cmd) @@ -633,6 +641,8 @@ def _record_op_info(self): for i, extension in enumerate(self.extensions): sources = [os.path.abspath(s) for s in extension.sources] + if not self.contain_cuda_file: + self.contain_cuda_file = any([is_cuda_file(s) for s in sources]) op_names = parse_op_name_from(sources) for op_name in op_names: diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 05365f028f51d..88c8ba3dab9f6 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -96,7 +96,6 @@ disable_wincpu_test="^jit_kernel_test$|\ ^test_bmn$|\ ^test_mobile_net$|\ ^test_resnet_v2$|\ -^test_resnet_amp$|\ ^test_build_strategy$|\ ^test_se_resnet$|\ ^disable_wincpu_test$"