update #20

Merged (48 commits) on Jul 12, 2021

Changes from all 48 commits:
6b95e67
public api:add bn\ln\in; add static.xpu_place (#33897)
zhiboniu Jul 6, 2021
c9ae136
[hybrid performance] pipeline add program cache (#33954)
wangxicoding Jul 6, 2021
5085c44
make DataLoader warning less noisy. test=develop (#33712)
heavengate Jul 6, 2021
c6b6ba1
Add gpu implementation of shuffle_batch_op (#33938)
sneaxiy Jul 6, 2021
afddcb9
MemoryOptimizePass enhancement (#33933)
b3602sss Jul 6, 2021
b1c458d
add so parser (#33969)
Thunderbrook Jul 6, 2021
d128c28
optimize index computation of roll (#33909)
Jul 7, 2021
84e813e
[xpu] add dropout & amp ops in xpu place (#33891)
taixiurong Jul 7, 2021
a0666b9
Split op oneDNN AVX2 fix (#33944)
jakpiase Jul 7, 2021
375e561
Added PRelu BF16/FP32 FWD/BWD kernels (#33878)
jakpiase Jul 7, 2021
758dd7b
add no tensorrt warning (#33874)
Jul 7, 2021
20da770
[HIP] Fix hipMemcpy failing to overlap; after this change AMD GPU performance improves by more than 10% (#33982)
xymyeah Jul 7, 2021
cbf22d6
[NPU] NpuOpRunner supports host tensor as input (#33992)
zhiqiu Jul 7, 2021
cb73fee
add Wait after TensorCopy (#34005)
pangyoki Jul 7, 2021
0914ff9
fix reshape trt condition (#34007)
jiweibo Jul 7, 2021
77a5b8b
fix some errors about pass enhance, test=develop (#33993)
winter-wang Jul 7, 2021
1e5437d
Fix test_jit_save_load random failure. (#34004)
hbwx24 Jul 8, 2021
86cb3fb
Distributed Automatic SParsity with Fleet (#33558)
mingxu1067 Jul 8, 2021
80bd093
fix zip inference library bug (#34025)
zhwesky2010 Jul 8, 2021
05643dc
[pass_enhance]add global extra attributes for op def, test=develop (#…
winter-wang Jul 8, 2021
6a36977
fix the bug, test=develop (#33996)
Jul 8, 2021
97faf90
add num_iters in fit/evalate (#33986)
ghostxsl Jul 8, 2021
9b611ea
opt dygraph python code for 215 unchecked calls (#34024)
Jul 8, 2021
6df7ac7
up cxx11 to cxx14 (#34015)
chenwhql Jul 8, 2021
e22701c
delete the function of saving layer object. (#33697)
hbwx24 Jul 8, 2021
9f0411f
fix for no python coverage found (#33848)
XieYunshen Jul 8, 2021
11f5a40
correct conditions of gather in opteller (#33999)
b3602sss Jul 8, 2021
3508bd2
Add the op def for elementwise_mul and enhance layer_norm_fuse_pass (…
dyning Jul 8, 2021
8768ffb
fix double grad hang bug (#34023)
sneaxiy Jul 9, 2021
477d9f1
add NVIDIAN into AUTHORS (#34035)
jeng1220 Jul 9, 2021
fd85be8
[PTQ ] wrap simulated layers and save the quantized model (#33962)
juncaipeng Jul 9, 2021
1f28968
[NPU] Fix vector overflow in slice grad npu op (#34032)
zhiqiu Jul 9, 2021
7858d33
[dygraph qat] change default config and fix bug (#34047)
juncaipeng Jul 9, 2021
0a9ad8d
opt dygraph python code (#33997)
wanghuancoder Jul 9, 2021
033d736
fix output data type selection (#34040)
zlsh80826 Jul 9, 2021
78ab656
depthwise_conv_mkl_pass (#33936)
Jul 9, 2021
1412d3b
Use CBLAS for SelectedRows elementwise add operation. (#34008)
arlesniak Jul 9, 2021
dfff52e
refine varbase init function (#34052)
zhiqiu Jul 9, 2021
98c7191
[hybrid performance] pipeline cache trainer (#33998)
FeixLiu Jul 9, 2021
bfbea8f
add paddle/linalg.py to add new linalg apis (#34033)
zhiboniu Jul 12, 2021
e2e1c57
softmax mask fuse upper triangle (#33981)
FeixLiu Jul 12, 2021
113539e
tem_fix_reshape_unitest (#34069)
Wangzheee Jul 12, 2021
871edad
[NPU] slice support Tensor Input (#34067)
pangyoki Jul 12, 2021
4d84205
[NPU] change ScatterAdd to EmbeddingDenseGrad in lookup_table NPU op …
pangyoki Jul 12, 2021
c4e0498
[NPU] add dropout npu op (#34081)
pangyoki Jul 12, 2021
4d259b9
Support finetuning the model saved on the mac platform on the Linux p…
hbwx24 Jul 12, 2021
2dde0eb
optimize performance of multiple-dimension reduce (#33761)
ZzSean Jul 12, 2021
0b20b76
[NPU] add NPU ops of stack and unstack, test=develop (#34084)
qili93 Jul 12, 2021
3 changes: 3 additions & 0 deletions AUTHORS.md
@@ -78,3 +78,6 @@
| zhaopu7 | Pu Zhao |
| zhouxiao-coder | Xiao Zhou |
| Zrachel | Rui-Qing Zhang |
| jeng1220 | Bai-Cheng(Ryan) Jeng (NVIDIA) |
| mingxu1067 | Ming Huang (NVIDIA) |
| zlsh80826 | Reese Wang (NVIDIA) |
1 change: 1 addition & 0 deletions cmake/cuda.cmake
@@ -233,3 +233,4 @@ endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

include(thrust)
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
ENDIF()

SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210625")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701")
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
2 changes: 2 additions & 0 deletions cmake/hip.cmake
@@ -85,3 +85,5 @@ message(STATUS "HIP library name: ${hip_library_name}")
# set HIP link libs
find_library(ROCM_HIPRTC_LIB ${hip_library_name} HINTS ${HIP_PATH}/lib)
message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}")

include(thrust)
24 changes: 24 additions & 0 deletions cmake/thrust.cmake
@@ -0,0 +1,24 @@
function(add_thrust_patches_if_necessary)
set(thrust_detect_file ${PROJECT_BINARY_DIR}/detect_thrust.cu)
file(WRITE ${thrust_detect_file} ""
"#include \"thrust/version.h\"\n"
"#include \"thrust/shuffle.h\"\n"
"#include \"stdio.h\"\n"
"int main() {\n"
" int version = THRUST_VERSION;\n"
" printf(\"%d\", version);\n"
" return 0;\n"
"}\n")

execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
"--run" "${thrust_detect_file}"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res ERROR_QUIET)
if(NOT nvcc_res EQUAL 0)
set(thrust_patches "${PADDLE_SOURCE_DIR}/patches/thrust")
message(STATUS "Add thrust patches: ${thrust_patches}")
include_directories(${thrust_patches})
endif()
endfunction()

add_thrust_patches_if_necessary()
4 changes: 2 additions & 2 deletions paddle/fluid/extension/include/ext_all.h
@@ -14,8 +14,8 @@ limitations under the License. */

#pragma once

#if !defined(_MSC_VER) && __cplusplus < 199711L
#error C++11 or later compatible compiler is required to use Paddle.
#if !defined(_MSC_VER) && __cplusplus < 201402L
#error C++14 or later compatible compiler is required to use Paddle.
#endif

#ifdef _WIN32
68 changes: 68 additions & 0 deletions paddle/fluid/framework/data_feed.cc
@@ -31,6 +31,11 @@ USE_INT_STAT(STAT_total_feasign_num_in_mem);
namespace paddle {
namespace framework {

DLManager& global_dlmanager_pool() {
static DLManager manager;
return manager;
}

void RecordCandidateList::ReSize(size_t length) {
mutex_.lock();
capacity_ = length;
@@ -366,6 +371,10 @@ void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
template <typename T>
void InMemoryDataFeed<T>::LoadIntoMemory() {
#ifdef _LINUX
if (!so_parser_name_.empty()) {
LoadIntoMemoryFromSo();
return;
}
VLOG(3) << "LoadIntoMemory() begin, thread_id=" << thread_id_;
std::string filename;
while (this->PickOneFile(&filename)) {
@@ -408,6 +417,51 @@ void InMemoryDataFeed<T>::LoadIntoMemory() {
#endif
}

template <typename T>
void InMemoryDataFeed<T>::LoadIntoMemoryFromSo() {
#ifdef _LINUX
VLOG(3) << "LoadIntoMemoryFromSo() begin, thread_id=" << thread_id_;

string::LineFileReader reader;
paddle::framework::CustomParser* parser =
global_dlmanager_pool().Load(so_parser_name_, slot_conf_);

std::string filename;
while (this->PickOneFile(&filename)) {
VLOG(3) << "PickOneFile, filename=" << filename
<< ", thread_id=" << thread_id_;
int err_no = 0;
this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_);
CHECK(this->fp_ != nullptr);
__fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER);

paddle::framework::ChannelWriter<T> writer(input_channel_);
T instance;
platform::Timer timeline;
timeline.Start();

while (1) {
if (!reader.getline(&*(fp_.get()))) {
break;
} else {
const char* str = reader.get();
ParseOneInstanceFromSo(str, &instance, parser);
}

writer << std::move(instance);
instance = T();
}

writer.Flush();
timeline.Pause();
VLOG(3) << "LoadIntoMemoryFromSo() read all lines, file=" << filename
<< ", cost time=" << timeline.ElapsedSec()
<< " seconds, thread_id=" << thread_id_;
}
VLOG(3) << "LoadIntoMemoryFromSo() end, thread_id=" << thread_id_;
#endif
}

// explicit instantiation
template class InMemoryDataFeed<Record>;

@@ -827,16 +881,23 @@ void MultiSlotInMemoryDataFeed::Init(
inductive_shape_index_.resize(all_slot_num);
use_slots_.clear();
use_slots_is_dense_.clear();
slot_conf_.resize(all_slot_num);
for (size_t i = 0; i < all_slot_num; ++i) {
const auto& slot = multi_slot_desc.slots(i);
all_slots_[i] = slot.name();
all_slots_type_[i] = slot.type();
use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;

slot_conf_[i].name = slot.name();
slot_conf_[i].type = slot.type();
slot_conf_[i].use_slots_index = use_slots_index_[i];

total_dims_without_inductive_[i] = 1;
inductive_shape_index_[i] = -1;
if (slot.is_used()) {
use_slots_.push_back(all_slots_[i]);
use_slots_is_dense_.push_back(slot.is_dense());
slot_conf_[i].use_slots_is_dense = slot.is_dense();
std::vector<int> local_shape;
if (slot.is_dense()) {
for (int j = 0; j < slot.shape_size(); ++j) {
@@ -869,6 +930,7 @@
}
visit_.resize(all_slot_num, false);
pipe_command_ = data_feed_desc.pipe_command();
so_parser_name_ = data_feed_desc.so_parser_name();
finish_init_ = true;
input_type_ = data_feed_desc.input_type();
}
@@ -887,6 +949,12 @@ void MultiSlotInMemoryDataFeed::GetMsgFromLogKey(const std::string& log_key,
*rank = (uint32_t)strtoul(rank_str.c_str(), NULL, 16);
}

void MultiSlotInMemoryDataFeed::ParseOneInstanceFromSo(const char* str,
Record* instance,
CustomParser* parser) {
parser->ParseOneInstance(str, instance);
}

bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
#ifdef _LINUX
thread_local string::LineFileReader reader;
95 changes: 95 additions & 0 deletions paddle/fluid/framework/data_feed.h
@@ -117,6 +117,94 @@ using PvInstance = PvInstanceObject*;

inline PvInstance make_pv_instance() { return new PvInstanceObject(); }

struct SlotConf {
std::string name;
std::string type;
int use_slots_index;
int use_slots_is_dense;
};

class CustomParser {
public:
CustomParser() {}
virtual ~CustomParser() {}
virtual void Init(const std::vector<SlotConf>& slots) = 0;
virtual void ParseOneInstance(const char* str, Record* instance) = 0;
};

typedef paddle::framework::CustomParser* (*CreateParserObjectFunc)();

class DLManager {
struct DLHandle {
void* module;
paddle::framework::CustomParser* parser;
};

public:
DLManager() {}

~DLManager() {
#ifdef _LINUX
std::lock_guard<std::mutex> lock(mutex_);
for (auto it = handle_map_.begin(); it != handle_map_.end(); ++it) {
delete it->second.parser;
dlclose(it->second.module);
}
#endif
}

bool Close(const std::string& name) {
#ifdef _LINUX
auto it = handle_map_.find(name);
if (it == handle_map_.end()) {
return true;
}
delete it->second.parser;
dlclose(it->second.module);
#endif
VLOG(0) << "Not implement in windows";
return false;
}

paddle::framework::CustomParser* Load(const std::string& name,
std::vector<SlotConf>& conf) {
#ifdef _LINUX
std::lock_guard<std::mutex> lock(mutex_);
DLHandle handle;
std::map<std::string, DLHandle>::iterator it = handle_map_.find(name);
if (it != handle_map_.end()) {
return it->second.parser;
}

handle.module = dlopen(name.c_str(), RTLD_NOW);
if (handle.module == nullptr) {
VLOG(0) << "Create so of " << name << " fail";
return nullptr;
}

CreateParserObjectFunc create_parser_func =
(CreateParserObjectFunc)dlsym(handle.module, "CreateParserObject");
handle.parser = create_parser_func();
handle.parser->Init(conf);
handle_map_.insert({name, handle});

return handle.parser;
#endif
VLOG(0) << "Not implement in windows";
return nullptr;
}

paddle::framework::CustomParser* ReLoad(const std::string& name,
std::vector<SlotConf>& conf) {
Close(name);
return Load(name, conf);
}

private:
std::mutex mutex_;
std::map<std::string, DLHandle> handle_map_;
};

class DataFeed {
public:
DataFeed() {
@@ -252,6 +340,8 @@ class DataFeed {
bool finish_set_filelist_;
bool finish_start_;
std::string pipe_command_;
std::string so_parser_name_;
std::vector<SlotConf> slot_conf_;
std::vector<std::string> ins_id_vec_;
std::vector<std::string> ins_content_vec_;
platform::Place place_;
@@ -324,10 +414,13 @@ class InMemoryDataFeed : public DataFeed {
virtual void SetEnablePvMerge(bool enable_pv_merge);
virtual void SetCurrentPhase(int current_phase);
virtual void LoadIntoMemory();
virtual void LoadIntoMemoryFromSo();

protected:
virtual bool ParseOneInstance(T* instance) = 0;
virtual bool ParseOneInstanceFromPipe(T* instance) = 0;
virtual void ParseOneInstanceFromSo(const char* str, T* instance,
CustomParser* parser) {}
virtual void PutToFeedVec(const std::vector<T>& ins_vec) = 0;

int thread_id_;
@@ -688,6 +781,8 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed<Record> {
protected:
virtual bool ParseOneInstance(Record* instance);
virtual bool ParseOneInstanceFromPipe(Record* instance);
virtual void ParseOneInstanceFromSo(const char* str, Record* instance,
CustomParser* parser);
virtual void PutToFeedVec(const std::vector<Record>& ins_vec);
virtual void GetMsgFromLogKey(const std::string& log_key, uint64_t* search_id,
uint32_t* cmatch, uint32_t* rank);
1 change: 1 addition & 0 deletions paddle/fluid/framework/data_feed.proto
@@ -33,4 +33,5 @@ message DataFeedDesc {
optional string rank_offset = 6;
optional int32 pv_batch_size = 7 [ default = 32 ];
optional int32 input_type = 8 [ default = 0 ];
optional string so_parser_name = 9;
}
3 changes: 3 additions & 0 deletions paddle/fluid/framework/device_worker.h
@@ -581,6 +581,7 @@ class SectionWorker : public DeviceWorker {
void RunUpdate(
std::unique_ptr<GarbageCollector>&,
std::unordered_map<const OperatorBase*, std::vector<std::string>>&);
void PrepareUnusedVar();

protected:
int section_id_;
@@ -595,6 +596,8 @@

std::vector<std::unique_ptr<OperatorBase>> ops_;
std::shared_ptr<framework::ProgramDesc> program_;
std::unordered_map<const OperatorBase*, std::vector<std::string>>
unused_vars_;
static uint64_t batch_id_;

platform::DeviceContext* dev_ctx_ = nullptr;
2 changes: 2 additions & 0 deletions paddle/fluid/framework/distributed_strategy.proto
@@ -1,4 +1,5 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -189,6 +190,7 @@ message DistributedStrategy {
optional bool without_graph_optimization = 30 [ default = false ];
optional int32 fuse_grad_size_in_num = 31 [ default = 1 ];
optional bool calc_comm_same_stream = 32 [ default = false ];
optional bool asp = 33 [ default = false ];

optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
@@ -91,7 +91,8 @@ ConvElementwiseAdd2ActFusePass::ConvElementwiseAdd2ActFusePass() {
.End()
.AddAttr("axis")
// the first elementwise_add-axis needs to be 1, the second has to be -1
.IsIntIn({1, -1})
// or 0
.IsIntIn({1, -1, 0})
.End();

AddOpCompat(OpCompat("relu"))
25 changes: 20 additions & 5 deletions paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2262,11 +2262,26 @@ PDNode *patterns::QuantizePlacement::operator()(
PDNode *patterns::Bfloat16Placement::operator()(
const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
std::unordered_set<std::string> supported_op_types =
std::unordered_set<std::string>(
{"concat", "conv2d", "conv2d_transpose", "elementwise_add",
"elementwise_mul", "fc", "fusion_gru", "fusion_lstm", "gelu",
"layer_norm", "matmul", "matmul_v2", "pool2d", "relu", "reshape2",
"softmax", "split", "sum", "transpose2"});
std::unordered_set<std::string>({"concat",
"conv2d",
"conv2d_transpose",
"elementwise_add",
"elementwise_mul",
"fc",
"fusion_gru",
"fusion_lstm",
"gelu",
"layer_norm",
"matmul",
"matmul_v2",
"pool2d",
"prelu",
"relu",
"reshape2",
"softmax",
"split",
"sum",
"transpose2"});
if (!bfloat16_enabled_op_types.empty()) {
supported_op_types = bfloat16_enabled_op_types;
}