
Commit f956aee
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into clear_add_index_put_api
Courtesy-Xs committed May 9, 2023
2 parents ed00d81 + 9cd0a5b commit f956aee
Showing 547 changed files with 11,084 additions and 4,595 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -75,7 +75,8 @@ tools/nvcc_lazy
# TODO(zhiqiang) Move this file to build directory.
paddle/fluid/pybind/eager_op_function.cc
tools/nvcc_lazy

paddle/phi/kernels/sparse/gpu/cutlass_generator/all_gemm_operations.h
paddle/phi/kernels/sparse/gpu/cutlass_generator/configurations.h

# these files (directories) are generated before build system generation
paddle/fluid/operators/generated_op*.cc
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -92,6 +92,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}")

if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND CMAKE_CXX_COMPILER_VERSION
VERSION_GREATER 10.4)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=uninitialized")
endif()

# MUSL build turn off warnings
if(WITH_MUSL)
set(CMAKE_CXX_FLAGS
31 changes: 24 additions & 7 deletions cmake/external/cutlass.cmake
@@ -42,20 +42,37 @@ ExternalProject_Add(
INSTALL_COMMAND ""
TEST_COMMAND "")

set(tmp_gemm_operations_file
${CMAKE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/cutlass_generator/generated/gemm/all_gemm_operations.h.tmp
)
set(tmp_configurations_file
${CMAKE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/cutlass_generator/generated/gemm/configurations.h.tmp
)
set(gemm_operations_file
${CMAKE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/cutlass_generator/all_gemm_operations.h
)
set(configurations_file
${CMAKE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/cutlass_generator/configurations.h
)

add_custom_target(
cutlass_codegen
COMMAND
rm -rf
${CMAKE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/cutlass_generator/build
COMMAND
mkdir -p
${CMAKE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/cutlass_generator/build/generated/gemm
COMMAND
${PYTHON_EXECUTABLE} -B
${CMAKE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_generator.py
"${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/tools/library/scripts/"
"${CMAKE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/cutlass_generator/build"
"${CMAKE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/cutlass_generator"
"${CMAKE_CUDA_COMPILER_VERSION}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_gemm_operations_file}
${gemm_operations_file}
COMMAND
${CMAKE_COMMAND} -E echo
"copy_if_different ${tmp_gemm_operations_file} to ${gemm_operations_file}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_configurations_file}
${configurations_file}
COMMAND
${CMAKE_COMMAND} -E echo
"copy_if_different ${tmp_configurations_file} to ${configurations_file}"
VERBATIM)

add_library(cutlass INTERFACE)
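The reworked codegen target regenerates the CUTLASS gather/scatter GEMM headers into a build directory, then copies them over the destination headers only when their contents actually changed, so targets that include them are not recompiled after a no-op regeneration. A minimal C++ sketch of the idea behind cmake -E copy_if_different, under the assumption that a byte-wise comparison is sufficient (names and paths here are illustrative, not Paddle's):

#include <filesystem>
#include <fstream>
#include <sstream>
#include <string>

namespace fs = std::filesystem;

// Read a whole file into memory for a byte-wise comparison.
static std::string ReadAll(const fs::path& p) {
  std::ifstream in(p, std::ios::binary);
  std::ostringstream buf;
  buf << in.rdbuf();
  return buf.str();
}

// Overwrite dst only when src differs; otherwise dst's timestamp is left
// untouched, which is what keeps dependent targets from rebuilding.
void CopyIfDifferent(const fs::path& src, const fs::path& dst) {
  if (!fs::exists(dst) || ReadAll(src) != ReadAll(dst)) {
    fs::copy_file(src, dst, fs::copy_options::overwrite_existing);
  }
}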
@@ -189,7 +189,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Collective(
std::vector<phi::DenseTensor>& outputs,
Fn fn,
CommType op_type,
bool sync_op,
bool sync_op UNUSED,
bool use_calc_stream) {
const auto places = GetPlaceList(inputs);
const auto key = GetKeyFromPlaces(places);
6 changes: 4 additions & 2 deletions paddle/fluid/distributed/ps/service/brpc_ps_server.h
@@ -227,8 +227,10 @@ class DownpourPServerBrpcClosure : public PServerClosure {
PsRequestMessage *request(size_t i) { return &_requests[i]; }
PsResponseMessage *response(size_t i) { return &_responses[i]; }
brpc::Controller *cntl(size_t i) { return _cntls[i].get(); }
int check_response(size_t request_idx, int cmd_id) { return 1; }
int check_save_response(size_t request_idx, int cmd_id) { return 1; }
int check_response(size_t request_idx UNUSED, int cmd_id UNUSED) { return 1; }
int check_save_response(size_t request_idx UNUSED, int cmd_id UNUSED) {
return 1;
}

private:
std::atomic<int32_t> _waiting_num;
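A recurring change in this commit is annotating intentionally ignored parameters of stub overrides with UNUSED, which silences unused-parameter warnings in builds that treat them as errors. A minimal sketch of such a macro, assuming the GCC/Clang unused attribute; Paddle's actual definition lives in its own header and may differ:

#include <cstddef>

// Hypothetical stand-alone definition, for illustration only.
#if defined(__GNUC__) || defined(__clang__)
#define UNUSED __attribute__((unused))
#else
#define UNUSED
#endif

// The stub below intentionally ignores both arguments; the annotation
// documents that and suppresses -Wunused-parameter without changing behavior.
int check_response(size_t request_idx UNUSED, int cmd_id UNUSED) { return 1; }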
20 changes: 10 additions & 10 deletions paddle/fluid/distributed/ps/service/communicator/communicator.h
@@ -287,10 +287,10 @@ class Communicator {
return {};
}
virtual void SaveFLStrategy(
const std::unordered_map<uint32_t, std::string> &fl_strategy) {}
const std::unordered_map<uint32_t, std::string> &fl_strategy UNUSED) {}
virtual void StartCoordinator(
const std::string &self_endpoint,
const std::vector<std::string> &trainer_endpoints) {}
const std::string &self_endpoint UNUSED,
const std::vector<std::string> &trainer_endpoints UNUSED) {}

virtual ~Communicator() {}
virtual void RpcProfilerControl();
@@ -337,13 +337,13 @@ class Communicator {

virtual void BarrierTriggerDecrement() {}

virtual void BarrierTriggerReset(int init_counter) {}
virtual void BarrierTriggerReset(int init_counter UNUSED) {}

virtual void InitEnvs() = 0;

virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx,
const RecvCtxMap &recv_varname_to_ctx,
Scope *recv_scope) {}
virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx UNUSED,
const RecvCtxMap &recv_varname_to_ctx UNUSED,
Scope *recv_scope UNUSED) {}

static Communicator *GetInstance() { return communicator_.get(); }

@@ -682,9 +682,9 @@ class FLCommunicator : public GeoCommunicator {
virtual void InitBrpcClient(const std::string &dist_desc,
const std::vector<std::string> &host_sign_list);

void InitImpl(const RpcCtxMap &send_varname_to_ctx,
const RecvCtxMap &recv_varname_to_ctx,
Scope *recv_scope) {}
void InitImpl(const RpcCtxMap &send_varname_to_ctx UNUSED,
const RecvCtxMap &recv_varname_to_ctx UNUSED,
Scope *recv_scope UNUSED) {}

void StartCoordinatorClient(
const std::vector<std::string> &trainer_endpoints);
4 changes: 2 additions & 2 deletions paddle/fluid/distributed/ps/service/coordinator_client.h
@@ -151,8 +151,8 @@ class CoordinatorService : public PsService {
::google::protobuf::Closure* done);

int32_t SaveFLClientInfo(const CoordinatorReqMessage& request,
CoordinatorResMessage* response,
brpc::Controller* cntl) {
CoordinatorResMessage* response UNUSED,
brpc::Controller* cntl UNUSED) {
_coordinator_service_handle->SaveFLClientInfo(request);
return 0;
}
11 changes: 6 additions & 5 deletions paddle/fluid/distributed/ps/table/common_graph_table.h
@@ -511,7 +511,7 @@ class GraphTable : public Table {
}
virtual ~GraphTable();

virtual void *GetShard(size_t shard_idx) { return 0; }
virtual void *GetShard(size_t shard_idx UNUSED) { return 0; }

static int32_t sparse_local_shard_num(uint32_t shard_num,
uint32_t server_num) {
@@ -624,15 +624,16 @@ class GraphTable : public Table {
Node *find_node(GraphTableType table_type, int idx, uint64_t id);
Node *find_node(GraphTableType table_type, uint64_t id);

virtual int32_t Pull(TableContext &context) { return 0; } // NOLINT
virtual int32_t Push(TableContext &context) { return 0; } // NOLINT
virtual int32_t Pull(TableContext &context UNUSED) { return 0; } // NOLINT
virtual int32_t Push(TableContext &context UNUSED) { return 0; } // NOLINT

virtual int32_t clear_nodes(GraphTableType table_type, int idx);
virtual void Clear() {}
virtual int32_t Flush() { return 0; }
virtual int32_t Shrink(const std::string &param) { return 0; }
virtual int32_t Shrink(const std::string &param UNUSED) { return 0; }
// Specify the save path
virtual int32_t Save(const std::string &path, const std::string &converter) {
virtual int32_t Save(const std::string &path UNUSED,
const std::string &converter UNUSED) {
return 0;
}
virtual int32_t InitializeShard() { return 0; }
2 changes: 1 addition & 1 deletion paddle/fluid/eager/amp_auto_cast.h
@@ -43,7 +43,7 @@ inline std::vector<paddle::Tensor> AmpAutoCasts(
const std::string& inputs_name,
const std::vector<paddle::Tensor>& inputs,
const phi::DataType& dst_dtype,
std::string op_name) {
std::string op_name UNUSED) {
VLOG(6) << "AMP AmpAutoCasts:"
<< " inputs(" << inputs_name << ") dst_dtype("
<< phi::DataTypeToString(dst_dtype) << ").";
6 changes: 4 additions & 2 deletions paddle/fluid/eager/nan_inf_utils.cc
@@ -115,7 +115,9 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {

void CheckTensorHasNanOrInf(const std::string& api_name,
const paddle::optional<Tensor>& tensor) {
CheckTensorHasNanOrInf(api_name, tensor.get());
if (tensor) {
CheckTensorHasNanOrInf(api_name, *tensor);
}
}

void CheckTensorHasNanOrInf(const std::string& api_name,
@@ -169,7 +171,7 @@ void CheckTensorHasNanOrInf(
const std::string& api_name,
const paddle::optional<std::vector<Tensor>>& tensors) {
if (tensors) {
CheckTensorHasNanOrInf(api_name, tensors.get());
CheckTensorHasNanOrInf(api_name, *tensors);
}
}

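Both hunks above apply the same fix: dereferencing an empty paddle::optional via get() is invalid, so the checks now inspect the value only when one is present. The same guard pattern with std::optional as a stand-in, an assumption for illustration (paddle::optional behaves analogously here):

#include <iostream>
#include <optional>
#include <string>

// Only inspect the value when one is actually present; an absent optional
// is a legitimate "nothing passed" case and is simply skipped.
void CheckHasValue(const std::string& api_name,
                   const std::optional<std::string>& value) {
  if (value) {
    std::cout << api_name << ": checking " << *value << "\n";
  }
}

int main() {
  CheckHasValue("relu", std::string("input tensor"));  // prints
  CheckHasValue("relu", std::nullopt);                 // safely does nothing
  return 0;
}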
4 changes: 1 addition & 3 deletions paddle/fluid/framework/CMakeLists.txt
@@ -928,9 +928,7 @@ if(WITH_DISTRIBUTE)
fleet_executor)
endif()
elseif(WITH_PSLIB)
set(DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set(DISTRIBUTE_COMPILE_FLAGS "")
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
endif()
5 changes: 0 additions & 5 deletions paddle/fluid/framework/dlpack_tensor.cc
@@ -89,11 +89,6 @@ struct DLDeviceVisitor
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}

inline ::DLDevice operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}

inline ::DLDevice operator()(const platform::NPUPinnedPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::NPUPinnedPlace is not supported"));
14 changes: 4 additions & 10 deletions paddle/fluid/framework/ir/CMakeLists.txt
@@ -104,7 +104,6 @@ pass_library(delete_dropout_op_pass inference)
pass_library(delete_concat_op_pass inference)
pass_library(delete_c_identity_op_pass inference)
pass_library(preln_residual_bias_fuse_pass inference)
pass_library(delete_fill_constant_op_pass inference)
pass_library(constant_folding_pass inference)
pass_library(auto_mixed_precision_pass inference)
pass_library(conv2d_fusion_layout_transfer_pass inference)
@@ -118,7 +117,6 @@ pass_library(fused_multi_transformer_encoder_pass inference)
pass_library(fused_multi_transformer_decoder_pass inference)
pass_library(fuse_multi_transformer_layer_pass inference)
pass_library(adaptive_pool2d_convert_global_pass inference)
pass_library(unsqueeze2_eltwise_fuse_pass inference)
pass_library(yolo_box_fuse_pass inference)
pass_library(layer_norm_fuse_pass inference)
pass_library(add_support_int8_pass inference)
@@ -242,7 +240,7 @@ if(WITH_XPU)
pass_library(one_beam_size_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
pass_library(delete_isolated_node_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
pass_library(fused_multi_transformer_xpu_quant_pass inference DIR xpu DEPS
pass_library(fused_multi_transformer_xpu_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
pass_library(stack_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
pass_library(fused_multi_transformer_cachekv_layout_trans_pass inference DIR
@@ -391,10 +389,6 @@ cc_test(
test_adaptive_pool2d_convert_global_pass
SRCS adaptive_pool2d_convert_global_pass_tester.cc
DEPS adaptive_pool2d_convert_global_pass)
cc_test(
test_unsqueeze2_eltwise_fuse_pass_cc
SRCS unsqueeze2_eltwise_fuse_pass_tester.cc
DEPS unsqueeze2_eltwise_fuse_pass)
cc_test(
test_generate_pass_cc
SRCS generate_pass_tester.cc
@@ -519,9 +513,9 @@ if(WITH_XPU)
SRCS xpu/delete_isolated_node_pass_test.cc
DEPS delete_isolated_node_pass)
cc_test(
test_fused_multi_transformer_xpu_quant_pass
SRCS xpu/fused_multi_transformer_xpu_quant_pass_tester.cc
DEPS fused_multi_transformer_xpu_quant_pass)
test_fused_multi_transformer_xpu_pass
SRCS xpu/fused_multi_transformer_xpu_pass_tester.cc
DEPS fused_multi_transformer_xpu_pass)
cc_test(
test_one_beam_size_fuse_pass
SRCS xpu/one_beam_size_fuse_pass_test.cc
2 changes: 0 additions & 2 deletions paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
@@ -55,8 +55,6 @@ static phi::Backend ConvertPlaceToBackend(const phi::Place& place) {
return phi::Backend::GPU;
case phi::AllocationType::XPU:
return phi::Backend::XPU;
case phi::AllocationType::NPU:
return phi::Backend::NPU;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Cannot convert place(%d).", static_cast<int>(place.GetType())));
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/constant_folding_pass.cc
@@ -64,7 +64,7 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const {
platform::errors::Fatal(
"scope must not be null when applying constant floding."));

std::vector<std::string> blacklist{"feed", "matrix_multiply"};
std::vector<std::string> blacklist{"feed", "matrix_multiply", "save"};

auto op_node_sorted = framework::ir::TopologyVarientSort(
*graph, static_cast<framework::ir::SortKind>(0));
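Adding "save" to the blacklist keeps constant folding away from ops whose output may be computable ahead of time but whose side effect (persisting a variable) must still run at execution. A sketch of how such a blacklist is typically consulted while visiting the sorted op nodes; the names below are illustrative, not the pass's actual code:

#include <algorithm>
#include <string>
#include <vector>

// Returns true when op_type must never be folded away.
bool IsBlacklisted(const std::vector<std::string>& blacklist,
                   const std::string& op_type) {
  return std::find(blacklist.begin(), blacklist.end(), op_type) !=
         blacklist.end();
}

// Hypothetical usage inside the visiting loop:
//   if (IsBlacklisted(blacklist, op_node->Op()->Type())) continue;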
(The remaining changed files in this commit are not shown here.)
