Commit
Merge branch 'develop' into cmake_format
betterpig committed Jun 3, 2022
2 parents 2ceb85c + cb1a0ec commit a5fe84a
Showing 48 changed files with 1,586 additions and 166 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/eager/utils.cc
@@ -27,7 +27,7 @@
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/variable.h"
 
-PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, true,
+PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, false,
                             "retain grad for all tensor");
 
 namespace egr {
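Note: PADDLE_DEFINE_EXPORTED_bool declares a gflags-style flag, so flipping the default to false does not remove the capability; users can still opt back in at runtime (e.g. FLAGS_retain_grad_for_all_tensor=1 in the environment). A minimal sketch of how such a flag is typically consulted; the helper below is illustrative, not the actual call site:

// Illustrative sketch, not the real code path: consulting a flag defined
// with PADDLE_DEFINE_EXPORTED_bool (gflags-style semantics assumed).
DECLARE_bool(retain_grad_for_all_tensor);

void MaybeRetainGrad(paddle::experimental::Tensor* tensor) {
  if (FLAGS_retain_grad_for_all_tensor) {
    // With the new default (false) this branch is skipped unless the user
    // opts in, so intermediate tensors stop retaining grads by default.
    egr::EagerUtils::CheckAndRetainGrad(*tensor);  // hypothetical call site
  }
}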
3 changes: 2 additions & 1 deletion paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc
@@ -188,7 +188,8 @@ class DeQuantizer final : public Quanter {
   bool IsNotPermittedName(const std::string& output_name) const override {
     std::unordered_map<std::string, std::vector<std::string>> block_list{
         {"layer_norm",
-         {"Mean", "Variance"}}};  // not used in inference in MKLDNN
+         {"Mean", "Variance"}},  // not used in inference in MKLDNN
+        {"fc", {"ResidualData"}}};  // artificial output, already dequantized
 
     std::vector<std::string> blocked_outputs{"XShape"};  // blocklist for any op
     auto op_name = op->Name();
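Note: block_list maps an op type to output names that the bfloat16 dequantizer must leave alone; this hunk adds fc's ResidualData to that list. A sketch of the membership test implied by the data structure (assumed shape, not copied from the pass):

// Assumed lookup logic, for illustration only.
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

bool IsBlocked(
    const std::unordered_map<std::string, std::vector<std::string>>& block_list,
    const std::string& op_name, const std::string& output_name) {
  auto it = block_list.find(op_name);  // e.g. op_name == "fc"
  if (it == block_list.end()) return false;
  const auto& names = it->second;      // e.g. {"ResidualData"}
  return std::find(names.begin(), names.end(), output_name) != names.end();
}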
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -348,6 +348,10 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
 void CpuPassStrategy::EnableMkldnnBfloat16() {
 #ifdef PADDLE_WITH_MKLDNN
   if (!use_mkldnn_bfloat16_) {
+    passes_.push_back("fc_mkldnn_pass");
+    passes_.push_back("fc_act_mkldnn_fuse_pass");
+    passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass");
+
     passes_.push_back("cpu_bfloat16_placement_pass");
     passes_.push_back("cpu_bfloat16_pass");
     passes_.push_back("cpu_quantize_squash_pass");
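Note: the fc_* passes are added ahead of the bfloat16 passes, presumably so fc ops are fused into their MKLDNN form before bfloat16 placement runs over the graph. A user-side sketch of the switch that triggers this pipeline; the header path and model directory are placeholders:

// Sketch: enabling the MKLDNN bfloat16 pipeline via the public inference API.
#include "paddle_inference_api.h"  // adjust the include path to your install

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("model_dir");   // placeholder model directory
  config.EnableMKLDNN();
  config.EnableMkldnnBfloat16();  // appends the passes added in this hunk
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}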
paddle/fluid/operators/collective/c_allreduce_sum_op.kps
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifdef PADDLE_WITH_XPU_KP
+
+// Please do not modify the following code
+#if defined(__CUDA_ARCH__)
+#undef __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC__)
+#undef __CUDACC__
+#endif
+
+#if defined(__CUDA__)
+#undef __CUDA__
+#endif
+
+#if defined(__NVCC__)
+#undef __NVCC__
+#endif
+
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_XPU_KERNEL(c_allreduce_sum,
-                       ops::CAllReduceOpXPUKernel<ops::kRedSum, float>)
+REGISTER_OP_KERNEL(c_allreduce_sum, KP, plat::XPUPlace,
+                   ops::CAllReduceOpXPUKernel<ops::kRedSum, float>);
+
+#endif
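Note on the #undef block: .kps sources are built by an XPU Kernel Primitive (KP) toolchain whose CUDA-like front end predefines macros such as __CUDACC__ and __NVCC__; undefining them before including the shared c_allreduce_op.h steers that header's conditional compilation to the XPU branch. An assumed illustration of the dispatch being steered (not the actual header):

// Illustrative dispatch shape assumed inside a shared header; with the
// macros #undef'ed above, the non-CUDA branch is the one compiled.
#if defined(__NVCC__) || defined(__CUDACC__)
// CUDA/GPU implementation would be selected here.
#elif defined(PADDLE_WITH_XPU_KP)
// XPU/KP implementation is selected instead.
#endif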
58 changes: 53 additions & 5 deletions paddle/fluid/operators/collective/c_comm_init_all_op.cc
@@ -17,11 +17,16 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 
 #include "paddle/fluid/framework/threadpool.h"
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
 
+#if defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
+#endif
+
 namespace paddle {
 namespace framework {
 class InferShapeContext;
@@ -48,9 +53,9 @@ class CCommInitAllOp : public framework::OperatorBase {
 
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "CCommInitAllOp can run on gpu place only"));
+    // PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true,
+    //                   platform::errors::PreconditionNotMet(
+    //                       "CCommInitAllOp can run on gpu place only"));
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     std::vector<int> devices = Attr<std::vector<int>>("devices");
@@ -61,9 +66,52 @@ class CCommInitAllOp : public framework::OperatorBase {
     int rid = Attr<int>("ring_id");
 
     platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices, rid);
+
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    std::vector<int> devices = Attr<std::vector<int>>("devices");
+    int ring_id = Attr<int>("ring_id");
+
+    if (devices.empty()) {
+      int count = platform::GetXPUDeviceCount();
+      for (int i = 0; i < count; ++i) {
+        devices.push_back(i);
+      }
+    }
+
+    if (devices.size() > 1) {
+      std::vector<platform::Place> place_list_;
+      for (size_t i = 0; i < devices.size(); ++i) {
+        auto p = platform::XPUPlace(devices[i]);
+        place_list_.push_back(p);
+      }
+
+      // create pthread to bkcl_init_rank on all devices
+      auto ptr = new platform::BKCLContextMap(place_list_);
+      ptr->init();
+
+      for (size_t i = 0; i < devices.size(); ++i) {
+        platform::BKCLCommContext::Instance().AssignBKCLComm(
+            ptr->contexts_.at(devices[i]).comm_, devices.size(), devices[i],
+            devices[i], ring_id);
+
+        VLOG(0) << "bkcl communicator of rank " << devices[i] << " in ring "
+                << ring_id << " has been created on device " << devices[i];
+
+        // TODO(WorgenZhang): need release comm_map_ when quit
+        // std::call_once(once_flag_, []() {
+        //   std::atexit([]() {
+        //     platform::BKCLCommContext::Instance().ReleaseBKCLComms(); });
+        // });
+      }
+
+      VLOG(0) << "done bkcl_init_rank on all devices";
+    } else {
+      VLOG(0)
+          << "bkcl_init_rank doesn't support on one device, skip init process";
+    }
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
+        "PaddlePaddle should compile with GPU or XPU."));
 #endif
   }
 };
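Note: the new BKCL branch mirrors the NCCL path above: enumerate the XPU devices, initialize one communicator per device through BKCLContextMap (whose init() presumably drives bkcl_init_rank concurrently on all devices, per the pthread comment in the diff), then register each communicator with BKCLCommContext. Collective kernels can later fetch a communicator by ring id and place; a short sketch following the BKCLCommContext usage pattern visible elsewhere in this commit:

// Sketch: retrieving the communicator registered by AssignBKCLComm, mirroring
// the usage in the c_sync_comm_stream kernel shown later in this diff.
auto comm_dev_ctx =
    platform::BKCLCommContext::Instance().Get(ring_id, place)->dev_context();
comm_dev_ctx->Wait();  // e.g. block until the comm stream's work finishes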
9 changes: 0 additions & 9 deletions paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -97,18 +97,9 @@ class CCommInitOp : public framework::OperatorBase {
     if (Attr<int>("device_id") >= 0) {
       device_id = Attr<int>("device_id");
     }
-
-#if defined(PADDLE_WITH_XPU_BKCL) && defined(PADDLE_WITH_HETERPS) && \
-    defined(PADDLE_WITH_PSLIB)
-    // XPUPS rank_id only equals 0, so replace rank_id with device_id
-    CommContext::Instance().CreateComm(comm_id, nranks, device_id, device_id,
-                                       rid);
-#else
     int rank_id = Attr<int>("rank");
     CommContext::Instance().CreateComm(comm_id, nranks, rank_id, device_id,
                                        rid);
-#endif
-
 #endif
   }
 };
1 change: 0 additions & 1 deletion paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
@@ -23,7 +23,6 @@ class CSyncCalcStreamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor) Dependency of the variable need to sync");
     AddComment(R"DOC(
 CSyncCalcStream Operator
-
 Call calculation stream synchronization.
 )DOC");
   }
42 changes: 42 additions & 0 deletions paddle/fluid/operators/collective/c_sync_calc_stream_op.kps
@@ -0,0 +1,42 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU_KP
+
+// Please do not modify the following code
+#if defined(__CUDA_ARCH__)
+#undef __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC__)
+#undef __CUDACC__
+#endif
+
+#if defined(__CUDA__)
+#undef __CUDA__
+#endif
+
+#if defined(__NVCC__)
+#undef __NVCC__
+#endif
+
+#include "paddle/fluid/operators/collective/c_sync_calc_stream_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_KERNEL(c_sync_calc_stream, KP, plat::XPUPlace,
+                   ops::CSyncCalcStreamKernel<float>);
+
+#endif
73 changes: 1 addition & 72 deletions paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
@@ -11,25 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
-#endif
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/device/npu/hccl_helper.h"
-#endif
-
-#if defined(PADDLE_WITH_CNCL)
-#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
-#endif
-
-#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
+#include "paddle/fluid/operators/collective/c_sync_comm_stream_op.h"
 
 namespace paddle {
 namespace operators {
@@ -58,62 +40,11 @@ class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("ring_id", "(int default 0) ring id.").SetDefault(0);
     AddComment(R"DOC(
 CSyncCommStream Operator
-
 Call communication stream synchronization.
 )DOC");
   }
 };
 
-template <typename T>
-class CSyncCommStreamKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto place = ctx.GetPlace();
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto stream =
-        platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
-
-    platform::GpuStreamSync(stream);
-
-#elif defined(PADDLE_WITH_ASCEND_CL)
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync comm stream op can run on npu place only for "
-                          "now, but we got %s, please check the environment.",
-                          place.DebugString()));
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto stream =
-        platform::HCCLCommContext::Instance().Get(ring_id, place)->stream();
-    platform::NPUStreamSync(stream);
-
-#elif defined(PADDLE_WITH_CNCL)
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync stream op can run on mlu place only for now."));
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto stream =
-        platform::CNCLCommContext::Instance().Get(ring_id, place)->stream();
-    platform::MLUStreamSync(stream);
-#elif defined(PADDLE_WITH_XPU_BKCL)
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync stream op can run on xpu place only for now."));
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto comm_dev_ctx = platform::BKCLCommContext::Instance()
-                            .Get(ring_id, place)
-                            ->dev_context();
-    comm_dev_ctx->Wait();
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
-#endif
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -127,5 +58,3 @@ REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel<float>);
 REGISTER_OP_NPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel<float>);
 
 REGISTER_OP_MLU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel<float>);
-
-REGISTER_OP_XPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel<float>);
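Note: the net effect in this file is that the templated CSyncCommStreamKernel moves out of the .cc into c_sync_comm_stream_op.h (now the sole include), so the same kernel body can be registered from this translation unit and from a KP (.kps) one. An assumed sketch of the header's shape after the move:

// Assumed layout of c_sync_comm_stream_op.h after the refactor (sketch only).
#pragma once

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename T>
class CSyncCommStreamKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Platform-specific stream synchronization: the same #if/#elif chain
    // (NCCL/HCCL/CNCL/BKCL) as in the body deleted above.
  }
};

}  // namespace operators
}  // namespace paddle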
1 comment on commit a5fe84a

@paddle-bot-old commented:
Congratulations! Your pull request passed all required CI. You could ask reviewer(s) to approve and merge. 🎉
