From d85ceb56f224ef1e92c5c725b0c228a25985dd32 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 6 Sep 2022 11:23:29 +0000
Subject: [PATCH 1/3] simplify size op

---
 paddle/phi/kernels/cpu/size_kernel.cc         | 32 ------------------
 paddle/phi/kernels/gpu/amp_kernel.cu          |  4 ++-
 paddle/phi/kernels/gpu/size_kernel.cu         | 31 -----------------
 .../size_kernel_impl.h => size_kernel.cc}     | 33 +++++++++++--------
 paddle/phi/kernels/size_kernel.h              |  2 +-
 5 files changed, 23 insertions(+), 79 deletions(-)
 delete mode 100644 paddle/phi/kernels/cpu/size_kernel.cc
 delete mode 100644 paddle/phi/kernels/gpu/size_kernel.cu
 rename paddle/phi/kernels/{impl/size_kernel_impl.h => size_kernel.cc} (56%)

diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc
deleted file mode 100644
index 4019976ecec9c6..00000000000000
--- a/paddle/phi/kernels/cpu/size_kernel.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/size_kernel.h"
-
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/size_kernel_impl.h"
-
-PD_REGISTER_KERNEL(size,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::SizeKernel,
-                   uint8_t,
-                   int16_t,
-                   int,
-                   int64_t,
-                   phi::dtype::float16,
-                   float,
-                   double,
-                   bool) {}
diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu
index 51e11cc44b8563..230eb801d20d51 100644
--- a/paddle/phi/kernels/gpu/amp_kernel.cu
+++ b/paddle/phi/kernels/gpu/amp_kernel.cu
@@ -365,4 +365,6 @@ PD_REGISTER_KERNEL(update_loss_scaling,
                    phi::UpdateLossScalingKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16) {
+  kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND);
+}
diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu
deleted file mode 100644
index fb6acd5599a8e5..00000000000000
--- a/paddle/phi/kernels/gpu/size_kernel.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/size_kernel.h"
-
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/size_kernel_impl.h"
-
-PD_REGISTER_KERNEL(size,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::SizeKernel,
-                   int16_t,
-                   int,
-                   int64_t,
-                   phi::dtype::float16,
-                   float,
-                   double,
-                   bool) {}
diff --git a/paddle/phi/kernels/impl/size_kernel_impl.h b/paddle/phi/kernels/size_kernel.cc
similarity index 56%
rename from paddle/phi/kernels/impl/size_kernel_impl.h
rename to paddle/phi/kernels/size_kernel.cc
index f9757bc4477569..e197d3de28645e 100644
--- a/paddle/phi/kernels/impl/size_kernel_impl.h
+++ b/paddle/phi/kernels/size_kernel.cc
@@ -12,28 +12,33 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#pragma once
+#include "paddle/phi/kernels/size_kernel.h"
 
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 
 namespace phi {
 
-template <typename T, typename Context>
+template <typename Context>
 void SizeKernel(const Context& ctx,
                 const DenseTensor& input,
                 DenseTensor* out) {
-  auto place = ctx.GetPlace();
-  auto out_data = ctx.template Alloc<int64_t>(out);
-  auto cpu_place = phi::CPUPlace();
-  if (place == cpu_place) {
-    out_data[0] = input.numel();
-  } else {
-    DenseTensor cpu_tensor;
-    cpu_tensor.Resize(out->dims());
-    auto cpu_data = ctx.template HostAlloc<int64_t>(&cpu_tensor);
-    cpu_data[0] = input.numel();
-    phi::Copy(ctx, cpu_tensor, place, false, out);
-  }
+  auto* out_data = ctx.template HostAlloc<int64_t>(out);
+  out_data[0] = input.numel();
 }
 
 }  // namespace phi
+
+PD_REGISTER_GENERAL_KERNEL(
+    size, CPU, ALL_LAYOUT, phi::SizeKernel<phi::CPUContext>, ALL_DTYPE) {
+  kernel->OutputAt(0).SetDataType(phi::DataType::INT64);
+}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_GENERAL_KERNEL(
+    size, GPU, ALL_LAYOUT, phi::SizeKernel<phi::GPUContext>, ALL_DTYPE) {
+  kernel->OutputAt(0)
+      .SetBackend(phi::Backend::CPU)
+      .SetDataType(phi::DataType::INT64);
+}
+#endif
diff --git a/paddle/phi/kernels/size_kernel.h b/paddle/phi/kernels/size_kernel.h
index 2d7a29104db081..6b4871778ea180 100644
--- a/paddle/phi/kernels/size_kernel.h
+++ b/paddle/phi/kernels/size_kernel.h
@@ -18,7 +18,7 @@
 
 namespace phi {
 
-template <typename T, typename Context>
+template <typename Context>
 void SizeKernel(const Context& ctx, const DenseTensor& input, DenseTensor* out);
 
 }  // namespace phi

From 88f0a519df9640a92f0a2aab083b4f6900e4864d Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 7 Sep 2022 05:15:27 +0000
Subject: [PATCH 2/3] transfer to cuda manually

---
 python/paddle/distributed/collective.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index 74e350b4a537c9..9115cd67f08233 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -1143,7 +1143,9 @@ def all_gather_object(object_list, obj, group=None):
 
     # gather len_of_tensor from all ranks
     list_len_of_tensor = []
-    all_gather(list_len_of_tensor, len_of_tensor, group)
+    all_gather(list_len_of_tensor,
+               len_of_tensor.cuda(paddle.distributed.ParallelEnv().device_id),
+               group)
     # get the max length from list
     max_len_of_tensor = int(max(list_len_of_tensor).item())
     # resize the input tensor to max length avoid hang in all gather

From 66a61d239d442c99e3a99773f87d84787e07d507 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 7 Sep 2022 12:04:41 +0000
Subject: [PATCH 3/3] fix copy error

---
 python/paddle/distributed/collective.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index 9115cd67f08233..19f0796be22dd6 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -1140,12 +1140,13 @@ def all_gather_object(object_list, obj, group=None):
     ), "all_gather_object doesn't support static graph mode."
 
     tensor, len_of_tensor = _convert_object_to_tensor(obj)
+    if paddle.get_device() != "cpu":
+        len_of_tensor = len_of_tensor._copy_to(
+            paddle.framework._current_expected_place(), False)
 
     # gather len_of_tensor from all ranks
     list_len_of_tensor = []
-    all_gather(list_len_of_tensor,
-               len_of_tensor.cuda(paddle.distributed.ParallelEnv().device_id),
-               group)
+    all_gather(list_len_of_tensor, len_of_tensor, group)
     # get the max length from list
     max_len_of_tensor = int(max(list_len_of_tensor).item())
     # resize the input tensor to max length avoid hang in all gather