From d85ceb56f224ef1e92c5c725b0c228a25985dd32 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 6 Sep 2022 11:23:29 +0000 Subject: [PATCH 1/3] simplify size op --- paddle/phi/kernels/cpu/size_kernel.cc | 32 ------------------ paddle/phi/kernels/gpu/amp_kernel.cu | 4 ++- paddle/phi/kernels/gpu/size_kernel.cu | 31 ----------------- .../size_kernel_impl.h => size_kernel.cc} | 33 +++++++++++-------- paddle/phi/kernels/size_kernel.h | 2 +- 5 files changed, 23 insertions(+), 79 deletions(-) delete mode 100644 paddle/phi/kernels/cpu/size_kernel.cc delete mode 100644 paddle/phi/kernels/gpu/size_kernel.cu rename paddle/phi/kernels/{impl/size_kernel_impl.h => size_kernel.cc} (56%) diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc deleted file mode 100644 index 4019976ecec9c6..00000000000000 --- a/paddle/phi/kernels/cpu/size_kernel.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/size_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/size_kernel_impl.h" - -PD_REGISTER_KERNEL(size, - CPU, - ALL_LAYOUT, - phi::SizeKernel, - uint8_t, - int16_t, - int, - int64_t, - phi::dtype::float16, - float, - double, - bool) {} diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu index 51e11cc44b8563..230eb801d20d51 100644 --- a/paddle/phi/kernels/gpu/amp_kernel.cu +++ b/paddle/phi/kernels/gpu/amp_kernel.cu @@ -365,4 +365,6 @@ PD_REGISTER_KERNEL(update_loss_scaling, phi::UpdateLossScalingKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu deleted file mode 100644 index fb6acd5599a8e5..00000000000000 --- a/paddle/phi/kernels/gpu/size_kernel.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/size_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/size_kernel_impl.h" - -PD_REGISTER_KERNEL(size, - GPU, - ALL_LAYOUT, - phi::SizeKernel, - int16_t, - int, - int64_t, - phi::dtype::float16, - float, - double, - bool) {} diff --git a/paddle/phi/kernels/impl/size_kernel_impl.h b/paddle/phi/kernels/size_kernel.cc similarity index 56% rename from paddle/phi/kernels/impl/size_kernel_impl.h rename to paddle/phi/kernels/size_kernel.cc index f9757bc4477569..e197d3de28645e 100644 --- a/paddle/phi/kernels/impl/size_kernel_impl.h +++ b/paddle/phi/kernels/size_kernel.cc @@ -12,28 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +#include "paddle/phi/kernels/size_kernel.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" namespace phi { -template +template void SizeKernel(const Context& ctx, const DenseTensor& input, DenseTensor* out) { - auto place = ctx.GetPlace(); - auto out_data = ctx.template Alloc(out); - auto cpu_place = phi::CPUPlace(); - if (place == cpu_place) { - out_data[0] = input.numel(); - } else { - DenseTensor cpu_tensor; - cpu_tensor.Resize(out->dims()); - auto cpu_data = ctx.template HostAlloc(&cpu_tensor); - cpu_data[0] = input.numel(); - phi::Copy(ctx, cpu_tensor, place, false, out); - } + auto* out_data = ctx.template HostAlloc(out); + out_data[0] = input.numel(); } } // namespace phi + +PD_REGISTER_GENERAL_KERNEL( + size, CPU, ALL_LAYOUT, phi::SizeKernel, ALL_DTYPE) { + kernel->OutputAt(0).SetDataType(phi::DataType::INT64); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + size, GPU, ALL_LAYOUT, phi::SizeKernel, ALL_DTYPE) { + kernel->OutputAt(0) + .SetBackend(phi::Backend::CPU) + .SetDataType(phi::DataType::INT64); +} +#endif diff --git a/paddle/phi/kernels/size_kernel.h b/paddle/phi/kernels/size_kernel.h index 2d7a29104db081..6b4871778ea180 100644 --- a/paddle/phi/kernels/size_kernel.h +++ b/paddle/phi/kernels/size_kernel.h @@ -18,7 +18,7 @@ namespace phi { -template +template void SizeKernel(const Context& ctx, const DenseTensor& input, DenseTensor* out); } // namespace phi From 88f0a519df9640a92f0a2aab083b4f6900e4864d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 7 Sep 2022 05:15:27 +0000 Subject: [PATCH 2/3] trans to cuda manuly --- python/paddle/distributed/collective.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 74e350b4a537c9..9115cd67f08233 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -1143,7 +1143,9 @@ def all_gather_object(object_list, obj, group=None): # gather len_of_tensor from all ranks list_len_of_tensor = [] - all_gather(list_len_of_tensor, len_of_tensor, group) + all_gather(list_len_of_tensor, + len_of_tensor.cuda(paddle.distributed.ParallelEnv().device_id), + group) # get the max length from list max_len_of_tensor = int(max(list_len_of_tensor).item()) # resize the input tensor to max length avoid hang in all gather From 66a61d239d442c99e3a99773f87d84787e07d507 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 7 Sep 2022 12:04:41 +0000 Subject: [PATCH 3/3] fix copy error --- python/paddle/distributed/collective.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 9115cd67f08233..19f0796be22dd6 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -1140,12 +1140,13 @@ def all_gather_object(object_list, obj, group=None): ), "all_gather_object doesn't support static graph mode." tensor, len_of_tensor = _convert_object_to_tensor(obj) + if paddle.get_device() != "cpu": + len_of_tensor = len_of_tensor._copy_to( + paddle.framework._current_expected_place(), False) # gather len_of_tensor from all ranks list_len_of_tensor = [] - all_gather(list_len_of_tensor, - len_of_tensor.cuda(paddle.distributed.ParallelEnv().device_id), - group) + all_gather(list_len_of_tensor, len_of_tensor, group) # get the max length from list max_len_of_tensor = int(max(list_len_of_tensor).item()) # resize the input tensor to max length avoid hang in all gather