diff --git a/.gitignore b/.gitignore index a2009a1ed30a1c..801790d0a47208 100644 --- a/.gitignore +++ b/.gitignore @@ -6,12 +6,14 @@ paddle/fluid/eager/api/generated/* paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec paddle/phi/api/backward/backward_api.h +paddle/phi/api/backward/sparse_bw_api.h paddle/phi/api/include/api.h paddle/phi/api/include/sparse_api.h paddle/phi/api/lib/api.cc paddle/phi/api/lib/dygraph_api.* paddle/phi/api/lib/backward_api.cc paddle/phi/api/lib/sparse_api.cc +paddle/phi/api/lib/sparse_bw_api.cc paddle/phi/extension.h paddle/phi/include/* paddle/phi/infermeta/generated.* @@ -54,6 +56,7 @@ paddle/infrt/dialect/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json +tools/infrt/kernel_signature.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index a7a9e85ffd7314..9f6fd32ad986c4 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -100,8 +100,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") - add_public_tablegen_target(${td_base}_IncGen) - add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) + add_public_tablegen_target(MLIR${td_base}IncGen) + add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() # Execute the mlir script with infrt-exec program. diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 45a76fdc1f1a2a..cfbe68eecbaca5 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7affd59de162d5..1291e60cfe4ce1 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -293,11 +293,11 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op") - - if ("${TARGET}" STREQUAL "${manual_pybind_op}") - set(pybind_flag 1) - endif() - endforeach() + + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. @@ -478,7 +478,7 @@ function(op_library TARGET) if (${pybind_flag} EQUAL 0) # NOTE(*): activation use macro to regist the kernels, set use_op manually. 
if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP(relu);\n") + file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") elseif(${TARGET} STREQUAL "fake_quantize") diff --git a/cmake/phi.cmake b/cmake/phi.cmake index f6e15758379ada..ebb686d8ad0f31 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -134,8 +134,8 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) - list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) endif() endif() if (WITH_XPU) @@ -197,92 +197,88 @@ function(kernel_library TARGET) # kernel source file level # level 1: base device kernel - # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs # level 2: device-independent kernel # - common_srcs # level 3: Kernel implemented by reusing device-independent kernel # - selected_rows_srcs + set(base_device_kernels) + set(device_independent_kernel) + set(high_level_kernels) - # Build Target according different src organization - if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND - (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) - # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. + # 1. Base device kernel compile + if (${cpu_srcs_len} GREATER 0) + cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_cpu) + endif() + if (${gpu_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If there are only specific device srcs, build target using this rule. 
- elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpu) + endif() + if (${xpu_srcs_len} GREATER 0) + cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_xpu) + endif() + if (${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If the selected_rows_srcs depends on common_srcs, build target using this rule. - elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpudnn) + endif() + if (${kps_srcs_len} GREATER 0) + # only when WITH_XPU_KP, the kps_srcs_len can be > 0 + xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_kps) + endif() + + # 2. Device-independent kernel compile + if (${common_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) else() - cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) endif() - # If there are only common_srcs or selected_rows_srcs, build target using below rules. - elseif (${common_srcs_len} GREATER 0) + list(APPEND device_independent_kernel ${TARGET}_common) + endif() + + # 3. 
Reusing kernel compile + if (${selected_rows_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) endif() - elseif (${selected_rows_srcs_len} GREATER 0) + list(APPEND high_level_kernels ${TARGET}_sr) + endif() + + # 4. Unify target compile + list(LENGTH base_device_kernels base_device_kernels_len) + list(LENGTH device_independent_kernel device_independent_kernel_len) + list(LENGTH high_level_kernels high_level_kernels_len) + if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR + ${high_level_kernels_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) else() - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) endif() else() set(target_build_flag 0) diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 96bc4a710f8c1c..f88c993d85e2fa 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -7,3 +7,6 @@ cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() +if(WITH_ASCEND_CL) + cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) +endif() diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h new file mode 100644 index 00000000000000..09789bd4d37863 --- /dev/null +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -0,0 +1,174 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "boost/variant.hpp" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/enforce_npu.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class NPUEventManager { + public: + NPUEventManager() = default; + + ~NPUEventManager() { + if (is_created_) { + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventDestroy(event_); + } + } + + NPUEventManager(const NPUEventManager&) = delete; + NPUEventManager& operator=(const NPUEventManager&) = delete; + + NPUEventManager(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + } + + NPUEventManager& operator=(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + return *this; + } + + bool IsCreated() const { return is_created_; } + bool DeviceId() const { return device_index_; } + aclrtEvent GetRawNPUEvent() const { return event_; } + + void Record(const paddle::platform::NPUDeviceContext& ctx) { + auto device_index = ctx.GetPlace().device; + if (!is_created_) { + CreateEvent(device_index); + } + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "NPUDeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventRecord(event_, ctx.stream()); + } + + bool Query() const { + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + platform::NPUEventQuery(event_, &status); + if (status == ACL_EVENT_STATUS_COMPLETE) { + return true; + } + return false; + } + + void Block(const paddle::platform::NPUDeviceContext& ctx) const { + if (is_created_) { + auto device_index = ctx.GetPlace().device; + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + platform::NPUDeviceGuard guard(device_index_); + platform::NPUStreamWaitEvent(ctx.stream(), event_); + } + } + + private: + bool is_created_{false}; + aclrtEvent event_{}; + int8_t device_index_{0}; + + private: + void CreateEvent(int device_index) { + device_index_ = device_index; + platform::NPUDeviceGuard guard(device_index); + platform::NPUEventCreate(&event_); + is_created_ = true; + } +}; + +class HCCLCommManager { + public: + explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {} + + HCCLCommManager() : HCCLCommManager(nullptr) {} + + ~HCCLCommManager() noexcept { + 
std::unique_lock<std::mutex> lock(mutex_);
+    if (hccl_comm_) {
+      platform::dynload::HcclCommDestroy(hccl_comm_);
+    }
+  }
+
+  static std::shared_ptr<HCCLCommManager> Create(int num_ranks, int rank,
+                                                 HcclRootInfo* comm_id,
+                                                 HcclComm hccl_comm) {
+    auto hccl_manager = std::make_shared<HCCLCommManager>();
+    auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank,
+                                                       &hccl_comm);
+    using __NPU_STATUS_TYPE__ = decltype(ret);
+    constexpr auto __success_type__ =
+        platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
+    if (UNLIKELY(ret != __success_type__)) {
+      VLOG(0) << "Error: create hccl_id error.";
+      exit(-1);
+    }
+
+    hccl_manager->hccl_id_ = comm_id;
+    hccl_manager->rank_ = rank;
+    hccl_manager->hccl_comm_ = hccl_comm;
+    return hccl_manager;
+  }
+
+  HcclRootInfo* GetHcclId() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_id_;
+  }
+
+  HcclComm GetHcclComm() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_comm_;
+  }
+
+  HCCLCommManager(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(HCCLCommManager&& other) = delete;
+
+  HCCLCommManager(HCCLCommManager&& other) {
+    std::unique_lock<std::mutex> lock(other.mutex_);
+    std::swap(hccl_comm_, other.hccl_comm_);
+  }
+
+ protected:
+  HcclComm hccl_comm_;
+  HcclRootInfo* hccl_id_;
+  int rank_;
+  mutable std::mutex mutex_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
new file mode 100644
index 00000000000000..84f5ca48d25c84
--- /dev/null
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
@@ -0,0 +1,356 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/common/place.h"
+
+DECLARE_bool(hccl_blocking_wait);
+// DECLARE_bool(use_stream_safe_npu_allocator);
+
+constexpr int64_t kWaitBlockTImeout = 10;
+
+namespace paddle {
+namespace distributed {
+
+static HcclReduceOp ToHCCLRedType(ReduceOp reduction) {
+  static const std::map<ReduceOp, HcclReduceOp> red_type = {
+      {ReduceOp::MIN, HCCL_REDUCE_MIN},
+      {ReduceOp::MAX, HCCL_REDUCE_MAX},
+      {ReduceOp::SUM, HCCL_REDUCE_SUM},
+      {ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
+  };
+  auto it = red_type.find(reduction);
+  PADDLE_ENFORCE_EQ(
+      it != red_type.end(), true,
+      platform::errors::InvalidArgument("Invalid hccl reduction. "
+                                        "Must be Min | Max | Prod | Sum"));
+  return it->second;
+}
+
+std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) {
+  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&hcclID);
+  std::ostringstream oss;
+  for (size_t i = 0; i < sizeof(hcclID); ++i) {
+    oss << std::hex << static_cast<int>(bytes[i]);
+  }
+  return oss.str();
+}
+
+// Get the list of devices from list of tensors
+std::vector<Place> GetPlaceList(const std::vector<Tensor>& tensors) {
+  std::vector<Place> places;
+  places.reserve(tensors.size());
+  for (auto& tensor : tensors) {
+    places.push_back(tensor.inner_place());
+  }
+  return places;
+}
+
+// Get the deviceList String from the list of devices
+std::string GetKeyFromPlaces(const std::vector<Place>& places) {
+  std::string placeList;
+  for (auto& place : places) {
+    std::stringstream tmp;
+    tmp << place;
+    if (placeList.empty()) {
+      placeList += tmp.str();
+    } else {
+      placeList += "," + tmp.str();
+    }
+  }
+  return placeList;
+}
+
+// bool CheckTensorsInNPUPlace(const std::vector<Tensor>& tensors) {
+//   return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
+//     return t.place() == platform::DeviceType::NPU;
+//   });
+// }
+
+void SyncDefaultStream(
+    const std::vector<Place>& places,
+    std::vector<NPUEventManager>& hcclEvents,                   // NOLINT
+    std::vector<std::unique_ptr<NPUDeviceContext>>& dev_ctx) {  // NOLINT
+  for (size_t i = 0; i < places.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places[i]));
+    hcclEvents[i].Record(*dev_ctx[i]);
+    hcclEvents[i].Block(*default_ctx);
+  }
+}
+
+std::shared_ptr<ProcessGroupHCCL::HCCLTask> ProcessGroupHCCL::CreateTask(
+    std::vector<Place> places, int rank, CommType comm_type,
+    const std::vector<Tensor>& inputs) {
+  return std::make_shared<ProcessGroupHCCL::HCCLTask>(places, rank, comm_type,
+                                                      inputs);
+}
+
+ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector<Place>& places, int rank,
+                                     CommType CommType,
+                                     const std::vector<Tensor>& inputs)
+    : Task(rank, inputs, CommType), places_(places) {
+  control_events_.resize(places.size());
+  hcclComms_.resize(places.size());
+}
+
+ProcessGroupHCCL::HCCLTask::~HCCLTask() {}
+
+void ProcessGroupHCCL::HCCLTask::SetOutputs(
+    std::vector<Tensor>& outputs) {  // NOLINT
+  outputs_ = std::make_shared<std::vector<Tensor>>(outputs);
+}
+
+void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places_[i]));
+    platform::NPUStreamWaitEvent(default_ctx->stream(),
+                                 control_events_[i].GetRawNPUEvent());
+  }
+}
+
+bool ProcessGroupHCCL::HCCLTask::IsCompleted() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    if (!control_events_[i].Query()) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// TODO(sandyhouse): Add timeout for wait, now timeout unused
+bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
+  SynchronizeStreams();
+  if (FLAGS_hccl_blocking_wait) {
+    // NOTE(sandyhouse): It will block host for sync
+    while (!IsCompleted()) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
+    }
+  }
+  return true;
+}
+
+// Same as Wait
+void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
+
+ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
+                                   int rank, int size)
+    : ProcessGroup(rank, size), store_(store) {}
+
+void ProcessGroupHCCL::BroadcastUniqueHCCLID(
+    std::vector<HcclRootInfo>& hccl_ids) {  // NOLINT
+  if (rank_ == 0) {
+    for (size_t i = 0; i < hccl_ids.size(); i++) {
+      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
+      auto hccl_id = std::vector<uint8_t>(
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]),
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]) + sizeof(HcclRootInfo)); +
store_->set(key, hccl_id); + } + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&hccl_ids[i], ret.data(), ret.size()); + } + } +} + +// create HCCLManager cache for places_key +void ProcessGroupHCCL::CreateHCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the HCCL Communicator since " + "the NPU place are not known")); + + std::vector> hccl_comms; + hccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector hccl_ids; + hccl_ids.resize(1); + auto& hccl_id = hccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id)); + } + BroadcastUniqueHCCLID(hccl_ids); + + VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key + << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + std::unique_ptr comms(new HcclComm[places.size()]); + for (size_t i = 0; i < places.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id, + comms.get() + i); + dev_ctx[i].reset(new NPUDeviceContext(places[i])); + } + + std::vector events; + events.resize(places.size()); + + // These caches will be useful to process sync/wait/communicate + places_to_events_.emplace(places_key, std::move(events)); + places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupHCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < inputs.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(inputs[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream); + } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +template +std::shared_ptr ProcessGroupHCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + 
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < tensors.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(tensors[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank); + } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupHCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // NPUPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupHCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // CudaPlace.")); + + return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclBroadcast( + input_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h new file mode 100644 index 00000000000000..f2376b4eed7600 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -0,0 +1,152 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/device/npu/npu_stream.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/distributed/collective/HCCLTools.h" +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +constexpr const char* HCCL_BACKEND_NAME = "HCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using NPUStream = platform::stream::NPUStream; +using NPUDeviceContext = paddle::platform::NPUDeviceContext; + +class ProcessGroupHCCL : public ProcessGroup { + public: + class HCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + HCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~HCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> hcclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(HCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr Send(std::vector& tensors, + int dst_rank) override; + + std::shared_ptr Recv(std::vector& tensors, + int src_rank) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in, std::vector& out) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + std::shared_ptr store_; + std::shared_ptr hccl_comm_; + std::mutex mutex_; + std::unordered_map>> + places_to_hcclcomm_; + + std::unordered_map> + places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + std::set used_place_ids_; + + private: + void BcastHCCLId(std::vector& hccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueHCCLID(std::vector& hccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + + void CreateHCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 88d8fb69eb6980..67715f410d443c 100644 --- 
a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -156,36 +156,27 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { // Same as Wait void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } -ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy, +ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size) - : ProcessGroup(rank, size), strategy_(strategy) {} - -void ProcessGroupNCCL::BcastNCCLId( - std::vector& nccl_ids, // NOLINT - int root, int server_fd) { - if (strategy_.local_rank_ == root) { - std::vector other_trainers; - for (auto& ep : strategy_.trainer_endpoints_) { - if (ep != strategy_.current_endpoint_) { - other_trainers.push_back(ep); - } - } - platform::SendBroadCastCommID(other_trainers, &nccl_ids); - } else { - platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, - &nccl_ids); - } -} + : ProcessGroup(rank, size), store_(store) {} void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT - - int server_fd = -1; - if (rank_ != 0) { - server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) - .socket(); + if (rank_ == 0) { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto nccl_id = std::vector( + reinterpret_cast(&nccl_ids[i]), + reinterpret_cast(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES); + store_->set(key, nccl_id); + } + } else { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&nccl_ids[i], ret.data(), ret.size()); + } } - BcastNCCLId(nccl_ids, 0, server_fd); } // create NCCLManager cache for places_key @@ -213,8 +204,8 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( } BroadcastUniqueNCCLID(nccl_ids); - VLOG(3) << "init nccl rank: " << strategy_.local_rank_ - << ", nranks: " << strategy_.nranks_ << ", place: " << places_key + VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); std::vector> dev_ctx; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index d63a5e768382c6..aa2a2b8fa2088c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" @@ -75,7 +76,7 @@ class ProcessGroupNCCL : public ProcessGroup { private: }; - ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size); + ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size); const std::string GetBackendName() const override { return std::string(NCCL_BACKEND_NAME); @@ -118,7 +119,7 @@ class ProcessGroupNCCL : public ProcessGroup { const std::vector& inputs); protected: - ProcessGroupStrategy strategy_; + std::shared_ptr store_; std::shared_ptr nccl_comm_; std::mutex mutex_; std::unordered_map>> diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h index 2581a74d7e8187..7b4ae7e70ff6f0 100644 --- 
a/paddle/fluid/distributed/store/store.h +++ b/paddle/fluid/distributed/store/store.h @@ -25,15 +25,26 @@ namespace distributed { class Store { public: - Store() = delete; + Store() : _timeout(tcputils::kNoTimeout) {} explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {} virtual ~Store() = default; - virtual int64_t add(const std::string& key, int64_t value) = 0; - virtual std::vector get(const std::string& key) = 0; - virtual void wait(const std::string& key) = 0; - virtual void set(const std::string& key, - const std::vector& value) = 0; + virtual int64_t add(const std::string& key, int64_t value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual std::vector get(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void wait(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void set(const std::string& key, const std::vector& value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } virtual const std::chrono::seconds& timeout() const { return _timeout; } diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 8cb69caf663696..698a698fc6d184 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,4 @@ -set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) @@ -10,11 +10,11 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) -cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api) +cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api) -cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor) +cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 2fc846cccc22e8..dc79a8a45a2467 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -47,6 +47,9 @@ std::unordered_map> static std::unordered_map operators_with_attrs = {}; +/* --- Black Ops list that's NO NEED to apply code generation --- */ +static std::unordered_set black_ops_list = {"run_program"}; + static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; std::replace(ret.begin(), ret.end(), '-', 
'_'); // replace all '-' to '_' @@ -73,12 +76,6 @@ static bool IgnoreGradAttribute(const std::string& op_type, } static void PrepareAttrMapForOps() { - // Handle "run_program_op" - static framework::ProgramDesc fake_prog; - operators_with_attrs["run_program"] = {}; - operators_with_attrs["run_program"]["global_block"] = - fake_prog.MutableBlock(0); - // Handle "fused_elemwise_add_activation" std::vector functor_list = {"a", "b"}; operators_with_attrs["fused_elemwise_add_activation"] = {}; @@ -2349,6 +2346,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) { if (!CheckOpProto(op_proto)) continue; const std::string& op_type = op_proto->type(); + if (black_ops_list.count(op_type)) { + continue; + } /* ----------------------------- */ /* ---- Collect Information ---- */ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index c6bca01205e19c..53af6c1048d245 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml") -set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml") +set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml") +set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml") set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc") set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d1e208541537c8..4f6f437163a8dc 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -23,12 +23,13 @@ core_ops_args_info = {} core_ops_args_type_info = {} +namespace = "" yaml_types_mapping = { - 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ + 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ - 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', + 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', 'Tensor[Tensor[]]' : 'std::vector>', @@ -125,6 +126,7 @@ def GetAutoGradMetaVectorName(string): def ReadFwdFile(filepath): f = open(filepath, 'r') contents = yaml.load(f, Loader=yaml.FullLoader) + f.close() return contents @@ -133,9 +135,13 @@ def ReadBwdFile(filepath): contents = yaml.load(f, Loader=yaml.FullLoader) ret = {} for content in contents: - assert 'backward_api' in content.keys() - api_name = content['backward_api'] + if 'backward_api' in content.keys(): + api_name = 
content['backward_api'] + else: + assert False + ret[api_name] = content + f.close() return ret @@ -608,16 +614,23 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) + + if len(namespace) > 0: + grad_api_namespace = f"paddle::experimental::{namespace}" + else: + grad_api_namespace = f"paddle::experimental" + FUNCTION_TEMPLATE = """ std::vector> {}::operator()(const std::vector>& grads) {{ // Call grad_api function - auto grad_api_returns = paddle::experimental::{}({}); + auto grad_api_returns = {}::{}({}); {} }} """ node_definition_str = FUNCTION_TEMPLATE.format( - grad_node_name, bwd_api_name, grad_api_args_str, returns_str) + grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str, + returns_str) return node_definition_str @@ -671,7 +684,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - outputs_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -850,7 +863,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, function_name = fwd_api_name else: function_name = fwd_api_name + "_intermediate" - forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" + + if len(namespace) > 0: + forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});" + else: + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs num_outputs = len(forward_outputs_position_map.keys()) - len( @@ -1000,7 +1017,9 @@ def GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/phi/api/include/sparse_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: @@ -1024,6 +1043,7 @@ def GenerateForwardCCFile(filepath, forward_definition_str): #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" """ @@ -1042,6 +1062,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): #include "paddle/phi/api/all.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/eager/to_static/run_program_op_func.h" """ file_contents += GenerateCoreOpInfoDeclaration() @@ -1053,134 +1074,184 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - backward_yaml_path = args.backward_yaml_path - - fwd_api_list = ReadFwdFile(api_yaml_path) - grad_api_dict = ReadBwdFile(backward_yaml_path) + api_yaml_paths = args.api_yaml_path.split(",") + backward_yaml_paths = 
args.backward_yaml_path.split(",") # Generate per Dygraph API node_declaration_str = "" node_definition_str = "" forward_definition_str = "" forward_declaration_str = "" - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - no_need_buffer_set = set() - if 'no_need_buffer' in fwd_api.keys(): - no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - bwd_api_name = fwd_api['backward'] - assert bwd_api_name in grad_api_dict.keys() - bwd_api = grad_api_dict[bwd_api_name] - - assert 'args' in bwd_api.keys() - assert 'output' in bwd_api.keys() - assert 'forward' in bwd_api.keys() - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - bwd_forward_str = bwd_api['forward'] - bwd_args_str = bwd_api['args'] - bwd_returns_str = bwd_api['output'] - - # Collect Forward Inputs/Outputs - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( - bwd_forward_str) - print("Parsed Forward Inputs List: ", forward_inputs_list) - print("Prased Forward Attrs List: ", forward_attrs_list) - print("Parsed Forward Returns List: ", forward_returns_list) - - intermediate_outputs = [] - if 'intermediate' in fwd_api.keys(): - intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) - - IntermediateValidationCheck(intermediate_outputs, forward_returns_list) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list) - print("Prased Original Forward Attrs List: ", orig_forward_attrs_list) - print("Parsed Original Forward Returns List: ", - orig_forward_returns_list) - - # Forward Validation Checks - ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, - forward_returns_list, orig_forward_inputs_list, - orig_forward_attrs_list, - orig_forward_returns_list) - - # Parse Backward Inputs/Outputs - backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( - bwd_args_str, bwd_returns_str) - print("Parsed Backward Inputs List: ", backward_inputs_list) - print("Prased Backward Attrs List: ", backward_attrs_list) - print("Parsed Backward Returns List: ", backward_returns_list) - - # Determine Forward Inputs/Outputs Position - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - # SlotName Matching - backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( - backward_inputs_list, backward_returns_list, - forward_inputs_position_map, forward_outputs_position_map) - print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) - print("Generated Backward Grad Input Map: ", backward_grad_input_map) - print("Generated Backward Grad Output Map: ", backward_grad_output_map) - - # Backward Validation Check - 
BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map, - backward_attrs_list) - - # Node Declaration Generation - node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list, - no_need_buffer_set) - print("Generated Node Declaration: ", node_declaration_str) - - node_definition_str += GenerateNodeDefinition( - fwd_api_name, bwd_api_name, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list) - print("Generated Node Definition: ", node_definition_str) - - # Node Definition Generation - definition_declaration_pair = GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs) - print("Generated Forward Definition: ", forward_definition_str) - print("Generated Forward Declaration: ", forward_declaration_str) - forward_definition_str += definition_declaration_pair[0] - forward_declaration_str += definition_declaration_pair[1] - - # For python-level API dispatch - CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, - forward_attrs_list) + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + backward_yaml_path = backward_yaml_paths[i] + + if "sparse" in api_yaml_path: + assert "sparse" in backward_yaml_path + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + grad_api_dict = ReadBwdFile(backward_yaml_path) + + yaml_forward_definition_str = "" + yaml_forward_declaration_str = "" + yaml_node_declaration_str = "" + yaml_node_definition_str = "" + for fwd_api in fwd_api_list: + # We only generate Ops with grad + if 'backward' not in fwd_api.keys(): + continue + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + assert 'backward' in fwd_api.keys() + + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api[ + 'no_need_buffer']) + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + bwd_api_name = fwd_api['backward'] + assert bwd_api_name in grad_api_dict.keys() + bwd_api = grad_api_dict[bwd_api_name] + + assert 'args' in bwd_api.keys() + assert 'output' in bwd_api.keys() + assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + bwd_forward_str = bwd_api['forward'] + bwd_args_str = bwd_api['args'] + bwd_returns_str = bwd_api['output'] + + # Collect Forward Inputs/Outputs + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( + bwd_forward_str) + print("Parsed Forward Inputs List: ", forward_inputs_list) + print("Prased Forward Attrs List: ", forward_attrs_list) + print("Parsed Forward Returns List: ", forward_returns_list) + + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api[ + 'intermediate']) + + IntermediateValidationCheck(intermediate_outputs, + forward_returns_list) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( + fwd_args_str, 
fwd_returns_str) + print("Parsed Original Forward Inputs List: ", + orig_forward_inputs_list) + print("Prased Original Forward Attrs List: ", + orig_forward_attrs_list) + print("Parsed Original Forward Returns List: ", + orig_forward_returns_list) + + # Forward Validation Checks + ForwardsValidationCheck( + forward_inputs_list, forward_attrs_list, forward_returns_list, + orig_forward_inputs_list, orig_forward_attrs_list, + orig_forward_returns_list) + + # Parse Backward Inputs/Outputs + backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( + bwd_args_str, bwd_returns_str) + print("Parsed Backward Inputs List: ", backward_inputs_list) + print("Prased Backward Attrs List: ", backward_attrs_list) + print("Parsed Backward Returns List: ", backward_returns_list) + + # Determine Forward Inputs/Outputs Position + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + # SlotName Matching + backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( + backward_inputs_list, backward_returns_list, + forward_inputs_position_map, forward_outputs_position_map) + print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) + print("Generated Backward Grad Input Map: ", + backward_grad_input_map) + print("Generated Backward Grad Output Map: ", + backward_grad_output_map) + + # Backward Validation Check + BackwardValidationCheck(backward_fwd_input_map, + backward_grad_input_map, + backward_attrs_list) + + # Node Declaration Generation + yaml_node_declaration_str += GenerateNodeDeclaration( + fwd_api_name, backward_fwd_input_map, backward_attrs_list, + no_need_buffer_set) + print("Generated Node Declaration: ", node_declaration_str) + + yaml_node_definition_str += GenerateNodeDefinition( + fwd_api_name, bwd_api_name, backward_fwd_input_map, + backward_grad_input_map, backward_grad_output_map, + backward_attrs_list) + print("Generated Node Definition: ", node_definition_str) + + # Node Definition Generation + definition_declaration_pair = GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) + print("Generated Forward Definition: ", forward_definition_str) + print("Generated Forward Declaration: ", forward_declaration_str) + yaml_forward_definition_str += definition_declaration_pair[0] + yaml_forward_declaration_str += definition_declaration_pair[1] + + # For python-level API dispatch + CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, + forward_attrs_list) + + if len(namespace) > 0: + forward_definition_str += f"""namespace {namespace} {{ + {yaml_forward_definition_str} +}} +""" + + forward_declaration_str += f"""namespace {namespace} {{ + {yaml_forward_declaration_str} +}} +""" + + node_declaration_str += f"""namespace {namespace} {{ + {yaml_node_declaration_str} +}} +""" + + node_definition_str += f"""namespace {namespace} {{ + {yaml_node_definition_str} +}} +""" + + else: + forward_definition_str += yaml_forward_definition_str + forward_declaration_str += yaml_forward_declaration_str + node_declaration_str += 
yaml_node_declaration_str + node_definition_str += yaml_node_definition_str # Generate Files nodes_h_path = args.nodes_h_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index d0506e45eb476c..abf3f86bdb03b8 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,7 +14,7 @@ import os import argparse -from eager_gen import yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap skipped_fwd_api_names = set(["scale"]) @@ -126,16 +126,20 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, }} """ + namespace_str = "" + if len(namespace) > 0: + namespace_str = f"{namespace}::" + if is_forward_only: - fwd_function_name = fwd_api_name + fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name else: - fwd_function_name = GetForwardFunctionName(fwd_api_name) + fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, fwd_function_name, dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" + python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" return python_c_function_str, python_c_function_reg_str @@ -189,7 +193,7 @@ def GenerateCoreOpsInfoMap(): """ core_ops_infos_registry = """ - ,{\"get_final_state_core_ops_args_info\", + {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", @@ -222,6 +226,7 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" @@ -254,57 +259,80 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - - # We only generate Ops with grad - is_forward_only = False - if 'backward' not in fwd_api.keys(): - is_forward_only = True - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - if 
fwd_api_name in skipped_fwd_api_names: - continue - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs, is_forward_only) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - python_c_functions_str = "\n".join(python_c_function_list) - python_c_functions_reg_str = ",\n".join(python_c_function_reg_list) + api_yaml_paths = args.api_yaml_path.split(",") + + python_c_functions_reg_str = "" + python_c_functions_str = "" + + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + + if "sparse" in api_yaml_path: + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + + python_c_function_list = [] + python_c_function_reg_list = [] + for fwd_api in fwd_api_list: + + # We only generate Ops with grad + is_forward_only = False + if 'backward' not in fwd_api.keys(): + is_forward_only = True + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + if fwd_api_name in skipped_fwd_api_names: + continue + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( + fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", forward_inputs_list) + print("Prased Original Forward Attrs List: ", forward_attrs_list) + print("Parsed Original Forward Returns List: ", + forward_returns_list) + + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( + fwd_api_name, forward_inputs_position_map, forward_attrs_list, + forward_outputs_position_map, optional_inputs, is_forward_only) + python_c_function_list.append(python_c_function_str) + python_c_function_reg_list.append(python_c_function_reg_str) + print("Generated Python-C Function: ", python_c_function_str) + + # Append Namespace + python_c_functions_reg_str += 
",\n".join( + python_c_function_reg_list) + "," + python_c_functions = "\n".join(python_c_function_list) + if len(namespace) > 0: + python_c_functions_str += f"""namespace {namespace} {{ + {python_c_functions} +}} +""" + + else: + python_c_functions_str += python_c_functions python_c_str = GeneratePythonCWrappers(python_c_functions_str, python_c_functions_reg_str) diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h new file mode 100644 index 00000000000000..6f8bccd64e45f0 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/utils.h" + +inline void run_program_dygraph_function( + const std::vector& x, + const std::vector& params, + std::vector& out, // NOLINT + std::vector& step_scope, // NOLINT + std::vector& dout, // NOLINT + const paddle::framework::AttributeMap& attrs) { + VLOG(2) << "start run run_program"; + // Call forward function + RunProgramAPI(x, params, out, step_scope, dout, attrs); + VLOG(2) << "start run run_program grad"; + + // Prepare Autograd Meta + auto deref_out = details::DereferenceTensors(out); + std::vector p_autograd_x = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector p_autograd_params = + egr::EagerUtils::nullable_autograd_meta(params); + std::vector p_autograd_outs = + egr::EagerUtils::nullable_autograd_meta(deref_out); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, &p_autograd_x, &p_autograd_params); + + if (require_any_grad) { + std::vector out_names; + for (auto& t : deref_out) { + out_names.emplace_back(t.name()); + } + + egr::EagerUtils::PassStopGradient(false, &p_autograd_outs); + // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad]) + auto grad_node = std::make_shared(1, 2); + + grad_node->SetFwdOutNames(out_names); + // Set Attributes + grad_node->SetAttrMap(attrs); + // Set TensorWrappers + grad_node->SetFwdX(x); + grad_node->SetFwdParams(params); + grad_node->SetStepScope(step_scope); + + // Set Grad out rank as same as fwd input and set stop gradient to bwd + grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); + grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + + grad_node->SetGradInMeta(&p_autograd_outs, 0); + // Set Next Edges + grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); + grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); + + egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); + + // Set History for output set current Grad Node for + egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); + egr::EagerUtils::CheckAndRetainGrad(deref_out); + } +} diff 
--git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h new file mode 100644 index 00000000000000..ae5d86664a346f --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -0,0 +1,468 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" + +#include "paddle/fluid/operators/run_program_op.h" +#include "paddle/fluid/platform/enforce.h" + +namespace details { +using Tensor = paddle::experimental::Tensor; + +static std::vector DereferenceTensors( + const std::vector &tensor_ptr) { + std::vector res; + for (auto *t : tensor_ptr) { + res.emplace_back(*t); + } + return res; +} + +static std::vector GetTensorsName(const std::vector &ins) { + std::vector in_names; + for (auto &in_t : ins) { + in_names.emplace_back(in_t.name()); + } + return in_names; +} + +static std::vector GetTensorsName( + const std::vector &ins) { + std::vector in_names; + for (auto *in_t : ins) { + in_names.emplace_back(in_t->name()); + } + return in_names; +} + +static void CheckInputVarStatus(const Tensor &tensor) { + PADDLE_ENFORCE_EQ( + tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true, + paddle::platform::errors::InvalidArgument( + "The input tensor %s of " + "RunProgram(Grad)Op holds " + "wrong type. Expect type is DenseTensor.", + tensor.name())); + + PADDLE_ENFORCE_EQ(tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in input tensor %s of " + "RunProgram(Grad)Op " + "is not initialized.", + tensor.name())); +} + +static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, + const Tensor &dst_tensor) { + auto name = dst_tensor.name(); + PADDLE_ENFORCE_EQ(dst_tensor.defined(), true, + paddle::platform::errors::InvalidArgument( + "dst_tensor shall be defined.")); + + if (phi::DenseTensor::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensor %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. Expect type is DenseTensor", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's internal " + "scope is not initialized.", + name)); + } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensodfr %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. 
Expect type is SelectedRows", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + name)); + + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The RunProgram(Grad)Op only support output " + "variable of type LoDTensor or SelectedRows", + name)); + } +} + +static void ShareTensorsIntoScope(const std::vector &tensors, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + auto name = tensors[i].name(); + if (name == "Fake_var" || !tensors[i].is_initialized()) { + continue; + } + auto *var = scope->Var(name); + CheckInputVarStatus(tensors[i]); + // share tensor + auto tensor_base = tensors[i].impl(); + if (phi::DenseTensor::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } + } +} + +static void ShareTensorsFromScope( + const std::vector &tensors, + const paddle::framework::BlockDesc &global_block, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't find them in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. + auto &name = tensors[i]->name(); + if (name == paddle::framework::kEmptyVarName || name == "Fake_var" || + !global_block.HasVar(name)) { + VLOG(2) << "find tensor name is " << name << ", skip it!"; + continue; + } + // NOTE: Here skip not found var is dangerous, if a bug is caused here, + // the result is grad calculation error, which will be very hidden! + auto *var = scope->FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::NotFound( + "The output tensor %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + name)); + CheckOutputVarStatus(*var, *tensors[i]); + // share tensor + // TODO(dev): Determine Tensor type by scope.var + // auto tensor_base = tensors[i]->impl(); + // if (phi::DenseTensor::classof(tensor_base.get())) { + if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + VLOG(2) << "share " << name << " from scope"; + *dst_tensor = src_tensor; + } else if (var->IsType()) { + // } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + *dst_tensor = src_tensor; + } + } +} + +} // namespace details + +inline void RunProgramAPI( + const std::vector &x, + const std::vector ¶ms, + std::vector &out, // NOLINT + std::vector &step_scope, // NOLINT + std::vector &dout, // NOLINT + const paddle::framework::AttributeMap &attrs) { + VLOG(2) << "RunProgramOpKernel Compute"; + auto start_op_index = BOOST_GET_CONST(int64_t, attrs.at("start_op_index")); + auto end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + auto is_test = BOOST_GET_CONST(bool, attrs.at("is_test")); + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + + // NOTE(chenweihang): In order not to add new variable type, use vector + // here. 
Originally, here can use scope directly. + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + // Step 2. prepare executor and init persistable variables + + // NOTE(Aurelius84): While training some models, forward can be called many + // times and then apply backpropagation all at once, such as Reinforcement + // Learning. Tensor data in multi-step training should be saved into single + // scope separately. Otherwise, the gradients can be miscalculated because + // always using the Tensor data of the last step in forward. + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + VLOG(2) << "The number of sub scopes before forward: " + << out_scope_vec->front()->kids().size(); + paddle::framework::Scope &scope = global_inner_scope->NewScope(); + + // share input_vars & parameters into scope + details::ShareTensorsIntoScope(x, &scope); + details::ShareTensorsIntoScope(params, &scope); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto input_names = details::GetTensorsName(x); + auto output_names = details::GetTensorsName(out); + auto dout_names = details::GetTensorsName(dout); + auto *program = global_block->Program(); + + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad=*/false, program_id, &scope); + auto ¶llel_executor = cache_info.first; + // all out_vars are skip_eager_var + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, false); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_names.begin(), output_names.end()); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + dout_names.begin(), dout_names.end()); + paddle::framework::details::ParseSafeEagerDeletionSkipVars( + *program, end_op_index, output_names, &skip_eager_delete_vars); + } + + // Step 3. run ops + parallel_executor->RunWithoutFetch(skip_eager_delete_vars); + } + // Step 4. Get Output + details::ShareTensorsFromScope(out, *global_block, &scope); + details::ShareTensorsFromScope(dout, *global_block, &scope); + + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + // Step 5. Drop all children scopes while testing. + if (is_test) { + out_scope_vec->front()->DropKids(); + } + VLOG(2) << "The number of sub scopes after forward: " + << out_scope_vec->front()->kids().size(); + // #ifdef PADDLE_WITH_MKLDNN + // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); + // #endif +} + +inline void RunProgramGradAPI( + const std::vector &x, + const std::vector ¶ms, + const std::vector &out_grad, + const std::vector &step_scope, // NOLINT + const paddle::framework::AttributeMap &attrs, + std::vector &x_grad, // NOLINT + std::vector ¶ms_grad // NOLINT + ) { + // if all output vars are set to stop_gradient, grad op no need to executed + if (x_grad.empty() && params_grad.empty()) return; + + // TODO(dev): Remove this line hard code. And need to deal with the out_grad + // name problem. 
+ // const_cast(out_grad[0]) + // .set_name("matmul_v2_0.tmp_0@GRAD"); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + // NOTE: skip `shape` and `fill_constant` op created by + // fluid.backward.gradients, one forward output will generate one `shape` + // and `fill_constant` + int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2); + int64_t end_op_index = global_block->OpSize(); + + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + auto sub_scope_num = global_inner_scope->kids().size(); + VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; + PADDLE_ENFORCE_GT(sub_scope_num, 0, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should hold at " + "least one sub scope.")); + + auto &scope = *(global_inner_scope->kids().front()); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto out_grad_names = details::GetTensorsName(out_grad); + // NOTE: after PR22939 [Add double grad] merged, the grad op maker's + // SetOutput will set to None if the input var stop_gradient=True, + // it will cause an NotFound error when ctx.OutputNames() is called + std::vector x_grad_names; + std::vector param_grad_names; + if (!x_grad.empty()) { + x_grad_names = details::GetTensorsName(x_grad); + } + if (!params_grad.empty()) { + param_grad_names = details::GetTensorsName(params_grad); + } + + // Step 2. prepare executor and scope + auto *program = global_block->Program(); + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad*/ true, program_id, &scope); + auto ¶llel_executor = cache_info.first; + + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, true); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names); + + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + x_grad_names.begin(), x_grad_names.end()); + paddle::framework::details::AppendSkipDeletionVars( + param_grad_names, &skip_eager_delete_vars); + } + + details::ShareTensorsIntoScope(out_grad, &scope); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + + // Step 3. run ops + parallel_executor->RunWithoutFetch( + /*skip_eager_delete_vars=*/skip_eager_delete_vars); + } + + // Step 4. get outputs + details::ShareTensorsFromScope(x_grad, *global_block, &scope); + details::ShareTensorsFromScope(params_grad, *global_block, &scope); + + // Step5. 
drop current scope + // global_inner_scope->DeleteScope(&scope); + VLOG(2) << "The number of sub scopes after backward: " + << global_inner_scope->kids().size(); +} + +class GradNodeRunProgram : public egr::GradNodeBase { + public: + GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + + ~GradNodeRunProgram() override = default; + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector> &grads) + override { + VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; + PADDLE_ENFORCE_EQ( + grads.size(), 1, + paddle::platform::errors::InvalidArgument( + "The out_grads.size() of RunProgramGradOp should be equal to 1.")); + + VLOG(3) << "out_grads[0].size() : " << grads[0].size(); + std::vector x_grad; + std::vector params_grad; + ConstructGradTensors(x_, &x_grad); + ConstructGradTensors(params_, ¶ms_grad); + std::vector x_grad_ptr; + std::vector params_grad_ptr; + for (auto &i : x_grad) { + x_grad_ptr.emplace_back(&i); + } + for (auto &i : params_grad) { + params_grad_ptr.emplace_back(&i); + } + + // auto x_grad_ptr = ConstructGradTensors(x_); + // auto params_grad_ptr = ConstructGradTensors(params_); + + PADDLE_ENFORCE_EQ( + grads[0].size(), fwd_out_names_.size(), + paddle::platform::errors::InvalidArgument( + "The grads[0].size() and fwd_out_names_.size() should be equal.")); + for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + const_cast(grads[0][i]) + .set_name(fwd_out_names_[i] + "@GRAD"); + } + + RunProgramGradAPI(x_, params_, grads[0], step_scope_, attrs_, x_grad_ptr, + params_grad_ptr); + VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; + return {x_grad, params_grad}; + // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; + } + + // SetAttrMap + void SetAttrMap(const paddle::framework::AttributeMap &attrs) { + attrs_ = attrs; + } + + void SetFwdX(const std::vector &tensors) { + x_ = tensors; + } + + void SetFwdParams(const std::vector &tensors) { + params_ = tensors; + } + + void SetStepScope(const std::vector &scopes) { + step_scope_ = scopes; + } + + void SetFwdOutNames(std::vector out_names) { + fwd_out_names_ = out_names; + } + + protected: + void ConstructGradTensors( + const std::vector &fwd_tensors, + std::vector *grad_tensors) { + // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, + // such as: name, tensor type(DenseTensor or SelectedRows). 
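+    // Illustrative note (hypothetical tensor name): a forward tensor named
+    // "linear_0.tmp_1" yields a grad tensor that holds the same impl and is
+    // renamed "linear_0.tmp_1@GRAD"; RunProgramGradAPI later looks up exactly
+    // that name in the internal scope when sharing the gradients back out.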
+ VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + grad_tensors->emplace_back(fwd_t.impl()); + auto &grad_t = grad_tensors->back(); + grad_t.set_name(fwd_t.name() + "@GRAD"); + } + } + + void ConstructGradTensors( + const std::vector &fwd_tensors) { + VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + auto grad_tensor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad(); + grad_tensor.set_name(fwd_t.name() + "@GRAD"); + } + } + + private: + // TensorWrappers + std::vector x_; + std::vector params_; + std::vector step_scope_; + + std::vector fwd_out_names_; + + // Attribute Map + paddle::framework::AttributeMap attrs_; +}; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e486799495c7ab..aa92a3b2226c1f 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -443,7 +443,7 @@ cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framewo #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) -set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator phi_custom_kernel) +set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 66dfb81755f1c9..948eaab40b4f64 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + fix_op_run_order_pass fuse_gemm_epilogue_pass) if (WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index c99200ec98aa8f..fdf74d2f769fcd 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -175,6 +176,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif + +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) + AppendPassWithCheck(strategy_.fuse_gemm_epilogue_, + "fuse_gemm_epilogue_pass"); +#endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); // for single card training, fuse_all_reduce_ops is unnecessary.
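For reference, a minimal sketch of how the new fuse_gemm_epilogue_ switch is meant to be flipped on the C++ side (illustrative only; it assumes linking against the Paddle framework libraries, and the ParallelExecutor plumbing that actually consumes the strategy is omitted):

#include "paddle/fluid/framework/details/build_strategy.h"

int main() {
  paddle::framework::details::BuildStrategy build_strategy;
  // Opt in to the cuBLASLt GEMM+epilogue fusion; the flag defaults to false
  // and the pass is only appended when Paddle is built with CUDA >= 11.6.
  build_strategy.fuse_gemm_epilogue_ = true;
  return 0;
}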
@@ -507,3 +513,6 @@ USE_PASS(mkldnn_placement_pass); !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) +USE_PASS(fuse_gemm_epilogue_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 70a083dd70bc3b..5eb584aaefa981 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -1,4 +1,5 @@ // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -124,6 +125,8 @@ struct BuildStrategy { paddle::optional fuse_broadcast_ops_{paddle::none}; // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; + // Fuse GEMM+Epilogue via cublasLt epilogue. + bool fuse_gemm_epilogue_{false}; // mkldnn_enabled_op_types specify the operator type list to // use MKLDNN acceleration. It is null in default, means diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 7232a707916dd5..91ef59575c3aa2 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -232,16 +232,8 @@ class CompatMetaTensor : public phi::MetaTensor { } } - void share_meta(const MetaTensor& meta_tensor) override { + void share_dims(const MetaTensor& meta_tensor) override { set_dims(meta_tensor.dims()); - set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor - share_lod(meta_tensor); - - // special case 2: share height and rows of SelectedRows in runtime if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); if (var->IsType()) { @@ -254,6 +246,16 @@ class CompatMetaTensor : public phi::MetaTensor { } } + void share_meta(const MetaTensor& meta_tensor) override { + set_dtype(meta_tensor.dtype()); + // VarDesc doesn't contains layout, so we cannot share layout + // set_layout(meta_tensor.layout()); + + // special case 1: share lod of LoDTensor + share_lod(meta_tensor); + share_dims(meta_tensor); + } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 64c8371d583ffe..b692b6ffab0801 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -29,7 +29,7 @@ namespace framework { phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, const std::string& op_type); -#define DELCARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ +#define DECLARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ struct functor_name : public paddle::framework::InferShapeBase { \ void operator()( \ paddle::framework::InferShapeContext* ctx) const override { \ diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 53dcc19fcbae88..2eeefb19a1aa8c 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -110,9 +110,9 @@ void InferShapeUtilsTestKernel( } // namespace framework } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, 
+DECLARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, InferShapeUtilsTestInferShapeFunctor, - PT_INFER_META(paddle::framework::TestInferMeta)); + PD_INFER_META(paddle::framework::TestInferMeta)); REGISTER_OPERATOR(infer_shape_utils_test, paddle::framework::InferShapeUtilsTestOp, paddle::framework::InferShapeUtilsTestOpMaker, diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 3f2da8cbeb3adb..623c8a048c2417 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -158,6 +158,7 @@ endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc new file mode 100644 index 00000000000000..f48224cbdc24fe --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -0,0 +1,471 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
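+// High-level sketch of the rewrite this pass performs (tensor names are
+// illustrative only):
+//
+//   before:  tmp     = matmul_v2(X, W)
+//            out     = elementwise_add(tmp, Bias)
+//            act_out = relu(out)            // or gelu, or no activation
+//
+//   after:   act_out = fused_gemm_epilogue(X, W, Bias, activation="relu")
+//
+// When training, the fused op additionally emits a ReserveSpace output so the
+// matching backward pattern can be rewritten into fused_gemm_epilogue_grad.
+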
+ +#include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h" +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const { + EpiloguePassActivationCache cache; + + graph = FuseLinearActFwd(graph, {"relu", "gelu"}, false, false, &cache); + graph = FuseLinearActFwd(graph, {"relu"}, true, true, &cache); + graph = FuseLinearActFwd(graph, {"gelu"}, true, false, &cache); + graph = FuseLinearFwd(graph, false); + graph = FuseLinearFwd(graph, true); + graph = FuseLinearActBwd(graph, {"relu_grad"}, true, &cache); + graph = FuseLinearActBwd(graph, {"gelu_grad"}, false, &cache); + graph = FuseLinearBwd(graph, false); + graph = FuseLinearBwd(graph, true); +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, + bool is_training) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, {}, is_training, false); + + int found_linear_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + std::string activation = "none"; + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {ele_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, ele_out); + + GraphSafeRemoveNodes(g, {matmul_op, matmul_out, ele_add_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name(); + found_linear_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( + ir::Graph *graph, const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, act_types, is_training, is_act_grad_x_from_act); + + int found_linear_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_op, act, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + auto activation = act_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {act_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, act_out); + + // Only need to check weight.shape[1] for auxiliary pointer + // and mark it the act op is fused for backward epilogue fusion. + // That because cuBlasLt epilogue's restriction. + if (is_training) { + int divisor_of_n = activation == "relu" ? 128 : 8; + if (matmul_w_shape[1] % divisor_of_n) return; + + VarDesc reserve_space(patterns::PDNodeName(scope_name, "ReserveSpace")); + auto *reserve_space_node = g->CreateVarNode(&reserve_space); + + cache->InsertFusedActivation( + GetReserveSpaceCacheKey(act_out->Var()->Name(), g->GetBlockId()), + reserve_space_node); + + gemm_epilogue_node->Op()->SetOutput("ReserveSpace", + {reserve_space_node->Name()}); + + if (!is_act_grad_x_from_act) { + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, linear_act_pattern); + act_grad_op->Op()->RenameInput(ele_out->Name(), + reserve_space_node->Name()); + IR_NODE_LINK_TO(reserve_space_node, act_grad_op); + } + IR_NODE_LINK_TO(gemm_epilogue_node, reserve_space_node); + } + + GraphSafeRemoveNodes(g, + {matmul_op, matmul_out, ele_add_op, ele_out, act_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name() << " -> " << act_op->Name() << " -> " + << act_out->Name(); + found_linear_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, + bool without_x_gradient) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, {}, without_x_gradient, false); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + 
ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + + Node *matmul_grad_dx = nullptr; + if (!without_x_gradient) { + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx_ptr, matmul_grad_dx, + ele_add_matmul_act_pattern); + matmul_grad_dx = matmul_grad_dx_ptr; + } + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + std::string activation_grad = "none"; + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + if (matmul_grad_dx) { + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", + {matmul_grad_dx->Name()}); + } + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + if (matmul_grad_dx) { + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dx); + } + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op}); + + std::string matmul_grad_dx_name = + matmul_grad_dx != nullptr ? 
matmul_grad_dx->Name() : " "; + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_w->Name() << " and " << matmul_grad_dx_name; + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, act_grad_types, false, + is_act_grad_x_from_act); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx, matmul_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_dx, act_grad_dx, + ele_add_matmul_act_pattern); + + auto key = + GetReserveSpaceCacheKey(matmul_grad_x->Var()->Name(), g->GetBlockId()); + if (!cache->HasFusedActivation(key)) { + return; + } + auto *reserve_space_node = cache->GetFusedActivationSpace(key); + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + auto activation_grad = act_grad_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("ReserveSpace", + {reserve_space_node->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", {act_grad_dx->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, act_grad_dx); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + IR_NODE_LINK_TO(reserve_space_node, gemm_epilogue_grad_node); + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op, + matmul_grad_dx, act_grad_op}); + + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_dx->Name() << " and " << matmul_grad_w->Name() + << "\n\t " << matmul_grad_dx->Name() << " -> " + << act_grad_op->Name() << " -> " << act_grad_dx->Name(); + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +bool FuseGemmEpiloguePass::IsGemmFromLinear_( + const std::vector &x_shape, const std::vector &w_shape, + OpDesc *matmul_v2_op) const { + if (w_shape.size() != 2 || x_shape.size() < 2) return false; + for (auto attr_name : + {"fused_reshape_Out", "fused_reshape_X", "fused_reshape_Y", + "fused_transpose_Out", "fused_transpose_X", "fused_transpose_Y"}) { + if (matmul_v2_op->HasAttr(attr_name)) { + std::vector tmp_vec = + BOOST_GET_CONST(std::vector, matmul_v2_op->GetAttr(attr_name)); + if (tmp_vec.size() > 0) return false; + } + } + if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) || + BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y"))) + return false; + + return true; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_gemm_epilogue_pass, + paddle::framework::ir::FuseGemmEpiloguePass); diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h new file mode 100644 index 00000000000000..575ffee73d60e9 --- /dev/null +++ 
b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the ElewiseAdd and activation + */ +class Graph; +class Node; + +class EpiloguePassActivationCache { + public: + EpiloguePassActivationCache() {} + + EpiloguePassActivationCache(const EpiloguePassActivationCache &) = delete; + void operator=(const EpiloguePassActivationCache &) = delete; + + bool HasFusedActivation(const std::string &key) const { + return fused_activation_space_map_.count(key); + } + + ir::Node *GetFusedActivationSpace(const std::string &key) { + if (HasFusedActivation(key)) { + return fused_activation_space_map_.find(key)->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "The key (%d) of EpiloguePassActivationCache does not exist.", key)); + } + + void InsertFusedActivation(const std::string &key, ir::Node *const value) { + if (!HasFusedActivation(key)) { + mtx.lock(); + fused_activation_space_map_.insert({key, value}); + mtx.unlock(); + } else { + PADDLE_THROW(platform::errors::AlreadyExists( + "The key (%d) of EpiloguePassActivationCache already exist.", key)); + } + } + + private: + std::unordered_map fused_activation_space_map_; + std::mutex mtx; +}; + +class FuseGemmEpiloguePass : public FusePassBase { + public: + virtual ~FuseGemmEpiloguePass() {} + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + ir::Graph *FuseLinearFwd(ir::Graph *graph, bool is_training) const; + ir::Graph *FuseLinearActFwd(ir::Graph *graph, + const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const; + ir::Graph *FuseLinearBwd(ir::Graph *graph, bool without_x_gradient) const; + ir::Graph *FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const; + + private: + bool IsGemmFromLinear_(const std::vector &x_shape, + const std::vector &w_shape, + OpDesc *matmul_v2_op) const; + const std::string GetReserveSpaceCacheKey(const std::string var_name, + int block_id) const { + return std::to_string(block_id) + var_name; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 72f0f790043dbc..18068e22b7f3c3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1491,31 +1491,6 @@ PDNode 
*patterns::BatchNormAddActGrad::operator()( return bn_grad; } -PDNode *patterns::ElewiseAddAct::operator()( - paddle::framework::ir::PDNode *ele_x_var, - std::unordered_set act_types) { - auto *ele_y_var = pattern->NewNode(ele_y_repr()) - ->assert_is_op_input("elementwise_add", "Y"); - - auto *ele_add = - pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); - - auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) - ->assert_is_op_output("elementwise_add", "Out"); - - ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); - - auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); - - auto *act_out_var = - pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); - - ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); - act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); - - return act_out_var; -} - PDNode *patterns::ElewiseAddActInplaceGrad::operator()( paddle::framework::ir::PDNode *d_act_out_var, std::unordered_set act_types) { @@ -1556,6 +1531,159 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ElewiseAddAct::operator()( + paddle::framework::ir::PDNode *ele_x_var, + std::unordered_set act_types) { + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::LinearAct::operator()( + paddle::framework::ir::PDNode *linear_x_var, + const std::unordered_set &act_types, bool with_grad_link, + bool is_act_grad_x_from_act) { + auto *matmul_w_var = + pattern->NewNode(matmul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + + auto *matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul_v2"); + + auto *matmul_out_var = pattern->NewNode(matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add", "X"); + + auto *ele_bias_var = pattern->NewNode(ele_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + matmul->LinksFrom({linear_x_var, matmul_w_var}).LinksTo({matmul_out_var}); + ele_add->LinksFrom({matmul_out_var, ele_bias_var}).LinksTo({ele_out_var}); + + if (with_grad_link) { + matmul_out_var->assert_is_op_input("elementwise_add_grad", "X"); + auto *elementwise_add_grad_op = pattern->NewNode("elementwise_add_grad") + ->assert_is_op("elementwise_add_grad"); + elementwise_add_grad_op->LinksFrom({matmul_out_var}); + } + + if (act_types.size() > 0) { + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + auto *act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_ops_output(act_types, "Out"); + + 
act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + if (with_grad_link && !is_act_grad_x_from_act) { + std::unordered_set act_grad_types; + for (const auto &act : act_types) { + std::string act_grad(act); + act_grad.append("_grad"); + act_grad_types.insert(act_grad); + } + + ele_out_var->assert_is_ops_input(act_grad_types, "X"); + auto *act_grad_op = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + act_grad_op->LinksFrom({ele_out_var}); + } + + return act_out_var; + } + + return ele_out_var; +} + +PDNode *patterns::ElewiseAddMatmulAct::operator()( + paddle::framework::ir::PDNode *dout_var, + const std::unordered_set &act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act) { + auto *ele_grad_bias_var = + pattern->NewNode(ele_grad_bias_repr()) + ->assert_is_op_input("elementwise_add_grad", "Y"); + auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + auto *ele_grad_dx_var = + pattern->NewNode(ele_grad_dx_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + auto *ele_grad_dbias_var = + pattern->NewNode(ele_grad_dbias_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("Y")); + ele_add_grad->LinksFrom({dout_var, ele_grad_bias_var}) + .LinksTo({ele_grad_dx_var, ele_grad_dbias_var}); + + ele_grad_dx_var->AsIntermediate()->assert_is_op_input("matmul_v2_grad", + GradVarName("Out")); + + auto *matmul_grad_x_var = pattern->NewNode(matmul_grad_x_repr()) + ->assert_is_op_input("matmul_v2_grad", "X"); + auto *matmul_grad_w_var = pattern->NewNode(matmul_grad_w_repr()) + ->assert_is_op_input("matmul_v2_grad", "Y"); + auto *matmul_grad = + pattern->NewNode(matmul_grad_repr())->assert_is_op("matmul_v2_grad"); + auto *matmul_grad_dx_var = + pattern->NewNode(matmul_grad_dx_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("X")); + auto *matmul_grad_dw_var = + pattern->NewNode(matmul_grad_dw_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("Y")); + matmul_grad->LinksFrom( + {ele_grad_dx_var, matmul_grad_x_var, matmul_grad_w_var}); + if (without_x_gradient) { + matmul_grad->LinksTo({matmul_grad_dw_var}); + } else { + matmul_grad->LinksTo({matmul_grad_dx_var, matmul_grad_dw_var}); + } + + if (!without_x_gradient && act_grad_types.size() > 0) { + matmul_grad_dx_var->AsIntermediate()->assert_is_ops_input( + act_grad_types, GradVarName("Out")); + + auto *act_grad = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + auto *act_grad_dx_var = + pattern->NewNode(act_grad_dx_repr()) + ->assert_is_ops_output(act_grad_types, GradVarName("X")); + + auto *act_grad_x_var = matmul_grad_x_var; + if (!is_act_grad_x_from_act) { + auto *ele_out_var = pattern->NewNode(ele_out_repr()) + ->assert_is_ops_input(act_grad_types, "X"); + act_grad_x_var = ele_out_var; + } + + act_grad->LinksFrom({matmul_grad_dx_var, act_grad_x_var}) + .LinksTo({act_grad_dx_var}); + return act_grad; + } + + return matmul_grad; +} + // conv_type: conv2d, conv3d, conv2d_transpose PDNode *patterns::ConvBias::operator()( paddle::framework::ir::PDNode *conv_input, std::string conv_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 654b47fc97963e..062d2f9dedce65 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -885,6 +885,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(ele_y); }; +// The following patterns 
are used to fuse linear and act (ReLu or GeLU) +// formula: act(F.linear(x)) +// op: matmul_v2 + elementwise_add + act +// named nodes: matmul, elementwise_add, act +// matmul_w, matmul_out +// ele_bias, elewise_add_out, act_out +struct LinearAct : public PatternBase { + LinearAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "linear_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_types, + bool with_grad_link, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(matmul); + PATTERN_DECL_NODE(ele_add); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(matmul_w); + PATTERN_DECL_NODE(matmul_out); + PATTERN_DECL_NODE(elewise_add_out); + PATTERN_DECL_NODE(ele_bias); + PATTERN_DECL_NODE(act_out); +}; + +// The following patterns are used to fuse linear_grad and act_grad (ReLu or +// GeLU) +// formula: the backward of F.linear( act(x) ) +// op: elementwise_add_grad + matmul_v2_grad + act_grad +// named nodes: ele_add_grad, matmul_grad, act_grad +// ele_grad_bias, ele_grad_dx, ele_grad_dbias +// matmul_grad_x, matmul_grad_dx, matmul_grad_dx +// matmul_grad_dw, act_grad_dx +struct ElewiseAddMatmulAct : public PatternBase { + ElewiseAddMatmulAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewiseadd_matmul_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(ele_add_grad); + PATTERN_DECL_NODE(matmul_grad); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(ele_out); + PATTERN_DECL_NODE(ele_grad_bias); + PATTERN_DECL_NODE(ele_grad_dx); + PATTERN_DECL_NODE(ele_grad_dbias); + PATTERN_DECL_NODE(matmul_grad_x); + PATTERN_DECL_NODE(matmul_grad_w); + PATTERN_DECL_NODE(matmul_grad_dx); + PATTERN_DECL_NODE(matmul_grad_dw); + PATTERN_DECL_NODE(act_grad_dx); +}; + // Conv with Elementwise_add as bias // op: conv + elementwise_add // named nodes: diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 0a95444f852dd0..d578ada0db00fe 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -15,8 +15,9 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include -#include #include + +#include #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -27,7 +28,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 2c3359ffa8e46f..a69cc0d6b866d0 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -46,7 +46,7 @@ USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); -USE_OP(elementwise_mul_grad); +USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); USE_OP(tanh_grad); USE_OP(sum); @@ -54,7 +54,7 @@ USE_OP(slice_grad); USE_OP(lookup_table_grad); USE_OP(sqrt); 
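For orientation, the LinearAct pattern declared a few hunks above matches the matmul_v2 + elementwise_add + act subgraph that paddle.nn.Linear followed by ReLU/GELU produces, and the pass rewrites that subgraph into a single fused_gemm_epilogue op. Below is a small plain-C++ reference for what the matched subgraph computes, under the pattern's own constraints (W is 2-D, trans_x/trans_y are false); LinearRelu is an illustrative helper written for this note, not part of the patch or of the fused kernel's actual implementation.

#include <cstddef>
#include <vector>

// Reference semantics of the subgraph the LinearAct pattern matches:
// Out = act(X * W + bias), with X of shape [m, k], W [k, n], bias [n].
// Plain CPU loops for illustration only; the pass replaces this subgraph
// with one fused_gemm_epilogue op instead of computing it this way.
std::vector<float> LinearRelu(const std::vector<float>& x,
                              const std::vector<float>& w,
                              const std::vector<float>& bias,
                              std::size_t m, std::size_t k, std::size_t n) {
  std::vector<float> out(m * n, 0.f);
  for (std::size_t i = 0; i < m; ++i) {
    for (std::size_t j = 0; j < n; ++j) {
      float acc = bias[j];                    // elementwise_add with Y = bias
      for (std::size_t p = 0; p < k; ++p) {
        acc += x[i * k + p] * w[p * n + j];   // matmul_v2, trans_x/trans_y false
      }
      out[i * n + j] = acc > 0.f ? acc : 0.f; // act = relu
    }
  }
  return out;
}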
USE_OP(elementwise_max); -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); USE_OP(sgd); USE_OP(squared_l2_norm); USE_OP(memcpy_h2d); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index c45bf32d8b710c..eb40a49b4066a7 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -286,8 +286,8 @@ struct OpKernelRegistrarFunctorEx, \ paddle::framework::EmptyGradOpMaker) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index eff6d9a9102d2b..f8e30c1ee294ec 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -539,6 +539,20 @@ bool ExecutionContext::HasInput(const std::string& name) const { return var != nullptr; } +bool ExecutionContext::HasInputs(const std::string& name) const { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (const auto* input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; +} + bool ExecutionContext::HasOutput(const std::string& name) const { auto* var = OutputVar(name); return var != nullptr; @@ -2189,6 +2203,51 @@ void OperatorWithKernel::BuildPhiKernelContext( std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = Attrs().at(attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = Attrs().at(attr_names[i]); @@ -2212,7 +2271,11 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of 
Phi_Kernel args. const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e33d4feb82a9e7..1a1171f1dba4d7 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -295,6 +295,8 @@ class ExecutionContext { virtual bool HasInput(const std::string& name) const; + virtual bool HasInputs(const std::string& name) const; + virtual bool HasOutput(const std::string& name) const; virtual size_t InputSize(const std::string& name) const { @@ -449,7 +451,7 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { : ctx_(ctx) {} bool HasInput(const std::string& name) const override { - return ctx_.HasInput(name); + return ctx_.HasInputs(name); } bool HasOutput(const std::string& name) const override { diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bf9d1baaf394f0..47dffd47b7cbbf 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(relu_grad); +USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index e8badab27b9b97..cdccc4c5546900 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) { USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index e1ce705533ab4b..3d8a5ab21f00fc 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -33,6 +33,7 @@ if(NOT WIN32) endif() if(WITH_CNCL) cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) + cc_library(reducer SRCS reducer.cc DEPS layer) endif() if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) @@ -41,7 +42,7 @@ if(NOT WIN32) endif(NOT WIN32) if(WITH_GLOO) cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits) - if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) )) + if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) )) cc_library(reducer SRCS reducer.cc DEPS layer) endif() endif() diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index fe5ac73b004691..fbc47f81fd3316 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -133,6 +133,11 @@ class DygraphExecutionContext : public framework::ExecutionContext { return (it != var_map_in_.end() && it->second.size() > 0); } + bool HasInputs(const std::string& name) const override { + auto it = var_map_in_.find(name); + return (it != var_map_in_.end() && 
it->second.size() > 0); + } + bool HasOutput(const std::string& name) const override { auto it = var_map_out_.find(name); return (it != var_map_out_.end() && it->second.size() > 0); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 2317bfdd7c0d5e..bae49fb381a475 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -247,6 +247,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_XPU_KP + expected_kernel_key.place_ = platform::XPUPlace(); bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 30dbe07d7afca6..d7c0c8cc547e6b 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -332,6 +332,7 @@ void BuildDygraphPhiKernelContext( } for (size_t i = 0; i < attr_names.size(); ++i) { + VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -409,6 +410,60 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -432,7 +487,11 @@ void BuildDygraphPhiKernelContext( } else if 
(attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 3a6365b2af21ae..fec9afbf3b403c 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -31,7 +31,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #ifdef PADDLE_WITH_XPU_BKCL // TODO(liuyuhui) support xpu about div nranks in the future #endif + } else if (platform::is_mlu_place(tensor->place())) { + // TODO(zhangna) + VLOG(4) << "divnrank for mlu not support yet"; } } @@ -222,6 +225,56 @@ void SplitTensorsWithType( } #endif +#ifdef PADDLE_WITH_CNCL +// context is used to select the stream for concat +template <> +void ConcatTensorsWithType( + const platform::MLUDeviceContext &context, + const std::vector &dense_tensors_, + framework::Variable *p_dense_contents, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case framework::proto::VarType::FP32: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} + +// context is used to select the stream for split +template <> +void SplitTensorsWithType( + const platform::MLUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case framework::proto::VarType::FP32: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} +#endif + void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { @@ -253,6 +306,16 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't concat npu grads since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + ConcatTensorsWithType( + static_cast(context), + dense_tensors_, &dense_contents_, dtype_); +#else + 
PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat mlu grads since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { ConcatTensorsWithType( @@ -295,6 +358,16 @@ void Group::SplitTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split npu grad since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + SplitTensorsWithType( + static_cast(context), + &dense_contents_, &dense_tensors_, dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split mlu grad since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { SplitTensorsWithType( @@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui) support XPU set constant VLOG(3) << "XPU doesn't support set_constant"; } +#elif defined(PADDLE_WITH_CNCL) + if (platform::is_mlu_place(group_tensor.place())) { + // TODO(liuyuhui) support MLU set constant + VLOG(3) << "MLU doesn't support set_constant"; + } #else auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); if (HasGrad(var_index)) { @@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) { cv_.notify_all(); } }); -#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) +#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_CNCL) FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Not compiled with BKCL or NCCL or GLOO.")); + "Not compiled with BKCL or NCCL or CNCL or GLOO.")); #endif } } diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index cca773b840c279..9fac4b41cbde01 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -45,7 +45,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index e4f1cfdb3baeed..09de0106ed6190 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op) -if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL) +if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL) cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) endif() diff --git a/paddle/fluid/imperative/tests/test_group.cc 
b/paddle/fluid/imperative/tests/test_group.cc index 6c304278d21fde..5e674af1a08a87 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -72,8 +72,10 @@ void GroupConcatSplit(Place place, size_t size) { value.push_back(static_cast(1.0 * j)); } - if (std::is_same::value) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (std::is_same::value || + std::is_same::value) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_CNCL) paddle::memory::Copy(place, data, cpu_place, value.data(), sizeof(T) * value.size(), 0); #endif @@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) { } #endif +#if defined(PADDLE_WITH_CNCL) +TEST(TestGroup, TestMLUConcatSplit) { + platform::MLUPlace mlu_place(0); + platform::CPUPlace cpu_place; + + int size = 3; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); + + size = 15; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); +} +#endif } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index f5ca13cb99ad3d..17cbe06748234a 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -226,7 +226,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { } // namespace paddle USE_OP_ITSELF(split); -USE_OP(relu); +USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN USE_OP_DEVICE_KERNEL(relu, MKLDNN); #endif diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 4336a5c77c178f..01c9d2847e0c85 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -18,12 +18,14 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/common/place.h" DECLARE_bool(use_mkldnn); DECLARE_string(tracer_mkldnn_ops_on); @@ -382,5 +384,36 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, return false; } +phi::KernelSignature Tracer::GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); + framework::RuntimeContext ctx({}, {}); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(phi::CPUPlace()); + const auto& op_info = op->Info(); + auto* attr_checker = op_info.Checker(); + if (attr_checker) { + attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); + } + static paddle::framework::AttributeMap empty_attrs_map = {}; + const paddle::framework::AttributeMap& default_attrs = + attr_checker == nullptr ? 
empty_attrs_map + : attr_checker->GetDefaultAttrMap(); + auto dygraph_exe_ctx = + imperative::DygraphExecutionContext( + *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, + default_attrs); + auto* opbase_with_kernel = + dynamic_cast(op.get()); + PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr, + platform::errors::InvalidArgument( + "This op type:`%s` is not a OperatorWithKernel, only " + "OperatorWithKernel can get KernelSignature", + type)); + return phi::KernelSignature( + std::move(opbase_with_kernel->GetExpectedPhiKernelArgs(dygraph_exe_ctx))); +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 73ecbbe6143ca8..fd13fce6a6e17a 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -28,6 +28,7 @@ #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/compat/arg_map_context.h" namespace paddle { namespace imperative { @@ -154,6 +155,10 @@ class Tracer { } } + phi::KernelSignature GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 6eeb5d64253597..1f83e606c3fded 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) set(paddle_inference_api_deps lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator phi_custom_kernel) + analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator) if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 8c61200f7f57cd..b69292827aa136 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index f2dc5ba1c7c2c8..7f7313fbcb5969 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace inference } // namespace paddle -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(sigmoid); USE_OP(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 474fd92071fb07..cf377396087637 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) { } // namespace inference } // namespace 
paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index 57177cfa8b421e..336005d883b0f5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -16,7 +16,6 @@ #include #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 91a0352e1915e9..e77be832c0cc89 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project -sequence_pooling segment_pooling executor device_memory_aligment generator) +sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index c28026a4bd43aa..e1460629fb18a4 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -141,8 +141,8 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 0ac29e6d3ada73..b4a97e24cf2923 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -132,7 +132,9 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -146,7 +148,9 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { : CudnnActivationGradFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -172,7 +178,9 @@ struct 
CudnnTanhGradFunctor : public CudnnActivationGradFunctor { explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -197,7 +205,8 @@ class CudnnActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut, + "Forward deps must be Out."); const framework::Tensor *X, *Out, *dOut; X = Out = dOut = nullptr; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 73d65b7c6e7e0a..66f1bcc8b68692 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -34,7 +34,8 @@ using paddle::framework::Tensor; template static constexpr bool CanInplaceAct() { - return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; + return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut || + GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps; } #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ @@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DOut")) { ctx->ShareDim("Out", "DOut"); ctx->ShareLoD("Out", "DOut"); @@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("X", "DDOut"); ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); @@ -987,7 +992,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("D_DOut")) { ctx->ShareDim("Out", "D_DOut"); ctx->ShareLoD("Out", "D_DOut"); @@ -1464,6 
+1471,18 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor) +REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor); +REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor); +REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor); +REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor); +REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor); +REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor); +REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); +REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); +REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); +REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); + /* ========================== sigmoid register ============================= */ // 1. Register Sigmoid Operator @@ -1584,16 +1603,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor); - -REGISTER_OP_CPU_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); /* ========================================================================== */ /* ======================== leaky relu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index ff41da86f7bb6b..4b79397b6cdf2e 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -35,16 +35,14 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { using framework::To32BitIndex; -enum ActBwdOpFwdDeps { - kNoDeps = 0x00, // Do not need any forward input/output - kDepX = 0x01, // Only need forward input X - kDepOut = 0x02, // Only need forward output Out -}; +using ActBwdOpFwdDeps = phi::funcs::ActBwdOpFwdDeps; /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. 
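The hunks above replace the old unscoped kDepX/kDepOut/kNoDeps constants with the scoped phi::funcs::ActBwdOpFwdDeps enum, so every dependency test now spells out the enum class and goes through an explicit int cast before the bitwise check. A minimal standalone sketch of that flag scheme follows; DemoReluGradFunctor and DescribeRequiredForwardTensors are illustrative names invented for this note, not Paddle code.

#include <cstdio>

// Simplified stand-in for phi::funcs::ActBwdOpFwdDeps with the same values
// as the removed fluid enum (kNoDeps = 0x00, kDepX = 0x01, kDepOut = 0x02).
enum class ActBwdOpFwdDeps : int {
  kNoDeps = 0x00,  // backward needs neither forward X nor Out
  kDepX = 0x01,    // backward needs forward input X
  kDepOut = 0x02,  // backward needs forward output Out
};

// Hypothetical grad functor: relu'(x) only needs Out, so it reports kDepOut,
// mirroring ReluGradFunctor::FwdDeps() in the diff.
struct DemoReluGradFunctor {
  static constexpr ActBwdOpFwdDeps FwdDeps() {
    return ActBwdOpFwdDeps::kDepOut;
  }
};

// Mirrors the checks in ActivationOpDoubleGrad / ExtractActivationGradTensor:
// the flag is tested bit-wise to decide which forward tensors to fetch.
template <typename GradFunctor>
void DescribeRequiredForwardTensors() {
  const int deps = static_cast<int>(GradFunctor::FwdDeps());
  if (deps & static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
    std::printf("needs forward input X\n");
  }
  if (deps & static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
    std::printf("needs forward output Out\n");
  }
  if (deps == static_cast<int>(ActBwdOpFwdDeps::kNoDeps)) {
    std::printf("needs no forward tensors\n");
  }
}

int main() {
  DescribeRequiredForwardTensors<DemoReluGradFunctor>();  // "needs forward output Out"
  return 0;
}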
@@ -89,7 +87,8 @@ inline void ExtractActivationGradTensor( auto x_grad_var = context.OutputVar(framework::GradVarName("X")); const framework::Variable* out_var = nullptr; - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { out_var = context.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( @@ -139,7 +138,7 @@ inline void ExtractActivationGradTensor( "Output(Out), variable name = %s", context.OutputName(framework::GradVarName("X")))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE_NOT_NULL(x_var, platform::errors::NotFound( "Cannot get the tensor from the " @@ -248,6 +247,24 @@ struct SigmoidFunctor : public BaseActivationFunctor { } }; +#define USE_PHI_FUNCTOR(name) \ + template \ + using name##Functor = phi::funcs::name##Functor; \ + template \ + using name##GradFunctor = phi::funcs::name##GradFunctor; + +USE_PHI_FUNCTOR(Cos) +USE_PHI_FUNCTOR(Tan) +USE_PHI_FUNCTOR(Acos) +USE_PHI_FUNCTOR(Sin) +USE_PHI_FUNCTOR(Asin) +USE_PHI_FUNCTOR(Atan) +USE_PHI_FUNCTOR(Sinh) +USE_PHI_FUNCTOR(Cosh) +USE_PHI_FUNCTOR(Asinh) +USE_PHI_FUNCTOR(Acosh) +USE_PHI_FUNCTOR(Atanh) + template struct SigmoidGradFunctor : public BaseActivationFunctor { template { dx.device(d) = dout * out * (static_cast(1) - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -293,7 +312,9 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -351,7 +372,9 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // silu(x) = x / (1 + exp(-x)) @@ -376,7 +399,7 @@ struct SiluGradFunctor : public BaseActivationFunctor { (static_cast(1) + (temp2 / temp1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // Originally: logsigmoid(x) = -log (1 + exp(-x)) @@ -414,7 +437,7 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor { dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // exp(x) = e^x @@ -434,7 +457,9 @@ struct ExpGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // expm1(x) = e^x - 1 @@ -454,38 +479,23 @@ struct Expm1GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // relu(x) = max(x, 0) -template -struct ReluCPUFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - 
out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { - return v > static_cast(0) ? v : static_cast(0); - }); - } -}; template -struct ReluCUDAFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.cwiseMax(static_cast(0)); - } -}; +using ReluCPUFunctor = phi::funcs::ReluCPUFunctor; +template +using ReluGradFunctor = phi::funcs::ReluGradFunctor; template -struct ReluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (out > static_cast(0)).template cast(); - } +using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; +template +using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -504,7 +514,9 @@ struct TanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -534,7 +546,9 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* Out @@ -589,7 +603,9 @@ struct TanhTripleGradFunctor : public BaseActivationFunctor { static_cast(2) * out * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // tanhshrink(x) = x - tanh(x) @@ -610,7 +626,7 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x.tanh() * x.tanh()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // tanhshrink(x) = x - tanh(x) @@ -646,7 +662,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 || temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 @@ -682,7 +698,7 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // sqrt(x) = x^(1/2) @@ -702,7 +718,9 @@ struct SqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0.5) * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // rsqrt(x) = x^(-1/2) @@ -722,7 +740,9 @@ struct RsqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(-0.5) * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // ceil(x) = ceiling(x) @@ -742,7 +762,9 @@ struct ZeroGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0) * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; 
} + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; + } }; // floor(x) = flooring(x) @@ -754,373 +776,6 @@ struct FloorFunctor : public BaseActivationFunctor { } }; -template -struct Sine { - HOSTDEVICE T operator()(const T& val) const { return sin(val); } -}; - -template <> -struct Sine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sin(static_cast(val))); - } -}; - -template -struct Cosine { - HOSTDEVICE T operator()(const T& val) const { return cos(val); } -}; - -template <> -struct Cosine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(cos(static_cast(val))); - } -}; - -// cosine'(x) = -sin(x) -template -struct CosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = -dout * x.unaryExpr(Sine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosine(x) = cos(x) -template -struct CosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosine()); - } -}; - -// sine'(x) = cos(x) -template -struct SinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// sine(x) = sin(x) -template -struct SinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sine()); - } -}; - -template -struct Tangent { - HOSTDEVICE T operator()(const T& val) const { return tan(val); } -}; - -template <> -struct Tangent { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(tan(static_cast(val))); - } -}; - -// Tangent'(x) = -Tangent(x) -template -struct TanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout / x.unaryExpr(Cosine()).square(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// Tangent(x) = tan(x) -template -struct TanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Tangent()); - } -}; - -template -struct Sinh { - HOSTDEVICE T operator()(const T& val) const { return sinh(val); } -}; - -template <> -struct Sinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sinhf(static_cast(val))); - } -}; - -template -struct Cosh { - HOSTDEVICE T operator()(const T& val) const { return cosh(val); } -}; - -template <> -struct Cosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(coshf(static_cast(val))); - } -}; - -// sinh(x) = sinh(x) -template -struct SinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sinh()); - } -}; - -// cosh(x) = cosh(x) -template -struct CoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosh()); - } -}; - -// sinh'(x) = cosh(x) -template -struct SinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out 
out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosh'(x) = sinh(x) -template -struct CoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Sinh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acos { - HOSTDEVICE T operator()(const T& val) const { return acos(val); } -}; - -template <> -struct Acos { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acos(static_cast(val))); - } -}; - -// Acos(x) = acos(x) -template -struct AcosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acos()); - } -}; - -// acos'(x) = -1/sqrt(1-x^2) -template -struct AcosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asin { - HOSTDEVICE T operator()(const T& val) const { return asin(val); } -}; - -template <> -struct Asin { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asin(static_cast(val))); - } -}; - -// Asin(x) = asin(x) -template -struct AsinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asin()); - } -}; - -// asin'(x) = 1/sqrt(1-x^2) -template -struct AsinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atan { - HOSTDEVICE T operator()(const T& val) const { return atan(val); } -}; - -template <> -struct Atan { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atan(static_cast(val))); - } -}; - -// Atan(x) = atan(x) -template -struct AtanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atan()); - } -}; - -// atan'(x) = 1 / (1 + x^2) -template -struct AtanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acosh { - HOSTDEVICE T operator()(const T& val) const { return acosh(val); } -}; - -template <> -struct Acosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acosh(static_cast(val))); - } -}; - -// Acosh(x) = acosh(x) -template -struct AcoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acosh()); - } -}; - -// acosh'(x) = 1/sqrt(x^2 - 1) -template -struct AcoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); - } - - 
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asinh { - HOSTDEVICE T operator()(const T& val) const { return asinh(val); } -}; - -template <> -struct Asinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asinh(static_cast(val))); - } -}; - -// Asinh(x) = asinh(x) -template -struct AsinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asinh()); - } -}; - -// asinh'(x) = 1/sqrt(x^2 + 1) -template -struct AsinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atanh { - HOSTDEVICE T operator()(const T& val) const { return atanh(val); } -}; - -template <> -struct Atanh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atanh(static_cast(val))); - } -}; - -// Atanh(x) = atanh(x) -template -struct AtanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atanh()); - } -}; - -// atanh'(x) = 1/(1 - x^2) -template -struct AtanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1147,7 +802,9 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(-1) * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // log(x) = natural logarithm of x @@ -1167,7 +824,7 @@ struct LogGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log2(x) = logarithm to the base 2 of the elements of x @@ -1188,7 +845,7 @@ struct Log2GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log10(x) = logarithm to the base 10 of the elements of x @@ -1209,7 +866,7 @@ struct Log10GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(10))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log1p(x) = natural logarithm of x+1 @@ -1229,7 +886,7 @@ struct Log1pGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / (x + static_cast(1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // square(x) = x^2 @@ -1249,7 +906,7 @@ struct SquareGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(2) * x; } - static constexpr ActBwdOpFwdDeps 
FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1285,7 +942,7 @@ struct BReluGradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // relu6(x) = min(max(0, x), 6) @@ -1319,7 +976,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // HardSwish = min(max(0, x+3), 6) * x / 6 @@ -1364,7 +1023,7 @@ struct HardSwishGradFunctor : public BaseActivationFunctor { static_cast(1) * (static_cast(1) - tmp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // For numerical stability, using the following formula instead of softplus(x) = @@ -1409,7 +1068,7 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // mish(x) = x * tanh(softplus(x)) @@ -1449,7 +1108,7 @@ struct MishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softsign(x) = x / (1 + |x|) @@ -1472,7 +1131,7 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1504,7 +1163,9 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1539,7 +1200,7 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1573,7 +1234,7 @@ struct ELUGradFunctor : public BaseActivationFunctor { .select(dout, dout * (out + static_cast(alpha))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1592,7 +1253,7 @@ struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { .select(dout, dout * static_cast(alpha) * x.exp()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1672,7 +1333,7 @@ struct CELUGradFunctor : public BaseActivationFunctor { dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 @@ -1701,7 
+1362,7 @@ struct PowGradFunctor : public BaseActivationFunctor { x.pow(static_cast(factor) - static_cast(1)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1766,7 +1427,7 @@ struct STanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * a * b * (static_cast(1) - temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1797,7 +1458,7 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x > th).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1832,7 +1493,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { static_cast(slope); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1865,7 +1528,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; /* @@ -1902,7 +1565,7 @@ inline void ExtractActivationDoubleGradTensor( "Cannot get the tensor from the Variable Output, variable name = %s", ctx.OutputName("DDX"))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( x_var, platform::errors::NotFound( @@ -1925,7 +1588,8 @@ inline void ExtractActivationDoubleGradTensor( VLOG(10) << "Inplace activation of Op: " << ctx.Type(); *X = *ddX; } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { auto out_var = ctx.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, @@ -2000,28 +1664,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * x.sign(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct ReluGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); - ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2050,7 +1693,7 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2088,7 +1731,7 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { 
.template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2127,7 +1770,7 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2156,7 +1799,9 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(0.5) / out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2185,7 +1830,9 @@ struct RsqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2214,7 +1861,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(2) * x; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need @@ -2840,7 +2487,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; } // namespace operators @@ -2849,20 +2496,9 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(cos, Cos, CosFunctor, CosGradFunctor); \ - __macro(tan, Tan, TanFunctor, TanGradFunctor); \ - __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ - __macro(sin, Sin, SinFunctor, SinGradFunctor); \ - __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ - __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor); \ - __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ - __macro(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); \ - __macro(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); \ - __macro(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index e1afb3919f813b..92a101451e211f 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -18,28 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? 
x : zero; - } -}; - -template -struct CudaReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return out > zero ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - template struct CudaLeakyReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -69,7 +47,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { return x > zero ? dout : static_cast(alpha) * dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -93,7 +71,9 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { return dout * out * (one - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -122,7 +102,7 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp * (one + x * (one - temp)))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -159,30 +139,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atan(x)); - } -}; - -template -struct CudaAtanGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout / (one + x * x); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -219,7 +176,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { return (x >= -l && x <= l) ? 
zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -262,191 +219,9 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { return static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } -}; - -template -struct CudaCosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cos(x)); - } -}; - -template -struct CudaCosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout * sin(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sin(x)); - } -}; - -template -struct CudaSinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cos(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaTanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tan(x)); - } -}; - -template -struct CudaTanGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / (cos(x) * cos(x))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asin(x)); - } -}; - -template -struct CudaAsinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAcosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acos(x)); - } -}; - -template -struct CudaAcosGradFunctor : 
public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaCoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cosh(x)); - } -}; - -template -struct CudaCoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * sinh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sinh(x)); - } -}; - -template -struct CudaSinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cosh(x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -469,86 +244,9 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { return dout * (one - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct CudaAcoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acosh(x)); - } -}; - -template -struct CudaAcoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x - one)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asinh(x)); - } -}; - -template -struct CudaAsinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout 
= static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x + one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atanh(x)); - } -}; - -template -struct CudaAtanhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / (one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -566,7 +264,9 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor { return -dout * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -587,7 +287,9 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { return dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -608,7 +310,9 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor { return dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -629,7 +333,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { return dout / x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -647,7 +351,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { return dout * two * x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -670,7 +374,9 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { return one_half * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -693,7 +399,9 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { return minus_one_half * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -717,7 +425,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { return dout / (one + x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -741,7 +449,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { return dout / (x * log_two); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -765,7 +473,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { return dout / (x * log_ten); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + 
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -804,7 +512,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { return (x > t_min_cast && x < t_max_cast) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -849,7 +557,9 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { : static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -893,7 +603,7 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { return static_cast(dout * a * b * (one - temp * temp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -939,7 +649,7 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -962,7 +672,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { return dout / (temp * temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -996,7 +706,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { return (out > zero && out < t) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1022,7 +734,7 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { return static_cast(dout * tanh(x) * tanh(x)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1056,7 +768,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { return (x > -t && x < t) ? zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1097,7 +809,9 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1141,7 +855,7 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 + temp3)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1190,7 +904,7 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1222,7 +936,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { return x > static_cast(threshold) ? 
dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1274,7 +988,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1320,7 +1034,9 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { return static_cast(dout * (out_pos + out_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1347,7 +1063,7 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { return static_cast(dout * (x_pos + x_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1429,7 +1145,7 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1477,13 +1193,14 @@ class ActivationGradCudaKernel std::vector ins = {d_out}; std::vector outs = {d_x}; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + if (static_cast(Functor::FwdDeps()) == + static_cast(ActBwdOpFwdDeps::kDepOut)) { // Only need forward output Out ins.push_back(out); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { + static_cast(ActBwdOpFwdDeps::kDepX)) { // Only need forward input X ins.push_back(x); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, @@ -1509,7 +1226,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); #define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ grad_functor) \ @@ -1531,7 +1252,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, @@ -1594,50 +1319,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== relu register ============================ */ -#ifdef PADDLE_WITH_HIP -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#else -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - 
ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#endif -/* ========================================================================== */ - /* =========================== sigmoid register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, @@ -1650,7 +1331,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidDoubleGradKernel>, ops::SigmoidDoubleGradKernel>); + ops::SigmoidGradGradFunctor>, + ops::SigmoidDoubleGradKernel>); REGISTER_OP_CUDA_KERNEL( sigmoid_triple_grad, @@ -1659,7 +1342,10 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidTripleGradKernel>, ops::SigmoidTripleGradKernel>); + ops::SigmoidTripleGradFunctor>, + ops::SigmoidTripleGradKernel< + plat::CUDADeviceContext, + ops::SigmoidTripleGradFunctor>); /* ========================================================================== */ /* =========================== tanh register ============================ */ @@ -1696,7 +1382,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SqrtDoubleGradKernel>, ops::SqrtDoubleGradKernel>); + ops::SqrtGradGradFunctor>, + ops::SqrtDoubleGradKernel>); /* ========================================================================== */ /* =========================== rsqrt register ============================= @@ -1726,6 +1414,8 @@ REGISTER_OP_CUDA_KERNEL( ops::SquareGradGradFunctor>, ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, - PT_INFER_META(phi::AddmmInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, + PD_INFER_META(phi::AddmmInferMeta)); REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker, ops::AddMMOpGradMaker, diff --git a/paddle/fluid/operators/amp/fp16_type_traits.h b/paddle/fluid/operators/amp/fp16_type_traits.h index f7aa0de97598df..56aebe90788fba 100644 --- a/paddle/fluid/operators/amp/fp16_type_traits.h +++ b/paddle/fluid/operators/amp/fp16_type_traits.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -32,6 +33,12 @@ class MPTypeTrait { using Type = float; }; +template <> +class MPTypeTrait { + public: + using Type = float; +}; + } // namespace details } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 0f5c048b6be9c7..c5e4188ca2d6f7 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -15,23 +15,19 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_max, ArgMaxInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); + REGISTER_OPERATOR( arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::framework::EmptyGradOpMaker, + ArgMaxInferShapeFunctor); + REGISTER_OP_VERSION(arg_max) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu deleted file mode 100644 index 14708c4df10f51..00000000000000 --- a/paddle/fluid/operators/arg_max_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.cu.h" - -REGISTER_OP_CUDA_KERNEL( - arg_max, paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel); diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h deleted file mode 100644 index b77031f7fb4c9d..00000000000000 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(__NVCC__) || defined(__HIPCC__) - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -namespace { // NOLINT -template -using KeyValuePair = cub::KeyValuePair; -using Tensor = framework::Tensor; - -} // end namespace - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) 
\ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) \ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -template -__global__ void ArgCUDAKernel(const int64_t height, // n * h - const int64_t width, // c - const int64_t post_size, // h - const Reducer reducer, const T init, const T* in, - IndType* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / post_size; - int w = idx % post_size; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = - reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - out[idx] = static_cast(kv_pair.key); - } - __syncthreads(); - } -} - -template -void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, - Tensor* indices, const int64_t pre, const int64_t post, - const int64_t n) { - auto cu_stream = ctx.stream(); - auto ComputeBlockSize = [](int64_t col) { - auto block_size = 8; - if (col > 512) - block_size = 1024; - else if (col > 256) - block_size = 512; - else if (col > 128) - block_size = 256; - else if (col > 64) - block_size = 128; - else if (col > 32) - block_size = 64; - else if (col > 16) - block_size = 32; - else if (col > 8) - block_size = 16; -#ifdef __HIPCC__ - block_size = std::min(block_size, 256); -#endif - return block_size; - }; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t height = pre * post; - int64_t width = n; - int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - const T* in_data = input.data(); - IndType* out_data = indices->mutable_data(ctx.GetPlace()); - - if (typeid(Reducer) == typeid(cub::ArgMax)) { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::lowest(), - in_data, out_data)); - } - } else { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::max(), - in_data, out_data)); - } - } -} - -template -struct VisitDataCudaArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - const bool& flatten = ctx.Attr("flatten"); - - framework::DDim input_dims; - if (flatten) { - input_dims = phi::make_ddim({input->numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - input_dims = input->dims(); - if (axis < 0) axis += input->dims().size(); - } - - int64_t numel = input->numel(); - int64_t groups = numel / input_dims[axis]; - int64_t pre = 1; - int64_t post = 1; - int64_t n = input_dims[axis]; - - for (int i = 0; i < axis; i++) { - pre *= input_dims[i]; - } - - for (int i = axis + 1; i < input_dims.size(); i++) { - post *= input_dims[i]; - } - - const auto& dev_ctx = ctx.cuda_device_context(); - ComputeFullArg(dev_ctx, *input, output, pre, post, n); - } -}; -template -class ArgMinMaxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataCudaArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataCudaArgMinMaxFunctor(ctx)); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index d3ce61d183a3d3..585341beea12c1 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -27,193 +27,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum ArgMinMaxType { kArgMin, kArgMax }; - -template -struct ArgMinMaxFunctor {}; - -#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ - template \ - struct ArgMinMaxFunctor { \ - void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor* out, framework::DDim x_dims, \ - int64_t axis, bool keepdims) { \ - auto in_eigen = framework::EigenTensor::From(in, x_dims); \ - if (keepdims) { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } else { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } \ - } \ - } - -DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); -DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); - -template -struct VisitDataArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto keepdims = ctx.Attr("keepdims"); - const bool& flatten = ctx.Attr("flatten"); - // paddle do not have the scalar tensor, just return the shape [1] tensor - if (flatten) keepdims = true; - - // if flatten, will construct the new dims for the cacluate - framework::DDim x_dims; - if (flatten) { - x_dims = phi::make_ddim({x.numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - x_dims = x.dims(); - if (axis < 0) axis += x_dims.size(); - } - auto& dev_ctx = ctx.template device_context(); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMinMaxFunctor \ - functor##rank; \ - functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims) - - switch (x_dims.size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - PADDLE_ENFORCE_LE( - x_dims.size(), 6, - platform::errors::InvalidArgument( - "%s operator doesn't supports tensors whose ranks are greater " - "than 6.", - (EnumArgMinMaxValue == kArgMin ? 
"argmin" : "argmax"))); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - } -}; - -template -class ArgMinMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgMinMaxFunctor(ctx)); - } -}; - -template -using ArgMinKernel = ArgMinMaxKernel; - -template -using ArgMaxKernel = ArgMinMaxKernel; - class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "arg_min_max"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "arg_min_max"); - const auto& x_dims = ctx->GetInputDim("X"); - int64_t axis = ctx->Attrs().Get("axis"); - bool keepdims = ctx->Attrs().Get("keepdims"); - const bool& flatten = ctx->Attrs().Get("flatten"); - - PADDLE_ENFORCE_GE(axis, -x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, -x_dims.size())); - PADDLE_ENFORCE_LT( - axis, x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis, - x_dims.size())); - - const int& dtype = ctx->Attrs().Get("dtype"); - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), true, - platform::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (ctx->IsRuntime()) { - if (dtype == framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, INT_MAX, - platform::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - ctx->SetOutputDim("Out", phi::make_ddim(vec)); - } }; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 0a4ba6fb0bfdfc..fb3abd01af8c39 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,26 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_min, ArgMinInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); REGISTER_OPERATOR( arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ArgMinInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); REGISTER_OP_VERSION(arg_min) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu deleted file mode 100644 index 23170bf0087906..00000000000000 --- a/paddle/fluid/operators/arg_min_op.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.cu.h" -REGISTER_OP_CUDA_KERNEL( - arg_min, paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc index 9e525c20335d37..1a8aca777370bc 100644 --- a/paddle/fluid/operators/argsort_op.cc +++ b/paddle/fluid/operators/argsort_op.cc @@ -12,40 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/argsort_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { class ArgsortOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "argsort"); - - auto in_dims = ctx->GetInputDim("X"); - int axis = ctx->Attrs().Get("axis"); - - auto num_dims = in_dims.size(); - PADDLE_ENFORCE_GE(axis, -num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -num_dims(%d).", - axis, -num_dims)); - PADDLE_ENFORCE_LT( - axis, num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be less than num_dims(%d).", axis, num_dims)); - - ctx->ShareDim("X", "Out"); - ctx->ShareDim("X", "Indices"); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } }; class ArgsortGradOp : public framework::OperatorWithKernel { @@ -122,18 +101,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(argsort, ArgsortInferShapeFunctor, + PD_INFER_META(phi::ArgsortInferMeta)); REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, ops::ArgsortGradOpMaker, - ops::ArgsortGradOpMaker); + ops::ArgsortGradOpMaker, + ArgsortInferShapeFunctor); REGISTER_OPERATOR(argsort_grad, ops::ArgsortGradOp, ops::ArgsortGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(argsort, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel); -REGISTER_OP_CPU_KERNEL( - argsort_grad, ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu deleted file mode 100644 index 8b7a0b3eadb16b..00000000000000 --- a/paddle/fluid/operators/argsort_op.cu +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/argsort_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -#ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; -} // namespace detail -} // namespace rocprim -#else -// set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -template -static __global__ void FillIndex(T* indices, T num_rows, T num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (T j = row_id; j < num_rows; j += gridDim.x) { - for (T i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -static __global__ void FillFlattenGrad(const T* dO, const IndType* indices, - int64_t size, T* dX) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < size; i += stride) { - dX[indices[i]] = dO[i]; - } -} - -template -static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX, - IndType num_rows, IndType num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (IndType j = row_id; j < num_rows; j += gridDim.x) { - for (IndType i = col_id; i < num_cols; i += blockDim.x) { - dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i]; - } - } -} - -// Sort by flag descending, True: descending. False: Ascending. -// Default is false. -template -void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, - Tensor* output, Tensor* indices, const IndType num_rows, - const IndType num_cols, const bool descending) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - // Init a index array - FillIndex<<>>( - input_indices.data(), num_rows, num_cols); - - T* sorted_out_ptr; - IndType* sorted_indices_ptr; - - const T* inp = input->data(); - T* out = output->mutable_data(ctx.GetPlace()); - IndType* ind = indices->mutable_data(ctx.GetPlace()); - - sorted_out_ptr = out; - sorted_indices_ptr = ind; - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - gpuError_t err; - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - - PADDLE_ENFORCE_GPU_SUCCESS(err); -} - -template -void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, Tensor* dX, const IndType num_rows, - const IndType num_cols) { - auto cu_stream = ctx.stream(); - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - FillGrad<<>>( - dO->data(), indices->data(), dX->data(), num_rows, - num_cols); -} - -template -void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, int64_t size, Tensor* dX) { - auto cu_stream = ctx.stream(); - - const int64_t block_size = - std::min(size, static_cast(ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = - std::max(((max_threads - 1) / block_size + 1), static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - - FillFlattenGrad<<>>( - dO->data(), indices->data(), size, dX->data()); -} - -template -class ArgsortOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - const T* in_data = input->data(); - auto size = input->numel(); - T* out_data = output->mutable_data(ctx.GetPlace()); - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. - // Compared to the following 'Special case for full sort', ascending sort is - // 34 times faster and descending sort is 31 times faster. - if (size == in_dims[axis]) { - thrust::sequence(thrust::device, ids_data, ids_data + size); - thrust::copy(thrust::device, in_data, in_data + size, out_data); - thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data); - if (descending) { - thrust::reverse(thrust::device, out_data, out_data + size); - thrust::reverse(thrust::device, ids_data, ids_data + size); - } - return; - } - - // Special case for full sort, speedup ~190x. 
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - ArgFullSort(dev_ctx, input, output, indices, input_height, - input_width, descending); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - T* trans_inp_data = trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - T* out_data = output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - // temp indices for sorting - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); - - ArgFullSort(dev_ctx, &trans_inp, &tmp_out, &tmp_indices, - input_height, input_width, descending); - - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - return; - } - } -}; - -template -class ArgsortGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - dX->mutable_data(ctx.GetPlace()); - if (dO->numel() == 0) return; - - auto in_dims = dX->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - int64_t size = dX->numel(); - const auto& dev_ctx = ctx.cuda_device_context(); - - // Parallel acceleration when the input size is equal to the length of the - // ‘axis’ dimension. - // Compared to 'special case for full sort' below, the gradient calculation - // is 10 times faster. - if (size == in_dims[axis]) { - ArgFlattenAssign(dev_ctx, dO, indices, size, dX); - return; - } - - // Special case for full sort, speedup ~190x. 
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - ArgFullAssign(dev_ctx, dO, indices, dX, input_height, - input_width); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - ArgFullAssign(dev_ctx, &trans_dO, &trans_ind, &tmp_out, - input_height, input_width); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - return; - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - argsort, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h deleted file mode 100644 index d850e51a4bf061..00000000000000 --- a/paddle/fluid/operators/argsort_op.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -using Tensor = framework::Tensor; - -template -static void FullSort(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - bool descending) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [&](const std::pair& l, const std::pair& r) { - if (descending) - return l.first > r.first; - else - return l.first < r.first; - }); - - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + j] = col_vec[j].first; - t_indices[i * input_width + j] = col_vec[j].second; - } - } -} - -template -static void FullAssign(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, - const framework::Tensor* indices, T* t_out) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - auto e_indices = EigenVector::Flatten(*indices); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class ArgsortKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - T* out_data = output->mutable_data(ctx.GetPlace()); - - // Do full sort - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - FullSort(input_height, input_width, in_dims.size(), input, - out_data, ids_data, descending); - } else { - // If not full sort do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - - auto* t_ind = - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - - FullSort(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, descending); - - indices->mutable_data(ctx.GetPlace()); - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - } - } -}; - -template -class ArgsortGradientKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto& place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - // Do full assign - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - FullAssign(input_height, input_width, in_dims.size(), dO, - indices, dX->data()); - } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 077be715bece0b..c927eec00bc8bf 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc index 18e81936a16c63..359b00fcf87ee1 100644 --- a/paddle/fluid/operators/argsort_op_xpu.cc +++ b/paddle/fluid/operators/argsort_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 72488a932d9c33..b452dea8536dd9 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc index 71a895c244c54f..0783b30a8580db 100644 --- a/paddle/fluid/operators/atan2_op.cc +++ b/paddle/fluid/operators/atan2_op.cc @@ -105,8 +105,8 @@ class Atan2OpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, - PT_INFER_META(phi::Atan2InferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, + PD_INFER_META(phi::Atan2InferMeta)); REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker, ops::Atan2GradMaker, ops::Atan2GradMaker, diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 55bb57466c7b5e..bc9076f4d7c368 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -138,8 +138,8 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, - PT_INFER_META(phi::BCELossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, + PD_INFER_META(phi::BCELossInferMeta)); REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker, ops::BCELossGradOpMaker, diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 4774c0a1dbc3b7..9f6a78ab7a55f3 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -90,12 +90,12 @@ class BilinearTensorProductGradOpMaker namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, +DECLARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, BilinearTensorProductInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR( + PD_INFER_META(phi::BilinearTensorProductInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductGradInferMeta)); + PD_INFER_META(phi::BilinearTensorProductGradInferMeta)); REGISTER_OPERATOR( bilinear_tensor_product, ops::BilinearTensorProductOp, diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc index b37334a14bad4f..062e7d510d54c0 100644 --- a/paddle/fluid/operators/bincount_op.cc +++ b/paddle/fluid/operators/bincount_op.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/bincount_op.h" - #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -28,51 +31,6 @@ class BincountOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of BincountOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of BincountOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto minlength = ctx->Attrs().Get("minlength"); - - PADDLE_ENFORCE_GE(minlength, 0, - platform::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - - PADDLE_ENFORCE_EQ(input_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) must be 1-D tensor." - "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (ctx->HasInput("Weights")) { - auto weights_dim = ctx->GetInputDim("Weights"); - PADDLE_ENFORCE_EQ(weights_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); - - PADDLE_ENFORCE_EQ( - weights_dim[0], input_dim[0], - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." - "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, input_dim)); - } - - ctx->SetOutputDim("Out", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = @@ -105,12 +63,10 @@ class BincountOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(bincount, BincountInferShapeFunctor, + PD_INFER_META(phi::BincountInferMeta)); REGISTER_OPERATOR( bincount, ops::BincountOp, ops::BincountOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - bincount, ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel); + paddle::framework::EmptyGradOpMaker, + BincountInferShapeFunctor); diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu deleted file mode 100644 index cc576d0af92877..00000000000000 --- a/paddle/fluid/operators/bincount_op.cu +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::PADDLE_CUDA_NUM_THREADS; - -inline int GET_BLOCKS(const int N) { - return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; -} - -template -__global__ void KernelBincount(const InputT* input, const int total_elements, - const bool has_weights, const T* weights, - OutT* output) { - if (!has_weights) { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); - } - } else { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], - static_cast(weights[i])); - } - } -} - -template -void BincountCUDAInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - const int input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - auto input_x = framework::EigenVector::Flatten(*input); - - framework::Tensor input_min_t, input_max_t; - auto* input_max_data = - input_max_t.mutable_data({1}, context.GetPlace()); - auto* input_min_data = - input_min_t.mutable_data({1}, context.GetPlace()); - - auto input_max_scala = framework::EigenScalar::From(input_max_t); - auto input_min_scala = framework::EigenScalar::From(input_min_t); - - auto* place = context.template device_context().eigen_device(); - input_max_scala.device(*place) = input_x.maximum(); - input_min_scala.device(*place) = input_x.minimum(); - - Tensor input_min_cpu, input_max_cpu; - paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), - &input_max_cpu); - paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), - &input_min_cpu); - - InputT input_min = input_min_cpu.data()[0]; - - PADDLE_ENFORCE_GE( - input_min, static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = - static_cast(input_max_cpu.data()[0]) + 1L; - - output_size = std::max(output_size, static_cast(minlength)); - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - const T* weights_data = has_weights ? 
weights->data() : nullptr; - - auto stream = - context.template device_context().stream(); - - if (!has_weights) { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } - } -} - -template -class BincountCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountCUDAInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountCUDAInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bincount, ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel); diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h deleted file mode 100644 index 84256bf78e4a19..00000000000000 --- a/paddle/fluid/operators/bincount_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void BincountInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - auto input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - - PADDLE_ENFORCE_GE( - *std::min_element(input_data, input_data + input_numel), - static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = static_cast(*std::max_element( - input_data, input_data + input_numel)) + - 1L; - output_size = std::max(output_size, static_cast(minlength)); - - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - if (has_weights) { - const T* weights_data = weights->data(); - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } - - } else { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += 1L; - } - } -} - -template -class BincountKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index c3917fad555cb4..1063a8b7992153 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -167,9 +167,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, +DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, BroadcastTensorsInferShapeFunctor, - PT_INFER_META(phi::BroadcastTensorsInferMeta)); + PD_INFER_META(phi::BroadcastTensorsInferMeta)); REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, diff --git 
a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 09e915a6bafd4a..ed80ac076c0af7 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -90,8 +90,8 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, - PT_INFER_META(phi::CholeskyInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, + PD_INFER_META(phi::CholeskyInferMeta)); REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, ops::CholeskyGradOpMaker, diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index c0968581acda99..7206dd01bcaa3e 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 31b00a93f13965..0946ad8aca65e2 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 9c11704704ed42..61e5f279034779 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 5787090e6a52f2..cf4d6a28744b36 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index c79b2f92b69a1e..c4e410d04da5fb 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index d9a7a4abb08fc8..8b498787c69db0 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index b8abf458c1c6d3..133085ad3f3b0f 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc index bb78971734bf05..36c6f4fadd0fcc 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -27,7 +27,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 8f7b8c4a9040be..6e02d362156970 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index c40b2c3e76a02c..57e3dd53cc7748 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -25,7 +25,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 1da7798ea26965..059fafa3e7f4d4 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -205,8 +205,8 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, - PT_INFER_META(phi::ConcatInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PD_INFER_META(phi::ConcatInferMeta)); REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 95135ba3b1a3db..cbec1182f20b88 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -66,8 +66,8 @@ class ConjGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, ops::ConjGradMaker, ops::ConjGradMaker, diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cc b/paddle/fluid/operators/controlflow/compare_all_op.cc index 9f229e6f15c218..dd407f4f6f3c51 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cc +++ b/paddle/fluid/operators/controlflow/compare_all_op.cc @@ -58,8 +58,8 @@ class CompareReduceOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ - DELCARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ - PT_INFER_META(phi::CompareAllInferMeta)); \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareAllInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareReduceOp<_##op_type##Comment>, \ ::paddle::operators::CompareReduceOpProtoMaker<_##op_type##Comment>, \ diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 5d9cdc617690f0..72d81d8c3fdf28 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -96,8 +96,8 @@ class CompareOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ - DELCARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ - 
PT_INFER_META(phi::CompareInferMeta)); \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index fe00ee06603f0e..674b75625d1983 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -109,8 +109,8 @@ class CrossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, - PT_INFER_META(phi::CrossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, + PD_INFER_META(phi::CrossInferMeta)); REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker, ops::CrossGradMaker, ops::CrossGradMaker, diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 7c80917a71369e..11633fb0b87032 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,17 +24,6 @@ namespace operators { class CumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->Attrs().Get("flatten")) { - ctx->SetOutputDim("Out", - phi::make_ddim({phi::product(ctx->GetInputDim("X"))})); - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,10 +79,12 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; - +DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, + PD_INFER_META(phi::CumsumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, - ops::CumsumGradMaker); + ops::CumsumGradMaker, + CumsumInferShapeFunctor); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 1ebafa54598574..568c7982cfc7c0 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -62,7 +62,7 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) -detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) +detection_library(yolo_box_op SRCS yolo_box_op.cc) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu) 
detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 511d8e0eed1065..0d9fbf612f73c4 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,7 +9,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -240,8 +239,6 @@ REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, - ops::YoloBoxKernel); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu deleted file mode 100644 index fb5c214a59e127..00000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, - T* scores, const float conf_thresh, - const int* anchors, const int n, const int h, - const int w, const int an_num, const int class_num, - const int box_num, int input_size_h, - int input_size_w, bool clip_bbox, const float scale, - const float bias, bool iou_aware, - const float iou_aware_factor) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - T box[4]; - for (; tid < n * box_num; tid += stride) { - int grid_num = h * w; - int i = tid / box_num; - int j = (tid % box_num) / grid_num; - int k = (tid % grid_num) / w; - int l = tid % w; - - int an_stride = (5 + class_num) * grid_num; - int img_height = imgsize[2 * i]; - int img_width = imgsize[2 * i + 1]; - - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, - iou_aware); - T conf = sigmoid(input[obj_idx]); - if (iou_aware) { - int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); - T iou = sigmoid(input[iou_idx]); - conf = pow(conf, static_cast(1. 
- iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, - iou_aware); - GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, - input_size_w, box_idx, grid_num, img_height, img_width, scale, - bias); - box_idx = (i * box_num + j * grid_num + k * w + l) * 4; - CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, - 5, iou_aware); - int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; - CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, - grid_num); - } -} - -template -class YoloBoxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* img_size = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = sizeof(int) * anchors.size(); - auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); - int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - const auto cplace = platform::CPUPlace(); - memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, - dev_ctx.stream()); - - const T* input_data = input->data(); - const int* imgsize_data = img_size->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, boxes, static_cast(0)); - set_zero(dev_ctx, scores, static_cast(0)); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); - - dim3 thread_num = config.thread_per_block; -#ifdef WITH_NV_JETSON - if (config.compute_capability == 53 || config.compute_capability == 62) { - thread_num = 512; - } -#endif - - KeYoloBoxFw<<>>( - input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, - input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, - ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h deleted file mode 100644 index 2cd69c60b7c44d..00000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -HOSTDEVICE inline T sigmoid(T x) { - return 1.0 / (1.0 + std::exp(-x)); -} - -template -HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, - int j, int an_idx, int grid_size_h, - int grid_size_w, int input_size_h, - int input_size_w, int index, int stride, - int img_height, int img_width, float scale, - float bias) { - box[0] = (i + sigmoid(x[index]) * scale + bias) * img_width / grid_size_w; - box[1] = (j + sigmoid(x[index + stride]) * scale + bias) * img_height / - grid_size_h; - box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size_w; - box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * - img_height / input_size_h; -} - -HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, - int an_num, int an_stride, int stride, - int entry, bool iou_aware) { - if (iou_aware) { - return (batch * an_num + an_idx) * an_stride + - (batch * an_num + an_num + entry) * stride + hw_idx; - } else { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; - } -} - -HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, - int an_stride, int stride) { - return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + - hw_idx; -} - -template -HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, - const int img_height, - const int img_width, bool clip_bbox) { - boxes[box_idx] = box[0] - box[2] / 2; - boxes[box_idx + 1] = box[1] - box[3] / 2; - boxes[box_idx + 2] = box[0] + box[2] / 2; - boxes[box_idx + 3] = box[1] + box[3] / 2; - - if (clip_bbox) { - boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); - boxes[box_idx + 1] = - boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); - boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 - ? boxes[box_idx + 2] - : static_cast(img_width - 1); - boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 - ? 
boxes[box_idx + 3] - : static_cast(img_height - 1); - } -} - -template -HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, - const int label_idx, const int score_idx, - const int class_num, const T conf, - const int stride) { - for (int i = 0; i < class_num; i++) { - scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); - } -} - -template -class YoloBoxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* imgsize = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - const int stride = h * w; - const int an_stride = (class_num + 5) * stride; - - Tensor anchors_; - auto anchors_data = - anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); - std::copy(anchors.begin(), anchors.end(), anchors_data); - - const T* input_data = input->data(); - const int* imgsize_data = imgsize->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - memset(boxes_data, 0, boxes->numel() * sizeof(T)); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - memset(scores_data, 0, scores->numel() * sizeof(T)); - - T box[4]; - for (int i = 0; i < n; i++) { - int img_height = imgsize_data[2 * i]; - int img_width = imgsize_data[2 * i + 1]; - - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 4, iou_aware); - T conf = sigmoid(input_data[obj_idx]); - if (iou_aware) { - int iou_idx = - GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); - T iou = sigmoid(input_data[iou_idx]); - conf = pow(conf, static_cast(1. 
- iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 0, iou_aware); - GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, - input_size_h, input_size_w, box_idx, stride, - img_height, img_width, scale, bias); - box_idx = (i * box_num + j * stride + k * w + l) * 4; - CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, - clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 5, iou_aware); - int score_idx = (i * box_num + j * stride + k * w + l) * class_num; - CalcLabelScore(scores_data, input_data, label_idx, score_idx, - class_num, conf, stride); - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 375ef4344f4741..f89ecd37222870 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,17 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace paddle { namespace operators { @@ -172,7 +178,7 @@ template class DeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* det = context.Input("Out"); const auto* grad = @@ -200,15 +206,18 @@ class DeterminantGradKernel : public framework::OpKernel { // checked in forward, pass } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (det(A)=0) if (!CheckMatrixInvertible(context, det)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; ddet->Resize(input->dims()); - ddet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, ddet, static_cast(0.0f)); + phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), + ddet); return; } @@ -218,35 +227,35 @@ class DeterminantGradKernel : public framework::OpKernel { // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, // -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! 
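// For reference, a standalone sketch (plain C++, not part of the operator code)
// of the identity commented above, d|A| = (dL/d|A| * |A|) * inverse(A)^T, worked
// out for a single 2x2 matrix; the values and names below are illustrative only.
#include <array>
#include <cstdio>

int main() {
  std::array<std::array<double, 2>, 2> A = {{{3.0, 1.0}, {2.0, 4.0}}};
  const double det = A[0][0] * A[1][1] - A[0][1] * A[1][0];
  // inverse(A) of a 2x2 matrix: adjugate / determinant.
  const std::array<std::array<double, 2>, 2> inv = {
      {{A[1][1] / det, -A[0][1] / det}, {-A[1][0] / det, A[0][0] / det}}};
  const double grad_det = 1.0;  // incoming gradient dL/d|A|
  // dL/dA[i][j] = grad_det * |A| * inverse(A)[j][i]  (the transpose(-2, -1) step).
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      std::printf("dL/dA[%d][%d] = %g\n", i, j, grad_det * det * inv[j][i]);
  return 0;
}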
inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " << transpose_inverse_A.dims(); // Third: dA * |A| - auto mul_dA_detA = helper.Mul(*grad, *det); + auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); @@ -331,7 +340,7 @@ template class SlogDeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* slogdet = context.Input("Out"); const auto* grad = @@ -353,6 +362,10 @@ class SlogDeterminantGradKernel : public framework::OpKernel { input->dims().size() - grad->dims().size())); } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); @@ -361,9 +374,8 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); - dslogdet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); + phi::Full(dev_ctx, phi::vectorize(input->dims()), + std::numeric_limits::quiet_NaN(), dslogdet); return; } @@ -373,34 +385,25 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // we set dsl|A| = unsqueeze(dslA, [-1, -2]) * // inverse(A).conj().transpose(-2, -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! 
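// For reference, a standalone sketch of the conj().transpose(-2, -1) step in the
// identity above: for a complex matrix it is simply the Hermitian adjoint (for
// real dtypes the conjugation is a no-op). Values below are illustrative only.
#include <array>
#include <complex>
#include <cstdio>

int main() {
  using C = std::complex<double>;
  std::array<std::array<C, 2>, 2> M = {{{C(1, 2), C(3, -1)}, {C(0, 4), C(2, 2)}}};
  std::array<std::array<C, 2>, 2> Mh;
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      Mh[i][j] = std::conj(M[j][i]);  // conjugate, then swap the last two dims
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      std::printf("Mh[%d][%d] = %g%+gi\n", i, j, Mh[i][j].real(), Mh[i][j].imag());
  return 0;
}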
inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).conj() - framework::Tensor conj_inverse_A; - conj_inverse_A.Resize(inverse_A.dims()); - auto numel = input->numel(); - auto* conj_data = conj_inverse_A.mutable_data(context.GetPlace(), - size_t(numel * sizeof(T))); - - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ConjFunctor functor(inverse_A.data(), numel, conj_data); - for_range(functor); + auto conj_inverse_A = phi::Conj(dev_ctx, inverse_A); VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); // Third: inverse(A).conj().transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, conj_inverse_A); VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: " << transpose_inverse_A.dims(); @@ -417,12 +420,12 @@ class SlogDeterminantGradKernel : public framework::OpKernel { det_grad.Resize(det_grad.dims().reshape(det_grad_vec)); // Fifth: unsqueeze(dslA, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(det_grad, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(det_grad, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dslA) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims(); framework::TensorCopy(res, context.GetPlace(), dslogdet); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 0160277dc79af5..93fbff67e220bc 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -62,8 +62,8 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, - PT_INFER_META(phi::DiagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, + PD_INFER_META(phi::DiagInferMeta)); REGISTER_OPERATOR( diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, diff --git a/paddle/fluid/operators/diagonal_op.cc b/paddle/fluid/operators/diagonal_op.cc index 20813f8bb44e20..bf3cc941539eae 100644 --- a/paddle/fluid/operators/diagonal_op.cc +++ b/paddle/fluid/operators/diagonal_op.cc @@ -105,8 +105,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, - PT_INFER_META(phi::DiagonalInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, + PD_INFER_META(phi::DiagonalInferMeta)); REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker, ops::DiagonalGradOpMaker, diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 3a53f1365567f9..55b2484941293c 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -12,10 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/dist_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -121,13 +124,11 @@ class DistGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dist, DistInferShapeFunctor, + PD_INFER_META(phi::DistInferMeta)); + REGISTER_OPERATOR(dist, ops::DistOp, ops::DistOpMaker, ops::DistGradOpMaker, - ops::DistGradOpMaker); + ops::DistGradOpMaker, + DistInferShapeFunctor); REGISTER_OPERATOR(dist_grad, ops::DistOpGrad); -REGISTER_OP_CPU_KERNEL( - dist, ops::DistKernel, - ops::DistKernel); -REGISTER_OP_CPU_KERNEL( - dist_grad, ops::DistGradKernel, - ops::DistGradKernel) diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h deleted file mode 100644 index dfd7e29a8d0102..00000000000000 --- a/paddle/fluid/operators/dist_op.h +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenTensor = framework::EigenTensor; -using framework::Tensor; - -template -static void GetBraodcastDims(const framework::DDim& x_dims, - const framework::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0, - platform::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, y_dims)); -} - -static framework::DDim GetNewDims(const framework::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - auto p = context.Attr("p"); - out->mutable_data(context.GetPlace()); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = 
context.Input("Y")->dims(); - - // new dims with same size as rank, e.g. (rank=3, (4, 3) => (1, 4, 3)) - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - // p=0 means number of non-zero elements of (x-y) - // p=inf means the maximum of |x-y| - // p=-inf means the minimum of |x-y| - // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p) - if (p == 0) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims)) - .template cast() - .sum(); - } else if (p == INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .maximum(); - } else if (p == -INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .minimum(); - } else { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .pow(p) - .sum() - .pow(1.0 / p); - } -} - -template -static void DistGradFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Input("Out"); - auto p = context.Attr("p"); - - auto x_grad = context.Output(framework::GradVarName("X")); - auto y_grad = context.Output(framework::GradVarName("Y")); - auto out_grad = context.Input(framework::GradVarName("Out")); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - auto out_dims = context.Input("Out")->dims(); - - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - framework::DDim out_new_dims = GetNewDims(out_dims, Rank); - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out, out_new_dims); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes out_bcast_dims; - - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - std::vector new_dims_vec(Rank); - for (int i = 0; i < Rank; ++i) { - new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]); - out_bcast_dims[i] = new_dims_vec[i]; - } - framework::DDim new_dims = phi::make_ddim(new_dims_vec); - - auto& place = - *context.template device_context().eigen_device(); - auto out_grad_t = EigenTensor::From(*out_grad, out_new_dims); - framework::Tensor grad; - grad.mutable_data(new_dims, context.GetPlace()); - auto grad_t = EigenTensor::From(grad); - - auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims); - auto x_minux_y_abs = x_minux_y.abs(); - auto sign = - (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + - (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); - T epsilon = static_cast(1.0e-10f); - - // 1: Lp-norm(z), z = x-y, compute dz - if (p == 0) { - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, &grad, static_cast(0)); - } else if (p == INFINITY || p == -INFINITY) { - // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if - // j!=i, or equals to sign(z_i) * dout if j=i. 
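// For reference, a standalone sketch of the Lp distance defined above,
// out = (sum_i |x_i - y_i|^p)^(1/p), and of its elementwise gradient for a
// finite nonzero p, matching the "dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout"
// rule used below. Inputs here are illustrative only.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const double p = 3.0, dout = 1.0;
  std::vector<double> x = {1.0, -2.0, 0.5};
  std::vector<double> y = {0.0, 1.0, 0.5};
  double sum = 0.0;
  for (size_t i = 0; i < x.size(); ++i) sum += std::pow(std::fabs(x[i] - y[i]), p);
  const double out = std::pow(sum, 1.0 / p);
  std::printf("dist = %g\n", out);
  for (size_t i = 0; i < x.size(); ++i) {
    const double z = x[i] - y[i];
    const double sign = (z > 0.0) ? 1.0 : (z < 0.0 ? -1.0 : 0.0);
    std::printf("dz[%zu] = %g\n", i, std::pow(std::fabs(z) / out, p - 1.0) * sign * dout);
  }
  return 0;
}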
- if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } else { - // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } - - Eigen::DSizes x_reshape_dims; - Eigen::DSizes y_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < x_new_dims.size(); ++i) { - x_reshape_dims[2 * i] = x_bcast_dims[i]; - x_reshape_dims[2 * i + 1] = x_new_dims[i]; - y_reshape_dims[2 * i] = y_bcast_dims[i]; - y_reshape_dims[2 * i + 1] = y_new_dims[i]; - reduce_dims[i] = 2 * i; - } - - // 2: if x or y is broadcasted in forward function, - // the grad need to be sum along the broadcasted dimensions - if (x_grad) { - x_grad->mutable_data(context.GetPlace()); - auto x_grad_t = EigenTensor::From(*x_grad, x_new_dims); - x_grad_t.device(place) = grad_t.reshape(x_reshape_dims) - .sum(reduce_dims) - .reshape(x_grad_t.dimensions()); - } - if (y_grad) { - y_grad->mutable_data(context.GetPlace()); - auto y_grad_t = EigenTensor::From(*y_grad, y_new_dims); - y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims) - .sum(reduce_dims) - .reshape(y_grad_t.dimensions()); - } -} - -template -class DistKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistFunction(context); - break; - case 2: - DistFunction(context); - break; - case 3: - DistFunction(context); - break; - case 4: - DistFunction(context); - break; - case 5: - DistFunction(context); - break; - case 6: - DistFunction(context); - break; - } - } -}; - -template -class DistGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistGradFunction(context); - break; - case 2: - DistGradFunction(context); - break; - case 3: - DistGradFunction(context); - break; - case 4: - DistGradFunction(context); - break; - case 5: - DistGradFunction(context); - break; - case 6: - DistGradFunction(context); - break; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index a86a3bb35927d5..8efdd15781a6f2 100644 --- a/paddle/fluid/operators/dot_op.cc +++ 
b/paddle/fluid/operators/dot_op.cc @@ -101,8 +101,8 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, - PT_INFER_META(phi::DotInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, + PD_INFER_META(phi::DotInferMeta)); REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index dcdab033e8f801..144198367d538e 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,10 +32,9 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/dropout_impl_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -177,12 +176,13 @@ __global__ void DropoutGradCUDAKernel( } template -void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - bool is_test, +void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, const std::string dropout_implementation, float dropout_prob, bool upscale_in_train, - bool is_fix_seed, int seed_val, const Tensor& x, - const Tensor* seed, Tensor* mask, Tensor* y) { + bool is_fix_seed, int seed_val, + const framework::Tensor& x, + const framework::Tensor* seed, + framework::Tensor* mask, framework::Tensor* y) { auto& place = *dev_ctx.eigen_device(); int64_t x_numel = x.numel(); auto stream = dev_ctx.stream(); @@ -220,7 +220,8 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, // VectorizedRandomGenerator use curand_uniform4, so we only support // vec_size is 4; int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 
4 : 1; - auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; @@ -266,7 +267,8 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, cudaMemcpyDeviceToDevice, stream)); #endif } else { - T factor = static_cast(1.0f - dropout_prob); + using MT = typename details::MPTypeTrait::Type; + MT factor = static_cast(1.0f - dropout_prob); std::vector ins = {&x}; std::vector outs = {y}; auto functor = phi::funcs::ScaleFunctor(factor); @@ -277,11 +279,13 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } template -void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, +void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, const std::string dropout_implementation, - float dropout_prob, const Tensor& grad_y, - const Tensor& mask, int64_t size, - Tensor* grad_x, bool is_test = false) { + float dropout_prob, + const framework::Tensor& grad_y, + const framework::Tensor& mask, int64_t size, + framework::Tensor* grad_x, + bool is_test = false) { using MT = typename details::MPTypeTrait::Type; auto stream = dev_ctx.stream(); MT factor; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index d7db7dddce3887..c62d45570ba291 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, +inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const framework::Tensor* seed, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 7613b04bccfdc2..6d52ce45c4c100 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/dropout_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -177,14 +177,3 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, ops::DropoutGradOpMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); -REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel, - ops::CPUDropoutKernel, - ops::CPUDropoutKernel); -REGISTER_OP_CPU_KERNEL( - dropout_grad, - ops::DropoutGradKernel, - ops::DropoutGradKernel, - ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu deleted file mode 100644 index f6ddff1d0327d3..00000000000000 --- a/paddle/fluid/operators/dropout_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. -template -class GPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = context.cuda_device_context(); - auto* mask = context.Output("Mask"); - mask->mutable_data(context.GetPlace()); - - bool is_fix_seed = context.Attr("fix_seed"); - int seed_val = context.Attr("seed"); - DropoutFwGPUKernelDriver(dev_ctx, is_test, dropout_implementation, - dropout_prob, upscale_in_train, is_fix_seed, - seed_val, *x, seed, mask, y); - } -}; - -template -class GPUDropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - auto size = grad_x->numel(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - float dropout_prob = context.Attr("dropout_prob"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = - context.template device_context(); - DropoutGradGPUKernelDriver(dev_ctx, dropout_implementation, dropout_prob, - *grad_y, *mask, size, grad_x, is_test); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL( - dropout_grad, ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h deleted file mode 100644 index ea6ed0e6194747..00000000000000 --- a/paddle/fluid/operators/dropout_op.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -template -class CPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - const auto* x_data = x->data(); - auto* y_data = y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - if (!context.Attr("is_test")) { - auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = phi::product(mask->dims()); - - // Special case when dropout_prob is 1.0 - if (dropout_prob == 1.0f) { - std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT - std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT - return; - } - // std::minstd_rand engine; - // NOTE: fixed seed should only be used in unittest or for debug. - // Guarantee to use random seed in training. - int seed_data = 0; - if (seed) { - seed_data = *(seed->data()); - } else { - seed_data = - context.Attr("fix_seed") ? 
context.Attr("seed") : 0; - } - auto engine = framework::GetCPURandomEngine(seed_data); - - std::uniform_real_distribution dist(0, 1); - - for (size_t i = 0; i < size; ++i) { - if (dist(*engine) < dropout_prob) { - mask_data[i] = 0; - y_data[i] = 0; - } else { - mask_data[i] = 1; - if (upscale_in_train) { - y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); - } else { - y_data[i] = x_data[i]; - } - } - } - } else { - if (upscale_in_train) { - const auto* X_data = x->data(); - auto* Y_data = y->mutable_data(context.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < x->numel(); i++) { - Y_data[i] = X_data[i]; - } - } else { - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = - *context.template device_context().eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); - } - } - } -}; -template -class DropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(*grad_y); - - auto& place = - *context.template device_context().eigen_device(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - if (context.Attr("is_test") == true) { - if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; - } else { - float dropout_prob = context.Attr("dropout_prob"); - dX.device(place) = dY * static_cast(1.0f - dropout_prob); - } - } else { - auto M = EigenVector::Flatten(*mask); - if (dropout_implementation == "upscale_in_train") { - float dropout_prob = context.Attr("dropout_prob"); - if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; - } else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } - } else { - dX.device(place) = dY * M.cast(); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 6aae566760623c..07b3b538116257 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 206d9a6c5e9c98..bdf08646f1d8b9 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -24,14 +24,13 @@ limitations under the License. 
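// For reference, a standalone sketch of the "upscale_in_train" semantics of the
// CPU dropout kernel removed above: each element is kept with probability
// 1 - dropout_prob and scaled by 1/(1 - dropout_prob), so inference becomes a
// pass-through; the backward pass is then dx = dy * mask / (1 - dropout_prob).
// Seed and values below are illustrative only.
#include <cstdio>
#include <random>
#include <vector>

int main() {
  const float p = 0.5f;  // dropout_prob
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f}, y(x.size()), mask(x.size());
  std::mt19937 engine(42);  // fixed seed, analogous to the fix_seed/seed attributes
  std::uniform_real_distribution<float> uniform(0.f, 1.f);
  for (size_t i = 0; i < x.size(); ++i) {
    if (uniform(engine) < p) {
      mask[i] = 0.f;
      y[i] = 0.f;
    } else {
      mask[i] = 1.f;
      y[i] = x[i] / (1.f - p);
    }
  }
  for (size_t i = 0; i < x.size(); ++i)
    std::printf("y[%zu] = %g, mask[%zu] = %g\n", i, y[i], i, mask[i]);
  return 0;
}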
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(dropout); +USE_OP_ITSELF(dropout); void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // init diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 07b7e2cc7c09b0..7d8660f238abc8 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -8,15 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/dropout_op.h" + #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { #ifdef PADDLE_WITH_XPU +using Tensor = framework::Tensor; template class DropoutXPUKernel : public framework::OpKernel { using XPUTyp = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index e9c6c1eb7eced7..5e4c83e1a45ebd 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -18,12 +18,19 @@ #include #include #include "paddle/fluid/operators/math/matrix_solve.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + #define EPSILON 1e-6 namespace paddle { @@ -214,12 +221,17 @@ class EigKernel : public framework::OpKernel { ApplyEigKernel>( *x, &real_values, &real_vectors, context); - auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, phi::dtype::Real, Tout>(context); + + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); // 1. extract real part & imag part from real_values - Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); - Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); + Tensor real_part = + phi::funcs::Slice(dev_ctx, real_values, {-1}, {0}, {order}); + Tensor imag_part = phi::funcs::Slice(dev_ctx, real_values, {-1}, + {order}, {order * 2}); // 2. construct complex values auto* real_part_data = real_part.data>(); @@ -233,7 +245,8 @@ class EigKernel : public framework::OpKernel { for_range(functor); // 3. 
construct complex vectors - Tensor real_vector_trans = dito.Transpose(real_vectors); + Tensor real_vector_trans = + phi::TransposeLast2Dim(dev_ctx, real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); ConstructComplexVectors, Tout>( @@ -251,45 +264,48 @@ class EigKernel : public framework::OpKernel { } }; -template +template void ComputeBackwardForComplexInput( const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV, - Tout* x_grad_data, int batch_count, int order, + T* x_grad_data, int batch_count, int order, const framework::ExecutionContext& context) { - auto dito = - math::DeviceIndependenceTensorOperations( - context); - - Tensor trans_v = dito.Transpose(V); - Tensor Vh = dito.Conj(trans_v); - Tensor Lconj = dito.Conj(L); - Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1)); - Tensor VhgV = dito.Matmul(Vh, gV); - Tensor diag_real = dito.Real(VhgV); - Tensor diag_res = dito.BatchDiag(diag_real, batch_count); - Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2); + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + + Tensor trans_v = phi::TransposeLast2Dim(dev_ctx, V); + Tensor Vh = phi::Conj(dev_ctx, trans_v); + Tensor Lconj = phi::Conj(dev_ctx, L); + Tensor Econj = phi::Subtract(dev_ctx, phi::funcs::Unsqueeze(Lconj, -2), + phi::funcs::Unsqueeze(Lconj, -1)); + Tensor VhgV = phi::Matmul(dev_ctx, Vh, gV); + Tensor diag_real = phi::Real(dev_ctx, VhgV); + Tensor diag_res = phi::funcs::BatchDiag(dev_ctx, diag_real, batch_count); + Tensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); - auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( + auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), - static_cast(numel * sizeof(Tout))); - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, - numel); + static_cast(numel * sizeof(T))); + + platform::ForRange for_range(orig_dev_ctx, numel); + phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, + numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner - Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); - Tensor res2 = dito.Matmul(Vh, res1); - Tensor result = dito.Sub(VhgV, res2); + Tensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); + Tensor res2 = phi::Matmul(dev_ctx, Vh, res1); + Tensor result = phi::Subtract(dev_ctx, VhgV, res2); - result.mutable_data(V.dims(), context.GetPlace()); - result = dito.Div(result, Econj); - result = dito.DiagFill(order, order, order, 0, gL, result); - Tensor rhs = dito.Matmul(result, Vh); + result.mutable_data(V.dims(), context.GetPlace()); + result = phi::Divide(dev_ctx, result, Econj); + result = + phi::funcs::DiagFill(dev_ctx, order, order, order, 0, gL, result); + Tensor rhs = phi::Matmul(dev_ctx, result, Vh); // solve linear system // solve(Vh, rhs, out, m, k) @@ -298,10 +314,10 @@ void ComputeBackwardForComplexInput( // x_grad: out int m = Vh.dims()[Vh.dims().size() - 1]; int k = rhs.dims()[rhs.dims().size() - 1]; - auto* matrix_data = Vh.data(); 
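// For reference, a standalone sketch of steps 1-2 of EigKernel above: the
// real_values buffer holds the real parts in its first `order` entries and the
// imaginary parts in the next `order` entries (hence the two Slice calls), and
// complex eigenvalues are rebuilt from the two halves. Values are illustrative.
#include <complex>
#include <cstdio>
#include <vector>

int main() {
  const int order = 3;
  std::vector<double> real_values = {1.0, 2.0, 3.0,    // real parts
                                     0.5, -0.5, 0.0};  // imaginary parts
  std::vector<std::complex<double>> values(order);
  for (int i = 0; i < order; ++i)
    values[i] = {real_values[i], real_values[order + i]};
  for (int i = 0; i < order; ++i)
    std::printf("lambda[%d] = %g%+gi\n", i, values[i].real(), values[i].imag());
  return 0;
}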
- auto* rhs_data = rhs.data(); - math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, - batch_count); + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, + batch_count); } template diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc index 553d0e679cc6dd..4e33c567eb6d12 100644 --- a/paddle/fluid/operators/eigh_op.cc +++ b/paddle/fluid/operators/eigh_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eigh_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,42 +25,9 @@ using framework::Tensor; class EighOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", - "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", - "Eigh"); - - auto input_dim = ctx->GetInputDim("X"); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." - "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], input_dim[rank - 1], - platform::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." - "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - - ctx->SetOutputDim("Eigenvalues", phi::make_ddim(values_dim)); - ctx->SetOutputDim("Eigenvectors", input_dim); - } }; -class EignOpMaker : public framework::OpProtoAndCheckerMaker { +class EighOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", @@ -140,24 +110,11 @@ class EighGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eigh, EighInferShapeFunctor, + PD_INFER_META(phi::EighInferMeta)); -REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker, +REGISTER_OPERATOR(eigh, ops::EighOp, ops::EighOpMaker, ops::EighGradOpMaker, - ops::EighGradOpMaker); + ops::EighGradOpMaker, + EighInferShapeFunctor); REGISTER_OPERATOR(eigh_grad, ops::EighGradOp); - -REGISTER_OP_CPU_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CPU_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.cu b/paddle/fluid/operators/eigh_op.cu deleted file mode 100644 index 827c551637d4df..00000000000000 --- a/paddle/fluid/operators/eigh_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/eigh_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CUDA_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h deleted file mode 100644 index 5279ec750935c9..00000000000000 --- a/paddle/fluid/operators/eigh_op.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/eigen_values_vectors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EighKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto output_w = ctx.Output("Eigenvalues"); - auto output_v = ctx.Output("Eigenvectors"); - std::string lower = ctx.Attr("UPLO"); - bool is_lower = (lower == "L"); - math::MatrixEighFunctor functor; - functor(ctx, *input, output_w, output_v, is_lower, true); - } -}; - -template -class EighGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::dtype::Real; - auto& x_grad = *ctx.Output(framework::GradVarName("X")); - x_grad.mutable_data(ctx.GetPlace()); - auto& output_w = *ctx.Input("Eigenvalues"); - auto& output_v = *ctx.Input("Eigenvectors"); - auto& output_w_grad = - *ctx.Input(framework::GradVarName("Eigenvalues")); - auto& output_v_grad = - *ctx.Input(framework::GradVarName("Eigenvectors")); - - auto& dims = output_v.dims(); - const int m = dims[dims.size() - 1]; - auto dito = - math::DeviceIndependenceTensorOperations( - ctx); - auto tV = dito.Transpose(dito.Conj(output_v)); - auto W = dito.template Sub(dito.Unsqueeze(output_w, -2), - dito.Unsqueeze(output_w, -1)); - Tensor result = dito.Matmul(tV, output_v_grad); - result.mutable_data(dims, ctx.GetPlace()); - std::vector out_shape = phi::vectorize(dims); - auto constant = dito.Fill(out_shape, 0.5); - result = dito.Sub(result, dito.Conj(dito.Transpose(result))); - result = dito.Mul(result, constant); - result = dito.Div(result, W); - result = dito.DiagFill(m, m, m, 0, output_w_grad, result); - x_grad = dito.Matmul(output_v, dito.Matmul(result, tV)); - } -}; - -} // namespace operators -} // namespace paddle diff 
--git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 38cd232e4d1d22..13fd9b81a8765a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -102,42 +102,6 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad, ops::ElementwiseDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); - -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( R"ROC(Register elementwise_div for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu deleted file mode 100644 index 9eb4b0352e5337..00000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = ctx.template device_context(); - const auto place = ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, out, y}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {dout, out, y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor()); - } -} - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index c58a7f36548a57..e9adb9abdb528c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -20,142 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -void default_elementwise_sub(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - SubFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseSubFunctor(), z); - } -} - -template -void default_elementwise_div(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - DivFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseDivFunctor(), z); - } -} - -template -class ElementwiseDivKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); - auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); - auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); - phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), - *pt_x.get(), *pt_y.get(), axis, pt_z.get()); - } -}; - -template -struct DivGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } -}; - -template -struct DivGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template -struct DivGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -dout * out / y; - } -}; - -template -struct DivGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex out_div_y_conj((out / y).real, - -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template -struct DivDoubleDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - - ElemwiseGradCompute, DivGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseDivGradKernel : public ElemwiseGradKernel { - public: - void 
Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseDivGrad(ctx, x, y, out, dout, dx, dy); - } -}; - class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -206,80 +70,5 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { } }; -template -class ElementwiseDivDoubleGradKernel : public framework::OpKernel { - using Tensor = framework::Tensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Input("Out"); - auto* ddX = ctx.Input("DDX"); - auto* ddY = ctx.Input("DDY"); - auto* dX = ctx.Input("DX"); - - auto* dY = ctx.Output(framework::GradVarName("Y")); - auto* dOut = ctx.Output("DOut"); - auto* ddOut = ctx.Output("DDOut"); - - int axis = ctx.Attr("axis"); - - if (dY) dY->mutable_data(Y->dims(), ctx.GetPlace()); - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - // ddX_safe == null ? 0 : ddX - // ddY_safe == null ? 0 : ddY - Tensor ddX_safe, ddY_safe; - GetDoubleGradSafeTensor(ctx, dX, ddX, &ddX_safe); - GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); - - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - // dY = Out * dX * ddY / Y - dX * ddX / Y - // dOut = - dX * ddY - // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can - // inplace ddx - Tensor tmp; - if (dOut) { - tmp = *dOut; - } else { - auto& dev_ctx = ctx.template device_context(); - tmp = ctx.AllocateTmpTensor(Out->dims(), dev_ctx); - } - if (dY) { - // dX_div_Y = dX / Y; - Tensor dX_div_Y = tmp; - default_elementwise_div(ctx, dX, Y, &dX_div_Y); - - // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
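// For reference, a standalone numeric restatement of the double-grad formulas in
// the kernel removed above (with out = x / y and the first-order grads
// dx = dout / y, dy = -dout * out / y):
//   ddOut = (ddX - Out * ddY) / Y
//   dY    = Out * dX * ddY / Y - dX * ddX / Y
//   dOut  = -dX * ddY
// Scalar values below are illustrative only.
#include <cstdio>

int main() {
  const double x = 6.0, y = 2.0, dout = 1.0, ddx = 0.3, ddy = 0.7;
  const double out = x / y;
  const double dx = dout / y;  // first-order grad of x, the DX input above
  const double ddout = (ddx - out * ddy) / y;
  const double dy_new = out * dx * ddy / y - dx * ddx / y;
  const double dout_new = -dx * ddy;
  std::printf("ddOut = %g, dY = %g, dOut = %g\n", ddout, dy_new, dout_new);
  return 0;
}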
- - // dY = Out * dX * ddY / Y - dX * ddX / Y - ElemwiseGradCompute, DivDoubleDY>( - ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY, - DivGradDX(), DivDoubleDY()); - } - - if (ddOut) { - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - default_elementwise_mul(ctx, Out, &ddY_safe, &tmp); - default_elementwise_sub(ctx, &ddX_safe, &tmp, &tmp); - default_elementwise_div(ctx, &tmp, Y, ddOut); - } - - if (dOut) { - // dOut = - dX * ddY - default_elementwise_mul(ctx, dX, &ddY_safe, dOut); - auto& place = - *ctx.template device_context().eigen_device(); - auto dout = framework::EigenVector::Flatten(*dOut); - dout.device(place) = static_cast(-1) * dout; - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 86f5be3071c2d1..14baeaa74d2421 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -90,67 +90,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -template -struct DivGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - // dx = dout / y - // dy = - dout * out / y - phi::Array outs; - outs[0] = a / c; - outs[1] = -a * b / c; - return outs; - } -}; - -template -struct DivGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - Complex c_conj(c.real, -c.imag); - Complex out_div_c_conj((b / c).real, -(b / c).imag); - outs[0] = a / c_conj; - outs[1] = -a * out_div_c_conj; - return outs; - } -}; - -// Float div grad -template -struct DivGradXFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; - -// Complex div grad -template -struct DivGradXFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a / b_conj; - } -}; - -// Float mul and div -template -struct DivGradYFunctor { - inline HOSTDEVICE T operator()(const T a, const T b, const T c) const { - return -a * b / c; - } -}; - -// Complex mul and div -template -struct DivGradYFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b, - const Complex c) const { - Complex out_div_c_conj((b / c).real, -(b / c).imag); - return -a * out_div_c_conj; - } -}; - // Fmax template struct FMaxFunctor { @@ -257,47 +196,6 @@ struct MinGradXYFunctor { } }; -template -struct MulGradFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } -}; -template -struct MulGradFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a * b_conj; - } -}; - -template -struct MulGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - phi::Array outs; - // dx = dout * y - outs[0] = a * b; - // dy = dout * x - outs[1] = a * c; - return outs; - } -}; - -template -struct MulGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - // dx = dout * y - Complex b_conj(b.real, -b.imag); - outs[0] = a * b_conj; - // dy = dout * x - Complex c_conj(c.real, -c.imag); - outs[1] = a * c_conj; - return outs; - } -}; - // Ternary compare template struct MaxGradXFunctor { diff --git 
a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index e172279145e28c..830e09eeae4811 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseMulKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 45c87a27a180af..f7b9fd1e265f5d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -63,33 +63,6 @@ class ElementwiseMulKernel } }; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, y, x}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, MulGradFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {dout, x}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dy, MulGradFunctor()); - } -} - } // namespace operators } // namespace paddle @@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - 
ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c81266d584468f..58a3123c7e332f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -137,244 +137,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; -template -struct MulGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } -}; - -template -struct MulGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template -struct MulGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } -}; - -template -struct MulGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMulGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - 
auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - // dx = dout * ddy - // dy = dout * ddx - // ddout = ddx * y + x * ddy - // change computation sequence to save memory, so ddout can inplace ddx and - // dx can be used as 'tmp' tensor - // (1) dx = x * ddy - // (2) dy = dout * ddx - // (3) ddout = ddx * y - // (4) ddout = ddout + dx - // (5) dx = dout * ddy - if (ddout) { - int axis = ctx.Attr("axis"); - auto& place = - *ctx.template device_context().eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx->numel()) { - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, y, &ddx_safe, ddout); - default_elementwise_mul(ctx, &ddy_safe, x, - &ddout_tmp); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - // use dx to save memory, other than alloc tmp tensor - Tensor* ddout_tmp = dx; - - default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp); - // NOTE: in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, - MulGradDX(), MulGradDY()); - default_elementwise_mul(ctx, &ddx_safe, y, ddout); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - default_elementwise_mul(ctx, dout, &ddy_safe, dx); - } - } - } -}; - -template -class ElementwiseMulTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - // get input - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* d_dx = ctx.Input("D_DX"); - auto* d_dy = ctx.Input("D_DY"); - auto* d_ddout = ctx.Input("D_DDOut"); - - // get output - auto* out_d_x = ctx.Output("D_X"); - auto* out_d_y = ctx.Output("D_Y"); - auto* out_d_dout = ctx.Output("D_DOut"); - - auto* out_d_ddx = ctx.Output("D_DDX"); - auto* out_d_ddy = ctx.Output("D_DDY"); - - if (out_d_x) out_d_x->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_y) out_d_y->mutable_data(y->dims(), ctx.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(dout->dims(), ctx.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(y->dims(), ctx.GetPlace()); - - auto& place = *ctx.template device_context().eigen_device(); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - if (d_ddout) { - if (out_d_x) { - // out_d_x = ddy * d_ddout - default_elementwise_mul(ctx, &ddy_safe, d_ddout, - out_d_x); - } - if (out_d_y) { - // 
out_d_y = ddx * d_ddout - default_elementwise_mul(ctx, &ddx_safe, d_ddout, - out_d_y); - } - } - - if (out_d_dout) { - // get out_d_dout - // out_d_dout = ddy * d_dx + d_dy * ddx - Tensor out_d_dout_tmp; - out_d_dout_tmp.mutable_data(dout->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, d_dy, &ddx_safe, - out_d_dout); - default_elementwise_mul(ctx, &ddy_safe, d_dx, - &out_d_dout_tmp); - auto out_d_dout_t = framework::EigenVector::Flatten(*out_d_dout); - auto out_d_dout_tmp_t = - framework::EigenVector::Flatten(out_d_dout_tmp); - out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t; - } - - if (out_d_ddx) { - // get out_d_ddx - // out_d_ddx = dout * d_dy + y * d_ddout - Tensor out_d_ddx_tmp; - out_d_ddx_tmp.mutable_data(ddx->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dy, out_d_ddx); - default_elementwise_mul(ctx, y, d_ddout, - &out_d_ddx_tmp); - auto out_d_ddx_t = framework::EigenVector::Flatten(*out_d_ddx); - auto out_d_ddx_tmp_t = framework::EigenVector::Flatten(out_d_ddx_tmp); - out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t; - } - - if (out_d_ddy) { - // get out_d_ddy - // out_d_ddy = dout * d_dx + x * d_ddout - Tensor out_d_ddy_tmp; - out_d_ddy_tmp.mutable_data(ddy->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dx, out_d_ddy); - default_elementwise_mul(ctx, x, d_ddout, - &out_d_ddy_tmp); - auto out_d_ddy_t = framework::EigenVector::Flatten(*out_d_ddy); - auto out_d_ddy_tmp_t = framework::EigenVector::Flatten(out_d_ddy_tmp); - out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 61862aa9f87408..80b07721f0b4d1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -45,6 +45,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif @@ -145,17 +146,9 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, const framework::Tensor &dout, int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - const framework::DDim &x_dim = x.dims(); - const framework::DDim &y_dim = y.dims(); const auto &dev_ctx = ctx.template device_context(); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { - phi::funcs::ElemwiseGradComputeWithBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } + phi::funcs::ElemwiseGradCompute( + dev_ctx, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } // It is a common implementation to compute binary calculation with the support @@ -1174,14 +1167,6 @@ static inline std::vector GetReduceDim(const framework::DDim &in, } #if defined(__NVCC__) || defined(__HIPCC__) -template -void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, - framework::Tensor *src, framework::Tensor *dst) { - std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); - TensorReduceImpl>( - dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, - dev_ctx.stream()); -} template void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, @@ -1189,36 +1174,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dx, framework::Tensor *dy, Functor func) { - framework::Tensor tmp_dx; - framework::Tensor tmp_dy; - dx->mutable_data(place); - dy->mutable_data(place); - std::vector outs; - if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) { - outs = {dx, dy}; - } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, dy}; - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - outs = {dx, &tmp_dy}; - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, &tmp_dy}; - } - - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, func); - - if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } + phi::GetGradXAndYOut(dev_ctx, place, axis, ins, *dout, dx, dy, + func); } template @@ -1227,22 +1184,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dxy, Functor func) { - framework::Tensor tmp_dxy; - dxy->mutable_data(place); - - std::vector outs; - if (dxy->dims() != dout->dims()) { - tmp_dxy.mutable_data(dout->dims(), place); - outs = {&tmp_dxy}; - } else { - outs = {dxy}; - } - - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); - if (dxy->dims() != dout->dims()) { - 
ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); - } + phi::GetGradXOrYOut(dev_ctx, place, axis, ins, *dout, dxy, + func); } #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index fc128a88f2096a..3e9263fe93acd9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 9aa206efed8c01..7890d634e99417 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -28,7 +28,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index 6baa504562e76f..9e0e4e7fe1c6d2 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -88,11 +88,8 @@ class EmptyOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, - PT_INFER_META(phi::CreateInferMeta)); - -REGISTER_OPERATOR( - empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - EmptyInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, + PD_INFER_META(phi::CreateInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(empty, ops::EmptyOp, ops::EmptyOpMaker, + ops::EmptyOpVarTypeInference, + EmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc index 3d409b4c4f6772..374b00792622f9 100644 --- a/paddle/fluid/operators/erfinv_op.cc +++ b/paddle/fluid/operators/erfinv_op.cc @@ -73,8 +73,8 @@ DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR( erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker, diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index cdd4e1dbaae6a6..df00ae54c1036b 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index f8c6b4eb8c5e09..537c218d357b67 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -67,8 +67,8 @@ Return an identity tensor whose shape is [num_rows, num_columns]. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, - PT_INFER_META(phi::EyeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, + PD_INFER_META(phi::EyeInferMeta)); REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 79018f2a97448a..cb03add3143278 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel { tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; -#if (CANN_VERSION_CODE >= 503003) +#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001) runner.SetType("FillD") .AddInput(tensor_value) .AddOutput(*out_var) diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu new file mode 100644 index 00000000000000..508730c3c7335d --- /dev/null +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -0,0 +1,655 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 + +#if defined(PADDLE_WITH_CUDA) +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/operators/filter_by_instag_op.h" + +#if defined(PADDLE_WITH_CUDA) +namespace cg = cooperative_groups; +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; + +template +using Vector = framework::Vector; + +#define WARP_SIZE 32 +#define MAX_WARP_NUM 32 + +#if defined(PADDLE_WITH_CUDA) + +template +__global__ void filter_copy_fuse_kernel( + const size_t N, const int ins_per_thread, size_t* x1_lods_data, + size_t* x2_lods_data, const int64_t* x2_data, const int64_t* x3_data, + int64_t filter_tag_size, T* out_data, int64_t* map_data, + size_t* map_lods_data, size_t* out_lods_data, size_t* out_idx_data, + const T* x1_data, int x1_embed_size, float* loss_weight_data, + float fill_value) { + // N is instance num + // one threads for ins_per_thread instances + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + + int gid = idx / WARP_SIZE; + + // general use + int thread_num = + (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num + int total_warp_num = thread_num / WARP_SIZE; // 30 + int remain_thread_num = thread_num % WARP_SIZE; // 16 + + int warp_thread_num = -1; + if (gid < total_warp_num) { + warp_thread_num = WARP_SIZE; + } else { + warp_thread_num = remain_thread_num; + } + + int group_num = total_warp_num; + if (remain_thread_num > 0) { + group_num = total_warp_num + 1; + } + + if (gid >= group_num) return; + + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (N < ins_end) ins_end = N; + + /* + if (!x1_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x1_lods_data[p] = p; + } + if (idx == 0) { + x1_lods_data[N] = N; + } + } + + if (!x2_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x2_lods_data[p] = p; + } + if (idx == 0) { + x2_lods_data[N] = N; + } + } + + if (!x1_lods_filled || !x2_lods_filled) { + b.sync(); + } + */ + + int flag_data[5]; + int prefix_sum_data[5]; + int prefix_sum_data2[5]; + + __shared__ int shr[MAX_WARP_NUM]; + __shared__ int shr2[MAX_WARP_NUM]; + __shared__ int shr3[MAX_WARP_NUM]; + + for (int p = ins_start; p < ins_end; p++) { + int ins_tag_start = x2_lods_data[p]; + int ins_tag_end = x2_lods_data[p + 1]; + flag_data[p - ins_start] = 0; + // filter logic + int i = ins_tag_start; + for (; i < ins_tag_end; i++) { + int64_t ins_tag = x2_data[i]; + int j = 0; + for (; j < filter_tag_size; j++) { + if (x3_data[j] == ins_tag) break; + } + // if ins_tag in filter tag + if (j < filter_tag_size) { + flag_data[p - ins_start] = 1; + break; + } + } + } + + int sum_addr = 0; + int sum_flag = 0; + int sum_out_lods = 0; + + int local_addr = 0; + int local_flag = 0; + int local_out_lods = 0; + + if (ins_start < ins_end) { + for (int p = ins_start; p < ins_end; p++) { + int previous = -1; + if (p == ins_start) { + previous = 0; + } else { + previous = 
prefix_sum_data[p - ins_start - 1]; + } + + prefix_sum_data[p - ins_start] = + previous + + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + local_addr = prefix_sum_data[ins_end - 1 - ins_start]; + sum_addr = local_addr; + + // flag + // local_flag = 0; + for (int p = ins_start; p < ins_end; p++) { + local_flag += flag_data[p - ins_start]; + } + sum_flag = local_flag; + + for (int p = ins_start; p < ins_end; p++) { + local_out_lods += + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + sum_out_lods = local_out_lods; + } + + // 32 threads + for (int i = 1; i < warp_thread_num; i *= 2) { + int temp_addr = g.shfl_up(sum_addr, i); + int temp_flag = g.shfl_up(sum_flag, i); + int temp_out_lods = g.shfl_up(sum_out_lods, i); + + if (g.thread_rank() >= i) { + sum_addr += temp_addr; + sum_flag += temp_flag; + sum_out_lods += temp_out_lods; + } + } + + if (g.thread_rank() == warp_thread_num - 1) { + shr[gid] = sum_addr; + shr2[gid] = sum_flag; + shr3[gid] = sum_out_lods; + } + + b.sync(); + + int sum_addr2 = 0; + int sum_flag2 = 0; + int sum_out_lods2 = 0; + + // communicate between warp + if (g.thread_rank() < group_num) { + sum_addr2 = shr[g.thread_rank()]; + sum_flag2 = shr2[g.thread_rank()]; + sum_out_lods2 = shr3[g.thread_rank()]; + } + + for (int i = 1; i < group_num; i *= 2) { + int temp_addr2 = g.shfl_up(sum_addr2, i); + int temp_flag2 = g.shfl_up(sum_flag2, i); + int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); + + if (g.thread_rank() >= i) { + sum_addr2 += temp_addr2; + sum_flag2 += temp_flag2; + sum_out_lods2 += temp_out_lods2; + } + } + + int sum_addr3 = g.shfl(sum_addr2, gid); + int sum_flag3 = g.shfl(sum_flag2, gid); + int sum_out_lods3 = g.shfl(sum_out_lods2, gid); + + int p_flag; + int p_addr; + int p_out_lods; + + if (ins_start < ins_end) { + p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; + p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; + p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; + + for (int p = ins_start; p < ins_end; p++) { + if (ins_start == p) { + prefix_sum_data2[p - ins_start] = p_addr; + } else { + prefix_sum_data2[p - ins_start] = + prefix_sum_data2[p - ins_start - 1] + + flag_data[p - ins_start - 1] * + (x1_lods_data[p] - x1_lods_data[p - 1]); + } + } + + if (gid == 0 && g.thread_rank() == group_num - 1) { + *out_idx_data = (sum_flag2 + 1); + map_lods_data[sum_flag2] = sum_flag2; + } + } + + int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); + + if (ins_start < ins_end) { + int out_lods_idx = p_flag + 1; + + // ins_start = 1 + // BUG fix + for (int p = ins_start; p < ins_end; p++) { + if (flag_data[p - ins_start] == 1) { + // batch_len = 2 + // batch_len = 4 + size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; + // t = 0 + // t = 1 + int t = out_lods_idx - 1; + // out_lods_data[0] = 0; + int previous; + + if (out_lods_idx == p_flag + 1) { + // out_lods_data[t] = p_out_lods; + previous = p_out_lods; + } else { + previous = out_lods_data[t]; + } + + map_data[t * 3] = (int64_t)previous; + map_data[t * 3 + 1] = x1_lods_data[p]; + map_lods_data[t] = t; + out_lods_data[out_lods_idx] = previous + batch_len; + map_data[t * 3 + 2] = batch_len; + out_lods_idx++; + } + } + + // fill loss_weight_data + if (sum_out_lods4 > 1) { + int out_data_num = sum_out_lods4 - 1; + int out_start = ins_start; + + if (out_start < out_data_num) { + int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; + for (int p = out_start; p < out_end; p++) { + loss_weight_data[p] = fill_value; + } + } + } + + for (int p = ins_start; p < ins_end; p++) { + // copy logic + if (flag_data[p - ins_start] == 1) { + auto output_start_idx = prefix_sum_data2[p - ins_start]; + T* dst = out_data + output_start_idx * x1_embed_size; + + const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; + const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; + + // optimized + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } + } + } + + b.sync(); +} + +template +__global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, + const T* out_grad_data, T* x1_grad_data, + const int64_t* map_data, int x1_embed_size) { + // N is instance num + // one threads for one instance + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (ins_start >= N) { + return; + } + if (ins_end > N) ins_end = N; + + for (int p = ins_start; p < ins_end; p++) { + T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; + const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; + const T* src_end = + out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; + + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } +} + +#endif + +template +class FilterByInstagGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + + gpuStream_t current_stream = context.cuda_device_context().stream(); + + int max_thread_num_per_block = 1024; + // context.cuda_device_context().GetMaxThreadsPerBlock(); + // X1 is global FC output + // Dim [batch size, embedding size] + const LoDTensor* x1 = context.Input("Ins"); + bool is_lod = context.Attr("is_lod"); + + int is_x1_lod = -1; + if (is_lod) + is_x1_lod = 1; + else + is_x1_lod = 0; + + int64_t out_val_if_empty = context.Attr("out_val_if_empty"); + size_t x1_embed_size = x1->dims()[1]; + // X2 is ins tag list + // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] + const LoDTensor* x2 = context.Input("Ins_tag"); + // expected auto = const int64_t + const int64_t* x2_data = x2->data(); + + // X3 is local fc tag list + // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] + const Tensor* x3 = context.Input("Filter_tag"); + const int64_t* x3_data = x3->data(); + + // int x2_lods_filled = 1; + + Vector x2_lods; + // Vector, in GPU + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + // x2_lods_filled = 1; + + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + // x2_lods.resize(x2->dims()[0] + 1); + // move to cuda + x2_lods.push_back(0); + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(i + 1); + } + } + + const size_t x2_lods_size = x2_lods.size() - 1; + paddle::framework::MixVector mixv_x2_lods(&x2_lods); + + size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); + + // Vector, in GPU + // int x1_lods_filled = 1; + Vector x1_lods; + + if (!is_x1_lod) { + // move to cuda + // x1_lods.resize(x1->dims()[0] + 1); + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } else { + // x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { // lod_level = 1 + // x1_lods_filled = 1; + x1_lods = x1->lod()[0]; + } else { // lod_level = 0 + // x1_lods.resize(x1->dims()[0] + 1); + // move to cuda + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } + } + + paddle::framework::MixVector mixv_x1_lods(&x1_lods); + + size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); + auto* x1_data = x1->data(); + + // set output value + // for those whose ins been dropout, set 0 for whole lines. + // otherwise, copy whole line + // Dim [local fc count, batch size, embedding size] + LoDTensor* out = context.Output("Out"); + LoDTensor* map = context.Output("IndexMap"); + LoDTensor* loss_weight = context.Output("LossWeight"); + + int out_first = x1_lods.back(); + // int out_first = x1->dims()[0]; + // if (x1_lods_filled) { + // out_first = x1_lods.back(); + // } + + out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); + loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); + + T* out_data = out->mutable_data(gpu_place); + int64_t* map_data = map->mutable_data(gpu_place); + float* loss_weight_data = loss_weight->mutable_data(gpu_place); + + int block_size = max_thread_num_per_block; + int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; + dim3 block_dim(block_size); + dim3 grid_dim(1); + + Vector out_lods(x2_lods_size + 1, 0); + Vector map_lods(x2_lods_size + 1, 0); + + paddle::framework::MixVector mixv_out_lods(&out_lods); + paddle::framework::MixVector mixv_map_lods(&map_lods); + + // thrust::device_vector out_idx(1); + Vector out_idx(1, 0); + paddle::framework::MixVector mixv_out_idx(&out_idx); + + size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); + size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); + size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); + + float fill_value = 1.0; + + filter_copy_fuse_kernel<<>>( + x2_lods_size, ins_per_thread, x1_lods_data, x2_lods_data, x2_data, + x3_data, x3->numel(), out_data, map_data, map_lods_data, out_lods_data, + out_idx_data, x1_data, x1_embed_size, loss_weight_data, fill_value); + + platform::GpuStreamSync(current_stream); + + mixv_out_lods.resize(mixv_out_idx[0]); + + 
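The out_lods consumed here are produced by the warp-level inclusive scan inside filter_copy_fuse_kernel. As a reading aid, a stripped-down version of that scan pattern is sketched below; it assumes the cooperative_groups header and the cg alias declared earlier in this file, and a full 32-lane tile rather than the kernel's partial-warp handling.

#if defined(PADDLE_WITH_CUDA)
__device__ inline int WarpInclusiveScanSketch(cg::thread_block_tile<32> g,
                                              int val) {
  // After log2(32) doubling steps every lane holds the inclusive prefix
  // sum of the tile; the kernel uses the same idea to derive per-thread
  // output offsets before the copy phase.
  for (int i = 1; i < 32; i *= 2) {
    int up = g.shfl_up(val, i);
    if (static_cast<int>(g.thread_rank()) >= i) val += up;
  }
  return val;
}
#endif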
if (mixv_out_lods.size() - 1 > 0) { + out->Resize(phi::make_ddim( + {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); + + map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); + loss_weight->Resize( + phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); + + } else { + out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({1, 3})); + loss_weight->Resize(phi::make_ddim({1, 1})); + } + + if (mixv_out_lods.size() - 1 > 0) { + map_lods.resize(mixv_out_lods.size()); + + mixv_map_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + + mixv_out_lods.CopyToCPU(); + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + } else { + Vector map_lods(2, 0); + paddle::framework::MixVector mixv_map_lods(&map_lods); + thrust::device_ptr map_data_ptr(map_data); + + map_data_ptr[0] = 0; + map_data_ptr[1] = 1; + map_data_ptr[2] = 1; + + mixv_map_lods[0] = 0; + mixv_map_lods[1] = 1; + mixv_out_lods.push_back(1); + + mixv_map_lods.CopyToCPU(); + mixv_out_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + map->set_lod(map_lod_info); + + loss_weight->set_lod(map_lod_info); + + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + thrust::device_ptr out_data_ptr(out_data); + + // gpu kernel + if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } + + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + loss_weight_data_ptr[0] = 0; + } + +#endif + } +}; + +template +class FilterByInstagGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + gpuStream_t current_stream = context.cuda_device_context().stream(); + auto max_thread_num_per_block = 1024; + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* x1_grad = context.Output(framework::GradVarName("Ins")); + auto* loss_weight = context.Input("LossWeight"); + auto* mmap = context.Input("IndexMap"); + auto* x1 = context.Input("Ins"); + + x1_grad->set_lod(context.Input("Ins")->lod()); + x1_grad->Resize(x1->dims()); + + auto* mmap_data = mmap->data(); + // expected auto = T + auto* output_grad_data = output_grad->data(); + auto* loss_weight_data = loss_weight->data(); + + // expected auto = T + auto* x1_grad_data = x1_grad->mutable_data(gpu_place); + thrust::device_ptr x1_grad_data_ptr(x1_grad_data); + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + + thrust::fill(x1_grad_data_ptr, + x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); + + if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { + auto output_dims = output_grad->dims(); + int x1_embed_size = output_dims[1]; + + // one thread for multi-instances + int block_size = max_thread_num_per_block; + + size_t N = mmap->dims()[0]; + dim3 block_dim(block_size); + + dim3 grid_dim((N + block_size - 1) / 
block_size); + + const int ins_per_thread = 1; + + copy_grad_kernel<<>>( + N, ins_per_thread, output_grad_data, x1_grad_data, mmap_data, + x1_embed_size); + + cudaStreamSynchronize(current_stream); + } + +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(filter_by_instag, ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel); + +REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel); diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index deb2aa96b539e3..3abc980ceaafc3 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -61,7 +61,20 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] - size_t x2_lods_size = x2->dims()[0]; + // size_t x2_lods_size = x2->dims()[0]; + // size_t instag_num_per_ins = x2->dims()[1]; + + Vector x2_lods(1, 0); + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_num_per_ins = x2->dims()[1]; + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_num_per_ins); + } + } + Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { @@ -79,8 +92,8 @@ class FilterByInstagKernel : public framework::OpKernel { } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods_size; i++) { - for (size_t j = i; j < i + 1; j++) { + for (size_t i = 0; i < x2_lods.size() - 1; i++) { + for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; @@ -165,8 +178,10 @@ class FilterByInstagKernel : public framework::OpKernel { out_data[oi] = (int32_t)out_val_if_empty; } else if (std::is_same::value) { out_data[oi] = (int64_t)out_val_if_empty; - } else { + } else if (std::is_same::value) { out_data[oi] = static_cast(out_val_if_empty); + } else { + out_data[oi] = static_cast(out_val_if_empty); } } loss_weight_data[0] = 0; diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 40ec9aef190ff4..92f59e118c3b7b 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -95,6 +95,17 @@ class FoldOp : public framework::OperatorWithKernel { "but recieved strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations + PADDLE_ENFORCE_GT(output_height, 1, + platform::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but recieved output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, 1, + platform::errors::InvalidArgument( + "The `output_width` should be greater than one, " + "but recieved output_width: %d .", + output_width)); + // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( @@ -146,7 +157,7 @@ class FoldOp : public framework::OperatorWithKernel { output_width)); PADDLE_ENFORCE_EQ( - blocks_height * blocks_width, in_dims[1], + blocks_height * blocks_width, in_dims[2], 
platform::errors::InvalidArgument( "Given input output_size (%d, %d), " "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " @@ -156,6 +167,15 @@ class FoldOp : public framework::OperatorWithKernel { strides[0], strides[1], dilations[0], dilations[1], blocks_height, blocks_width, blocks_height * blocks_width, in_dims[2])); + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), 0, + platform::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], kernel_sizes[0], kernel_sizes[1])); + out_dims.push_back(output_height); out_dims.push_back(output_width); ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 67287afa6ae505..80e7f5c001d4b8 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -19,7 +19,8 @@ register_operators(EXCLUDES fused_attention_op fused_transformer_op fused_feedforward_op - resnet_unit_op) + resnet_unit_op + fused_gemm_epilogue_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -79,4 +80,8 @@ if (WITH_GPU OR WITH_ROCM) cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() + + if (CUDA_VERSION GREATER_EQUAL 11.6) + op_library(fused_gemm_epilogue_op) + endif() endif() diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index b3792a176fabeb..a80f590aa495db 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -405,8 +405,18 @@ TEST(CudnnNormConvFp16, K1S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 3, output_channels = input_channels @@ -421,8 +431,18 @@ TEST(CudnnNormConvFp16, K3S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, output_channels = input_channels * 4 @@ -437,8 +457,18 @@ 
TEST(CudnnNormConvFp16, K1S1O4) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 02027767579735..3c9e16785eac81 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -140,9 +140,9 @@ class FMHARef { if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( - dev_ctx_, dropout_param_.is_test_, - static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + dropout_param_.is_test_, static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, @@ -242,8 +242,9 @@ class FMHARef { // dropout bw if (dropout_param_.dropout_prob_) { DropoutGradGPUKernelDriver( - dev_ctx_, static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, static_cast(*dropout_out_grad_tensor), dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index d7952df470d815..18c7187fc8e64c 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -31,7 +31,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; -USE_OP(dropout); +USE_OP_ITSELF(dropout); USE_OP(layer_norm); template diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 0c8eae4260441f..f3f8f174275778 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("dropout1_seed", "Dropout1 random seed.").SetDefault(0); AddAttr("dropout2_seed", "Dropout2 random seed.").SetDefault(0); + AddAttr("ring_id", "ring id for tensor model parallel.") + .SetDefault(-1); AddComment(R"DOC( the function of fused_feedforward operator is the same as the following pseudo code: residual = src; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 3131269955bdd1..c38d9f7d4bcbd2 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -21,11 +21,39 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor& tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext& ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void* sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void* recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedFeedForwardKernel : public framework::OpKernel { public: @@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor* dropout1_out, framework::Tensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, const std::string& act_method, const bool pre_layer_norm, - const float epsilon1, const float epsilon2, + const float epsilon1, const float epsilon2, const int ring_id, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const platform::CUDADeviceContext& ctx) const { @@ -95,6 +123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor linear2_out; linear2_out.mutable_data({bsz_seq, d_model}, place); MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + + // tensor model parallel + AllReduce(linear2_out, ring_id, ctx); + if (!pre_layer_norm) { fused_dropout_layernorm_helper.LayernormResidualDropoutBias( ctx, linear2_out.data(), x.data(), linear2_bias_ptr, @@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance, linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model, dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2, - dropout_param1, dropout_param2, context.cuda_device_context()); + ring_id, dropout_param1, dropout_param2, context.cuda_device_context()); } }; @@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const int dim_feedforward, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const std::string& act_method, const bool pre_layer_norm, const float epsilon1, const float epsilon2, - const platform::CUDADeviceContext& ctx) const { + const int ring_id, const platform::CUDADeviceContext& ctx) const { FusedDropoutLayerNormHelper pre_layernorm_helper( bsz_seq, d_model, epsilon1); FusedDropoutHelper fused_act_dropout_helper( @@ -295,13 +328,16 @@ class 
FusedFeedForwardGradKernel : public framework::OpKernel { d_ln1_out.mutable_data({bsz_seq, d_model}, place); MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out, d_linear1_weight); - + // tensor model parallel + AllReduce(d_ln1_out, ring_id, ctx); pre_layernorm_helper.LayerNormGrad( ctx, d_ln1_out.data(), x.data(), ln1_gamma_ptr, ln1_mean->data(), ln1_variance->data(), d_x->data(), d_ln1_gamma_ptr, d_ln1_beta_ptr); } else { MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + // tensor model parallel + AllReduce(*d_x, ring_id, ctx); } std::vector ins(2); std::vector outs(1); @@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); const std::string act_method = context.Attr("act_method"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale, d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model, dim_feedforward, dropout_param1, dropout_param2, act_method, - pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context()); + pre_layer_norm, epsilon1, epsilon2, ring_id, + context.cuda_device_context()); } }; } // namespace operators diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc new file mode 100644 index 00000000000000..4c4e3661e6d6ed --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -0,0 +1,353 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedGemmEpilogueOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Bias"), "Output", "Bias", + "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FusedGemmEpilogueOp"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto bias_dims = ctx->GetInputDim("Bias"); + + auto trans_x = ctx->Attrs().Get("trans_x"); + auto trans_y = ctx->Attrs().Get("trans_y"); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + bias_dims.size(), 1, + platform::errors::InvalidArgument( + "The Input tensor bias's dimension of FusedGemmEpilogueOp " + " should be == 1, but got %d.", + bias_dims.size())); + + PADDLE_ENFORCE_EQ(bias_dims[0], trans_y ? y_dims[0] : y_dims[1], + platform::errors::InvalidArgument( + "The Input tensor bias's dimension 0" + " should be == Y[-1], but got bias's shape = [%s] " + "and Y's shape = [%s]", + bias_dims, y_dims)); + + auto x_mat_dims = + phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1); + + int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1]; + int K_from_y = trans_y ? y_dims[1] : y_dims[0]; + + PADDLE_ENFORCE_EQ( + K_from_x, K_from_y, + platform::errors::InvalidArgument( + "The last dimension of X should be equal with Y's first dimension." + "But received X[-1] = [%d], Y[0] = [%d].", + K_from_x, K_from_y)); + + auto activation = ctx->Attrs().Get("activation"); + + if ((activation != "relu") && (activation != "gelu") && + (activation != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation)); + } + + if (activation == "none" && ctx->HasOutput("ReserveSpace")) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The ReserveSpace would not be used when activation = \"none\"")); + } + + // cublasLt's restriction for auxiliary. + if (ctx->HasOutput("ReserveSpace") && activation != "none") { + int min_size_of_n = activation == "relu" ? 128 : 8; + int N_size = trans_y ? 
y_dims[0] : y_dims[1]; + PADDLE_ENFORCE_EQ(N_size % min_size_of_n, 0, + platform::errors::InvalidArgument( + "The output dimension N (X(MxK) * Y(KxN) = C(MxN)) " + "should be multiple of %d when auxiliary_key given " + "and activation=%s, but got N = %d.", + min_size_of_n, activation, N_size)); + } + + std::vector out_dims; + out_dims.reserve(static_cast(x_dims.size())); + if (trans_x) { + for (int i = 1; i < x_dims.size(); ++i) out_dims.push_back(x_dims[i]); + } else { + for (int i = 0; i < x_dims.size() - 1; ++i) out_dims.push_back(x_dims[i]); + } + + if (trans_y) { + out_dims.push_back(y_dims[0]); + } else { + out_dims.push_back(y_dims[1]); + } + + ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + // Note (Ming Huang): Reserve space of relu is a bit-mask, + // which cannot pass nan_and_inf checking if shape is set. + if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) { + ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor X of Out = Act((X * Y) + Bias)."); + AddInput("Y", "The input tensor Y of Out = Act((X * Y) + Bias)."); + AddInput("Bias", "The input tensor bias of Out = Act((X * Y) + Bias)."); + + AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias)."); + AddOutput("ReserveSpace", + R"DOC(Reserve GPU space to place + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue op. If not given (empty string), the + auxiliary mode would not be enable.)DOC") + .AsDispensable() + .AsExtra(); + + AddAttr( + "trans_x", + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X coulbe be more than two dimension. When + set trans_x=true, it would fully reverse X. For instant: X with shpae + [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") + .SetDefault(false); + AddAttr( + "trans_y", + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two dimension. When + set trans_y=true, it would transpose Y. For instant: Y with shpae + [d0, d1] -> [d1, d0].)DOC") + .SetDefault(false); + + AddAttr( + "activation", + R"DOC((string, default none), The activation function. It could be + one of {none, relu, gelu}. When none is given, Act would be null + operations)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogue Operator +This operator is used to perform Activeation(Elementwise_add(Matmul(X, Y), bias)). +It is equal to paddle.nn.Linear + Activation (None, ReLU or GeLU). + +Note: +X could be more than two dimension and would be flatten to 2D for computing. 
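The flattening rule noted in the operator comment above can be made concrete with a small reference sketch (an editor's illustration, not Paddle code; the helper name naive_gemm_epilogue and the row-major, no-transpose layout are assumptions): X is viewed as a 2-D matrix [M, K], Y is [K, N], Bias has length N, and Out = Act(X_2D * Y + Bias) with the bias broadcast over rows and Act one of none, relu, gelu.

#include <cassert>
#include <cmath>
#include <cstddef>
#include <string>
#include <vector>

// Reference semantics (sketch): Out = Act(X_2D * Y + Bias), row-major, no transposes.
// X_2D is X flattened to [M, K]; Bias has length N and is broadcast over the M rows.
std::vector<float> naive_gemm_epilogue(const std::vector<float>& x_2d,   // M x K
                                       const std::vector<float>& y,      // K x N
                                       const std::vector<float>& bias,   // N
                                       std::size_t M, std::size_t K, std::size_t N,
                                       const std::string& act) {
  assert(x_2d.size() == M * K && y.size() == K * N && bias.size() == N);
  std::vector<float> out(M * N, 0.0f);
  for (std::size_t m = 0; m < M; ++m) {
    for (std::size_t n = 0; n < N; ++n) {
      float acc = bias[n];  // the epilogue bias term
      for (std::size_t k = 0; k < K; ++k) acc += x_2d[m * K + k] * y[k * N + n];
      if (act == "relu") {
        acc = acc > 0.0f ? acc : 0.0f;
      } else if (act == "gelu") {  // tanh approximation of GELU
        acc = 0.5f * acc *
              (1.0f + std::tanh(0.7978845608f * (acc + 0.044715f * acc * acc * acc)));
      }  // act == "none" leaves acc unchanged
      out[m * N + n] = acc;
    }
  }
  return out;
}

This mirrors the comment's statement that the operator is equivalent to paddle.nn.Linear followed by an optional activation.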
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("DOut"), "Input", "DOut", + "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasOutput("DY"), "Output", "DY", "FusedGemmEpilogueOp"); + + auto dout_dims = ctx->GetInputDim("DOut"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_GE( + dout_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor DOut's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + dout_dims.size())); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueGradOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + dout_dims.size(), x_dims.size(), + platform::errors::InvalidArgument( + "The Input tensor DOut's and X's dimension of " + "FusedGemmEpilogueGradOp " + " should be the same, but got DOut's dim = %d and X's = %d.", + dout_dims.size(), x_dims.size())); + + auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1); + + auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[1], y_dims[1], + platform::errors::InvalidArgument( + "The last dimension of DOut should be equal with Y's last" + "dimension. But received DOut[-1] = [%d], Y[1] = [%d].", + dout_mat_dims[1], y_dims[1])); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[0], x_mat_dims[0], + platform::errors::InvalidArgument( + "The first dimension of DOut should be equal with X's first" + "dimension. But received DOut[0] = [%d], Y[0] = [%d].", + dout_mat_dims[0], x_mat_dims[0])); + + auto activation_grad = ctx->Attrs().Get("activation_grad"); + if ((activation_grad != "relu_grad") && (activation_grad != "gelu_grad") && + (activation_grad != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation_grad)); + } + + if (activation_grad != "none" && !ctx->HasInput("ReserveSpace")) { + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "The ReserveSpace should not be empty. 
" + "when activation_grad == {relu_grad, gelu_grad}.")); + } + + if (ctx->HasOutput("DX")) { + std::vector dx_dims; + dx_dims.reserve(static_cast(x_dims.size())); + for (int i = 0; i < x_dims.size(); ++i) { + dx_dims.push_back(x_dims[i]); + } + ctx->SetOutputDim("DX", phi::make_ddim(dx_dims)); + } + + std::vector dy_dims(y_dims.Get(), y_dims.Get() + y_dims.size()); + ctx->SetOutputDim("DY", phi::make_ddim(dy_dims)); + + if (ctx->HasOutput("DBias")) { + std::vector dbias_dims; + dbias_dims.push_back(y_dims[1]); + ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("DOut", + "The input grad tensor to Out of Out = (Act(X) * Y) + bias"); + AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias"); + AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias"); + AddInput("ReserveSpace", + R"DOC(A GPU space to fetch + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue_grad op. If not given (empty string), the + auxiliary mode would not be enable.)DOC") + .AsDispensable(); + + AddOutput("DX", "The output grad tensor to X of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + AddOutput("DY", + "The output grad tensor to Y of Out = (Act(X) * Y) + bias."); + AddOutput("DBias", + "The output grad tensor to bias of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + + AddAttr( + "activation_grad", + R"DOC((string, default none), The backward activation function. It could be + one of {none, relu_grad, gelu_grad}. When none is given, The backward Act would + be null operations)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogueGrad Operator +This operator is used to perform backward of Elementwise_add(Matmul(Activeation(X), Y), bias). +It is equal to Activation (None, ReLU or GeLU) + paddle.nn.Linear. + +Note: +X could be more than two dimension and would be flatten to 2D for computing. +X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp, + ops::FusedGemmEpilogueOpMaker) +REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp, + ops::FusedGemmEpilogueGradOpMaker) diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu new file mode 100644 index 00000000000000..e16c9e8f483ccc --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -0,0 +1,376 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedGemmEpilogueKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* bias = ctx.Input("Bias"); + + Tensor* out = ctx.Output("Out"); + Tensor* reserve_space = ctx.Output("ReserveSpace"); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + std::string activation = ctx.Attr("activation"); + bool enable_auxiliary = reserve_space == nullptr ? false : true; + + out->mutable_data(ctx.GetPlace()); + auto* out_data = out->data(); + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; + int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; + int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtMatmulDesc_t operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &operation_desc, compute_type, scale_type)); + cublasOperation_t transx = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transy = trans_y ? CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transx, + sizeof(transx))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transy, + sizeof(transy))); + + cublasLtEpilogue_t epiloque_func = + get_epilogue_type_(activation, enable_auxiliary); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func, + sizeof(epiloque_func))); + const T* bias_data = bias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_data, + sizeof(bias_data))); + + if (enable_auxiliary && activation != "none") { + size_t reserve_space_size = 0; + if (activation == "relu") { + // Count in bits. 
+ reserve_space_size = phi::product(out->dims()) / 8; + } else { + reserve_space_size = phi::product(out->dims()) * sizeof(T); + } + reserve_space->mutable_data(ctx.GetPlace(), out->type(), + reserve_space_size); + void* aux_data = reinterpret_cast(reserve_space->data()); + + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL; + if (trans_x) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, M, K, M)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + if (trans_y) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, K, N, K)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &out_desc, mat_type, N, M, N)); + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + memory::allocation::AllocationPtr workspace = + memory::Alloc(dev_ctx, workspace_size); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, operation_desc, alpha, y->data(), y_desc, x->data(), + x_desc, beta, out_data, out_desc, out_data, out_desc, algo, + workspace->ptr(), workspace_size, stream)); + } + + private: + static cublasLtEpilogue_t get_epilogue_type_(const std::string& activation, + bool enable_auxiliary) { + if (activation == "relu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS + : CUBLASLT_EPILOGUE_RELU_BIAS; + } else if (activation == "gelu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS + : CUBLASLT_EPILOGUE_GELU_BIAS; + } else if (activation == "none") { + return CUBLASLT_EPILOGUE_BIAS; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation=%s.", + activation)); + } + } +}; + +template +class FusedGemmEpilogueGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* dout = ctx.Input("DOut"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* reserve_space = ctx.Input("ReserveSpace"); + + Tensor* dx = ctx.Output("DX"); + Tensor* dy = ctx.Output("DY"); + Tensor* dbias = ctx.Output("DBias"); + + std::string activation_grad = ctx.Attr("activation_grad"); + + auto dout_mat_dims = + phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1); + auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1); + + int64_t M = x_mat_dims[0]; + int64_t K = y->dims()[0]; + int64_t N = y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + cublasOperation_t trans_dout = CUBLAS_OP_N; + cublasLtMatrixLayout_t dout_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dout_desc, mat_type, N, M, N)); + + if (dx) { + cublasLtMatmulDesc_t dx_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dx_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_y = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y, + sizeof(trans_y))); + cublasLtEpilogue_t epiloque_func_for_dx = + get_epilogue_type_(activation_grad); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dx, sizeof(epiloque_func_for_dx))); + + if (activation_grad != "none") { + auto* aux_data = reserve_space->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dx_desc, mat_type, K, M, K)); + + memory::allocation::AllocationPtr dx_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dx->mutable_data(ctx.GetPlace()); + auto* dx_data = dx->data(); + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dx_operation_desc, alpha, y->data(), y_desc, + dout->data(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc, + algo, dx_workspace->ptr(), workspace_size, stream)); + } + + if (dy) { + cublasLtMatmulDesc_t dy_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dy_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_x = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x, + sizeof(trans_x))); + cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr + ? CUBLASLT_EPILOGUE_DEFAULT + : CUBLASLT_EPILOGUE_BGRADA; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dy, sizeof(epiloque_func_for_dy))); + + if (dbias) { + dbias->mutable_data(ctx.GetPlace()); + auto* dbias_data = dbias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &dbias_data, sizeof(dbias_data))); + } + + cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dy_desc, mat_type, N, K, N)); + + memory::allocation::AllocationPtr dy_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dy->mutable_data(ctx.GetPlace()); + auto* dy_data = dy->data(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dy_operation_desc, alpha, dout->data(), dout_desc, + x->data(), x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, algo, + dy_workspace->ptr(), workspace_size, stream)); + } + } + + private: + static cublasLtEpilogue_t get_epilogue_type_( + const std::string& activation_grad) { + if (activation_grad == "relu_grad") { + return CUBLASLT_EPILOGUE_DRELU; + } else if (activation_grad == "gelu_grad") { + return CUBLASLT_EPILOGUE_DGELU; + } else if (activation_grad == "none") { + return CUBLASLT_EPILOGUE_DEFAULT; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation_grad attribute of fused_gemm_epilogue op should " + "be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation_grad=%s.", + activation_grad)); + } + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDA_VERSION >= 11060 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue_grad, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel); +#endif diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index fcd3384ac24444..e5ca15a39ef51f 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -130,11 +130,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherNdGradNoNeedBufferVarInferer, "X"); namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, - PT_INFER_META(phi::GatherNdInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, + PD_INFER_META(phi::GatherNdInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, - PT_INFER_META(phi::GatherNdGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, + PD_INFER_META(phi::GatherNdGradInferMeta)); REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker, ops::GatherNdGradOpMaker, diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 7f6c82032fe39d..c84e94f5c71277 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -61,8 +61,8 @@ selected ids. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, - PT_INFER_META(phi::GatherTreeMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, + PD_INFER_META(phi::GatherTreeMeta)); REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker, GatherTreeInferShapeFunctor); diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 6b559885c569d0..66eecc13d04d1a 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,12 +15,14 @@ limitations under the License. 
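Pulling together the ReserveSpace rules spread across the two new fused_gemm_epilogue files above, as a hedged helper sketch whose function name is invented for illustration: InferShape requires the output width N to be a multiple of 128 for relu and 8 for gelu when the auxiliary output is requested, and the CUDA kernel then reserves numel / 8 bytes for relu (a bit-mask) versus numel * sizeof(T) bytes for gelu.

#include <cstddef>
#include <stdexcept>
#include <string>

// Sketch of how the new kernels size the auxiliary buffer and what InferShape
// demands of N (the output width) before the auxiliary path is allowed.
std::size_t reserve_space_bytes(std::size_t numel, std::size_t n,
                                std::size_t elem_size,
                                const std::string& activation) {
  if (activation == "relu") {
    if (n % 128 != 0) throw std::invalid_argument("relu aux requires N % 128 == 0");
    return numel / 8;          // one bit per output element (a bit-mask)
  }
  if (activation == "gelu") {
    if (n % 8 != 0) throw std::invalid_argument("gelu aux requires N % 8 == 0");
    return numel * elem_size;  // one element-sized auxiliary value per output
  }
  return 0;                    // "none": ReserveSpace is not used
}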
*/ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -54,38 +56,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GaussianRandom"); - - auto shape = ctx->Attrs().Get>("shape"); - std::vector temp; - temp.reserve(shape.size()); - for (auto dim : shape) { - temp.push_back(static_cast(dim)); - } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_dims)); - - return; - } - if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "Attribute(shape) of GaussianRandomOp must be set " - "and shape.size() > 0, but reveived shape.size() is %d", - shape.size())); - } - - ctx->SetOutputDim("Out", phi::make_ddim(temp)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -171,11 +141,20 @@ Used to initialize tensors with gaussian random generator. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, - ops::GaussianRandomOpMaker); + +DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor, + PD_INFER_META(phi::GaussianRandomInferMeta)); + +REGISTER_OPERATOR( + gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + GaussianRandomInferShapeFunctor); + REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); + REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 717ec774414bf8..00ce10bfe3bccb 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -45,7 +45,8 @@ struct GaussianGenerator { thrust::minstd_rand rng; rng.seed(seed_); using MT = typename details::MPTypeTrait::Type; - thrust::normal_distribution dist(mean_, std_); + thrust::normal_distribution dist(static_cast(mean_), + static_cast(std_)); unsigned int new_n = n + offset_; rng.discard(new_n); MT out = dist(rng); diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index 00ff7ad2166dcf..f3ac53138328db 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
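The one-line gaussian_random_op.cu change above casts mean_ and std_ to the math type MT before constructing thrust::normal_distribution, which matters when T is float16 but MT is float. A minimal host-side sketch of the same thrust API (illustrative values only, not part of the patch):

#include <thrust/random.h>
#include <cstdio>

int main() {
  thrust::minstd_rand rng(2022);  // seeded engine, as in GaussianGenerator
  rng.discard(7);                 // mirrors rng.discard(n + offset_) per thread
  // The distribution's parameter type must match MT; hence the explicit
  // static_cast<MT>(mean_) / static_cast<MT>(std_) when T is float16.
  float mean = 0.0f, stddev = 1.0f;
  thrust::normal_distribution<float> dist(mean, stddev);
  std::printf("sample = %f\n", dist(rng));
  return 0;
}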
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index 6af8388d9eba4e..f7c006dbcb1a9a 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/graph_send_recv_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,59 +24,6 @@ class GraphSendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv"); - - auto src_index_dims = ctx->GetInputDim("Src_index"); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), 1, - platform::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = ctx->GetInputDim("Dst_index"); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), 1, - platform::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ( - src_index_dims[0], dst_index_dims[0], - platform::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pool_type") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", - "GraphSendRecv"); - ctx->SetOutputDim("Dst_count", {dims[0]}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,20 +114,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor, + PD_INFER_META(phi::GraphSendRecvInferMeta)); REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, ops::GraphSendRecvOpMaker, ops::GraphSendRecvGradOpMaker, - ops::GraphSendRecvGradOpMaker); + ops::GraphSendRecvGradOpMaker, + 
GraphSendRecvInferShapeFunctor); REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); -REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu deleted file mode 100644 index f43d31814ac384..00000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMaxCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMinCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); - } -}; - -template -__global__ void GraphSendRecvCUDAKernel(const T* params, - const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - functor(params, output, in_i, out_i); - } -} - -// For max -template -__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { - *(output + i) = 0; - } - } -} - -// For min -template -__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::max()) { - *(output + i) = 0; - } - } -} - -// Get dst_count -template -__global__ void 
ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, - size_t index_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); - } -} - -// For forward mean -template -__global__ void ManipulateMeanCUDAKernel(T* output, int* count, - size_t input_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - int64_t c_index = i / slice_size; - if (*(count + c_index) > 1) { - *(output + i) = *(output + i) / *(count + c_index); - } - } -} - -// For backward mean -template -__global__ void ManipulateMeanGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const int* dst_count) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / dst_count[src_i]); - } -} - -// For backward min and max -template -__global__ void ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const T* ptr_input, - const T* ptr_output) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); - } -} - -template -void GraphSendRecvOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input("X"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + 
block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GraphSendRecvMaxCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_max = - grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; - InputResetMaxCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GraphSendRecvMinCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_min = - grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; - InputResetMinCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); -#else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); -#endif - - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_mean = - grid_mean_tmp < max_grid_dimx ? 
grid_mean_tmp : max_grid_dimx; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); - } -} - -template -void GraphSendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); - } -} - -template -class GraphSendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto index_type = 
framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(graph_send_recv, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h deleted file mode 100644 index 8d8111e0ee845b..00000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - eigen_dst += eigen_src; - } -}; - -template -struct GraphSendRecvMinFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMin(eigen_src); - } - } -}; - -template -struct GraphSendRecvMaxFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMax(eigen_src); - } - } -}; - -template -void elementwise_inner_operation(const Tensor& src, Tensor* dst, - const IndexT& src_index, - const IndexT& dst_index, - const bool& first_flag, Functor functor) { - auto src_slice = src.Slice(src_index, src_index + 1); - auto dst_slice = dst->Slice(dst_index, dst_index + 1); - - functor(first_flag, src_slice, &dst_slice); -} - -template -void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { - Functor functor; - if (pool_type == "SUM") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = d_index[i]; - *(dst_count + dst_idx) += 1; - } - for (int i = 0; i < input_size; ++i) { - if (*(dst_count + i) == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(dst_count + i)); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } - } - } -} - -template -void graph_send_recv_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* s_index, - const IndexT* d_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* dst_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { - if 
(pool_type == "SUM") { - Functor functor; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - auto src_slice = src.Slice(src_idx, src_idx + 1); - auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - for (int i = 0; i < index_size; ++i) { - const IndexT& forward_src_idx = d_index[i]; - const IndexT& forward_dst_idx = s_index[i]; - auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); - auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); - auto eigen_input = framework::EigenVector::Flatten(input_slice); - auto eigen_output = framework::EigenVector::Flatten(output_slice); - - auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); - auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += eigen_src * (eigen_output == eigen_input); - } - } -} - -template -void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { - auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); - } -} - -template -void GraphSendRecvGradOpKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if 
(index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); - } -} - -template -class GraphSendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpKernelLaunchHelper(ctx, - *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpKernelLaunchHelper(ctx, - *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpKernelLaunchHelper( - ctx, *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc index f8f8f3fd789ad6..524f2d6c9d7194 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cc +++ b/paddle/fluid/operators/gumbel_softmax_op.cc @@ -90,11 +90,11 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, + PD_INFER_META(phi::GumbelSoftmaxInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, GumbelSoftmaxGradInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxGradInferMeta)); + PD_INFER_META(phi::GumbelSoftmaxGradInferMeta)); REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp, ops::GumbelSoftmaxOpMaker, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 3915ce5809c394..3c9bbc753f29b1 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -112,8 +112,8 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace 
paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, - PT_INFER_META(phi::HuberLossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PD_INFER_META(phi::HuberLossInferMeta)); REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 567a69f383d1cc..16968876ac96ca 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::RealAndImagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 105d818e197434..e2efaa1759b008 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -87,8 +87,8 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, - PT_INFER_META(phi::IncrementInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, + PD_INFER_META(phi::IncrementInferMeta)); REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, ops::IncrementGradOpMaker, diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index 09f4e63943ad37..8324a6215bca81 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 68d002fceea70f..d17c6368c7537b 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -100,8 +100,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, - PT_INFER_META(phi::IndexSampleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PD_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, ops::IndexSampleGradMaker, diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h index 1e061d8b50ae02..31c22915ec5d05 100644 --- a/paddle/fluid/operators/inverse_op.h +++ b/paddle/fluid/operators/inverse_op.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -30,7 +30,7 @@ class InverseKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, output); } }; diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 2750367dc77392..c835bb3cf60bfb 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/is_empty_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,12 +26,6 @@ class IsEmptyOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "IsEmpty"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "IsEmpty"); - ctx->SetOutputDim("Out", {1}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto *x = ctx.Input("X"); @@ -56,12 +52,10 @@ It will just return product(tensor.ddims()) > 0; } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(is_empty, IsEmptyInferShapeFunctor, + PD_INFER_META(phi::IsEmptyInferMeta)); REGISTER_OPERATOR( is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); + paddle::framework::EmptyGradOpMaker, + IsEmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 735fffa7203b12..cfa370ff9cb19d 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
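The is_empty change above, and the isfinite_v2/linspace/log_loss changes that follow, all apply the same recipe: the hand-written InferShape override is dropped, a functor is generated from a phi InferMeta via DECLARE_INFER_SHAPE_FUNCTOR and PD_INFER_META, and that functor is passed as the trailing argument of REGISTER_OPERATOR. A minimal sketch of that shape, using placeholder names (my_op / MyOp / MyOpMaker / phi::MyInferMeta are illustrative, not operators in this diff):

#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"

namespace ops = paddle::operators;

// Generates MyOpInferShapeFunctor, which adapts the framework's
// InferShapeContext to phi MetaTensors and forwards to phi::MyInferMeta.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::MyInferMeta));

// The functor replaces the removed InferShape override and rides along as the
// last registration argument; the grad-maker arguments follow the usual form.
REGISTER_OPERATOR(
    my_op, ops::MyOp, ops::MyOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    MyOpInferShapeFunctor);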
-#include "paddle/fluid/operators/isfinite_v2_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -49,11 +51,6 @@ class OverflowV2Op : public framework::OperatorWithKernel { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2"); - UnaryOpUnchangedInferShape(ctx); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -104,6 +101,14 @@ element of X as a tensor. } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isinf_v2, IsinfInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isnan_v2, IsnanInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isfinite_v2, IsfiniteInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); #define REGISTER_V2OP_MAKER(op_type, comment) \ namespace paddle { \ @@ -124,50 +129,17 @@ REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)"); REGISTER_OPERATOR( isinf_v2, ops::OverflowV2Op, ops::_isinf_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsinfInferShapeFunctor); REGISTER_OPERATOR( isnan_v2, ops::OverflowV2Op, ops::_isnan_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsnanInferShapeFunctor); REGISTER_OPERATOR( isfinite_v2, ops::OverflowV2Op, ops::_isfinite_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); + paddle::framework::EmptyGradOpMaker, + IsfiniteInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu deleted file mode 100644 index 1b9f19d36dfa0f..00000000000000 --- a/paddle/fluid/operators/isfinite_v2_op.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/isfinite_v2_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu index 4f30c58d375008..f6f56f70f1a119 100644 --- a/paddle/fluid/operators/kthvalue_op.cu +++ b/paddle/fluid/operators/kthvalue_op.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/kthvalue_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" #ifdef __NVCC__ #include "cub/cub.cuh" #endif diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc index fef6fc5319ebdb..5e053445379118 100644 --- a/paddle/fluid/operators/lerp_op.cc +++ b/paddle/fluid/operators/lerp_op.cc @@ -85,8 +85,8 @@ DECLARE_INPLACE_OP_INFERER(LerpInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor, - PT_INFER_META(phi::LerpInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor, + PD_INFER_META(phi::LerpInferMeta)); REGISTER_OPERATOR( lerp, paddle::operators::LerpOp, paddle::operators::LerpOpMaker, paddle::operators::LerpOpGradMaker, diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index fe271fa5e893a7..378c7573d6129a 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/linspace_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,33 +27,6 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "linspace"); - - auto s_dims = ctx->GetInputDim("Start"); - PADDLE_ENFORCE_EQ((s_dims.size() == 1) && (s_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = ctx->GetInputDim("Stop"); - PADDLE_ENFORCE_EQ((e_dims.size() == 1) && (e_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = ctx->GetInputDim("Num"); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), true, - platform::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - ctx->SetOutputDim("Out", {-1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -88,11 +65,13 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); -REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel); +DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, + PD_INFER_META(phi::LinspaceInferMeta)); +REGISTER_OPERATOR( + linspace, ops::LinspaceOp, ops::LinspaceOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LinspaceInferShapeFunctor); REGISTER_OP_VERSION(linspace) .AddCheckpoint( diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu deleted file mode 100644 index aa625a7f5b9df0..00000000000000 --- a/paddle/fluid/operators/linspace_op.cu +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void LinspaceKernel(T start, T stop, double step, int64_t size, - T* out) { - int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - for (; index < size; index += blockDim.x * gridDim.x) { - if (index < size / 2) { - out[index] = static_cast(start + step * index); - } else { - out[index] = static_cast(stop - step * (size - index - 1)); - } - } -} - -template -__global__ void LinspaceSpecialKernel(T start, T* out) { - out[0] = static_cast(start); -} - -template -class CUDALinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - auto* num_t = context.Input("Num"); - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - framework::Tensor n_start; - framework::Tensor n_stop; - framework::Tensor n_num; - framework::TensorCopy(start_t, platform::CPUPlace(), &n_start); - T start = n_start.data()[0]; - framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop); - T stop = n_stop.data()[0]; - framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num); - int64_t num = static_cast(n_num.data()[0]); - - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - T* out_data = out->mutable_data(context.GetPlace()); - - double step = 0; - auto stream = context.cuda_device_context().stream(); - int block = 512; - int grid = (num + block - 1) / block; - if (num != 1) { - step = (static_cast(stop - start)) / (num - 1); - LinspaceKernel<<>>(start, stop, step, num, - out_data); - } else { - LinspaceSpecialKernel<<>>(start, out_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h deleted file mode 100644 index ae51f1221cc09b..00000000000000 --- a/paddle/fluid/operators/linspace_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
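The CUDA linspace kernel deleted above and the CPU kernel deleted just below share the same numeric trick: with step = (stop - start) / (num - 1), the front half of the output counts up from start and the back half counts down from stop, so both endpoints are reproduced exactly despite floating-point rounding. A standalone sketch of that fill, independent of the framework types:

#include <cstdint>
#include <vector>

// Sketch of the symmetric fill used by the removed linspace kernels:
// indices below num/2 march forward from `start`, the rest march backward
// from `stop`, so out.front() == start and out.back() == stop exactly.
template <typename T>
std::vector<T> Linspace(T start, T stop, int64_t num) {
  std::vector<T> out(num);
  if (num == 1) {
    out[0] = start;
    return out;
  }
  const double step =
      (static_cast<double>(stop) - static_cast<double>(start)) / (num - 1);
  for (int64_t i = 0; i < num; ++i) {
    out[i] = (i < num / 2) ? static_cast<T>(start + step * i)
                           : static_cast<T>(stop - step * (num - i - 1));
  }
  return out;
}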
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CPULinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - int32_t num = context.Input("Num")->data()[0]; - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - T start = start_t.data()[0]; - T stop = stop_t.data()[0]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - - T* out_data = out->mutable_data(context.GetPlace()); - - if (num > 1) { - // step should be of double type for all types - double step = (static_cast(stop - start)) / (num - 1); - int half_num = num / 2; - for (int i = 0; i < num; ++i) { - if (i < half_num) { - out_data[i] = static_cast(start + step * i); - } else { - out_data[i] = static_cast(stop - step * (num - i - 1)); - } - } - } else { - out_data[0] = static_cast(start); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 2e596ff3e62573..883e3597d8a311 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,43 +24,6 @@ namespace operators { class LogLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predicted"), "Input", "Predicted", "LogLoss"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLoss"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime() || - (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { - PADDLE_ENFORCE_EQ( - pred_dims, label_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be equal to the" - "dimensions of Input(Labels), but received dimensions of " - "Input(Predicted)" - "is [%s], received dimensions of Input(Labels) is [%s].", - pred_dims, label_dims)); - } - PADDLE_ENFORCE_EQ(pred_dims.size(), 2, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be 2," - "But received dimensions of Input(Predicted)" - "is [%d]", - pred_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - pred_dims[1], 1, - platform::errors::InvalidArgument( - "Each row of Input(Predicted) contains a real value, " - "so the 2nd dimension of Input(X) must be 1," - "But got [%d]", - pred_dims[1])); - } - ctx->SetOutputDim("Loss", {pred_dims[0], 1}); - ctx->ShareLoD("Predicted", "Loss"); - } }; template @@ -145,7 +111,10 @@ class LogLossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(log_loss, LogLossInferShapeFunctor, + PD_INFER_META(phi::LogLossInferMeta)); REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, - ops::LogLossGradMaker); + ops::LogLossGradMaker, + LogLossInferShapeFunctor); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 14b12ca3acb19a..31a98d9f630e1c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -5,6 +5,8 @@ endif() # please add new math_library in alphabetical order if (WITH_ASCEND_CL) math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) +elseif (WITH_MLU) +math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) else() math_library(concat_and_split DEPS concat_and_split_functor) endif() @@ -44,8 +46,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_inverse) -math_library(segment_pooling) math_library(matrix_solve) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 46126ac59c8927..c9308d27c0a349 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -18,6 +18,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" @@ -226,6 +229,90 @@ class SplitFunctor { }; #endif +#ifdef PADDLE_WITH_MLU +template +class ConcatFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const std::vector& input, int axis, + framework::Tensor* output) { + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto ins_size = input.size(); + + const int axis_t = axis; + const int ins_size_t = ins_size; + auto place = context.GetPlace(); + output->mutable_data(place); + + // mlu should do sth + // init ins tensors + std::vector inputs; + std::vector input_descs; + std::vector desc_vector; + for (size_t i = 0; i < ins_size; i++) { + input_descs.emplace_back(MLUCnnlTensorDesc( + input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype()))); + desc_vector.push_back(input_descs.back().get()); + inputs.push_back(input[i].data()); + } + // init out tensors + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + // MLU should do sth + MLUCnnl::Concat(context, ins_size_t, axis_t, desc_vector.data(), + inputs.data(), output_desc.get(), GetBasePtr(output)); + } +}; + +template +class SplitFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { + if (input.numel() == 0) { + return; + } + + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto in_dims = input.dims(); + auto out_size = outputs->size(); + + std::vector outs_dims(out_size, in_dims); + for (size_t i = 0; i < out_size; ++i) { + outs_dims[i][axis] = ref_inputs[i]->dims()[axis]; + } + + // init out tensors + std::vector vct_tensor; + std::vector output_descs; + std::vector desc_vector; + for (size_t i = 0; i < out_size; i++) { + (*outputs)[i]->Resize(outs_dims[i]); + (*outputs)[i]->mutable_data(context.GetPlace()); + output_descs.emplace_back( + MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, + ToCnnlDataType((*outputs)[i]->dtype()))); + desc_vector.push_back(output_descs.back().get()); + vct_tensor.push_back(GetBasePtr((*outputs)[i])); + } + // init in tensors + MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input.dtype())); + + // MLU should do sth + MLUCnnl::Split(context, out_size, axis, input_desc.get(), input.data(), + desc_vector.data(), vct_tensor.data()); + } +}; +#endif + #define DEFINE_FUNCTOR(type) \ template class ConcatFunctor; \ template class SplitFunctor; @@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float) FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR) #endif +#ifdef PADDLE_WITH_MLU +#define DEFINE_MLU_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; +DEFINE_MLU_FUNCTOR(float) +DEFINE_MLU_FUNCTOR(platform::float16) +DEFINE_MLU_FUNCTOR(int64_t) +DEFINE_MLU_FUNCTOR(bool) +DEFINE_MLU_FUNCTOR(int) +DEFINE_MLU_FUNCTOR(int8_t) +DEFINE_MLU_FUNCTOR(int16_t) +DEFINE_MLU_FUNCTOR(uint8_t) +#endif } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc deleted file mode 100644 index 1b36e615c68df8..00000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cc +++ 
/dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { - compute_inverse_eigen(context, a, a_inv); - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc deleted file mode 100644 index 41335a69417a94..00000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor; - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { -#ifndef PADDLE_WITH_HIP - const auto& mat_dims = a.dims(); - const int rank = mat_dims.size(); - int n = mat_dims[rank - 1]; - int batch_size = rank > 2 ? a.numel() / (n * n) : 1; - - memory::allocation::AllocationPtr tmp_gpu_mat_data; - const T* gpu_mat = a.data(); - if (n >= 32) { - // Copy all elements of input matrix A to a temporary memory space to - // avoid being overriden by getrf. - tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), - context.GetPlace(), a.data(), a.numel() * sizeof(T), - context.stream()); - gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); - } - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = gpu_mat + i * n * n; - cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; - } - - // Copy the addresses of A and A_inv from host to device. 
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - std::vector info; // only for singular checking - info.resize(batch_size); - // This functions in cuBLAS is intended to be used for matrices of small - // sizes where the launch overhead is a significant factor. - // TODO(Xreki): call function in cusolver for large matrices. - if (n < 32) { - // cublasmatinvBatched is a short cut of cublasgetrfBatched - // plus cublasgetriBatched. - // However it only works if N is less than 32. If not, we need to - // go through cublasgetrfBatched and cublasgetriBatched. - blas.BatchedMatInv(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_inv_ptrs, gpu_info_ptr, batch_size); - } else { - // This function performs the LU factorization of each matrix A by the - // equation P * A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_info_ptr, batch_size); - - blas.BatchedGETRI(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); - } - memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), - gpu_info_ptr, sizeof(int) * batch_size, context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, info[i], info[i])); - } -#else - compute_inverse_eigen(context, a, a_inv); -#endif - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 45556e97d1d7af..28ec3a871022f4 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -14,106 +14,107 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace operators { namespace math { // All tensors are in NCHW or NHWC format, and the groups must be greater than 1 -template -class MaxOutFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - int fea_size = input_height * input_width; - // c_size means the output size of each sample - int c_size = fea_size * output_channels; - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; - for (int c = 0; c < output_channels; ++c) { - int new_cindex = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - T ele = static_cast(-FLT_MAX); - int input_idx, output_idx; - for (int ph = 0; ph < groups; ++ph) { - if (axis == 1) { - input_idx = - (new_bindex + new_cindex) * groups + ph * fea_size + f; - } else { - input_idx = (new_bindex + f * output_channels + c) * groups + ph; - } - T x = input_data[input_idx]; - ele = ele > x ? ele : x; - } +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + int input_idx, output_idx; + for (int ph = 0; ph < groups; ++ph) { if (axis == 1) { - output_idx = new_bindex + new_cindex + f; + input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f; } else { - output_idx = new_bindex + f * output_channels + c; + input_idx = (new_bindex + f * output_channels + c) * groups + ph; } - output_data[output_idx] = ele; + T x = input_data[input_idx]; + ele = ele > x ? ele : x; } + if (axis == 1) { + output_idx = new_bindex + new_cindex + f; + } else { + output_idx = new_bindex + f * output_channels + c; + } + output_data[output_idx] = ele; } } } -}; +} -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0, output_idx; - bool continue_match = true; - if (axis == 1) { - input_idx0 = (blen + clen) * groups + f; - output_idx = blen + clen + f; - } else { - input_idx0 = (blen + f * output_channels + c) * groups; - output_idx = blen + f * output_channels + c; - } - for (int g = 0; g < groups && continue_match; ++g) { - int idx_offset = (axis == 1 ? fea_size * g : g); - int input_idx = input_idx0 + idx_offset; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0, output_idx; + bool continue_match = true; + if (axis == 1) { + input_idx0 = (blen + clen) * groups + f; + output_idx = blen + clen + f; + } else { + input_idx0 = (blen + f * output_channels + c) * groups; + output_idx = blen + f * output_channels + c; + } + for (int g = 0; g < groups && continue_match; ++g) { + int idx_offset = (axis == 1 ? fea_size * g : g); + int input_idx = input_idx0 + idx_offset; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; } } } } } -}; +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1856fb4eb48c73..1d0478db5ef4a8 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -95,61 +96,57 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = output->numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxOut<<>>( - nthreads, input_data, input_channels, input_height, input_width, groups, - axis, output_data); - } -}; +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + axis, output_data); +} + /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxoutGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_grad_data, - input_channels, input_height, input_width, groups, axis); - } -}; +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups, axis); +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; @@ -157,6 +154,12 @@ template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 0d8372df8a2fec..1f4964f7715426 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -30,7 +30,7 @@ class MaxOutFunctor { const int axis = 1); }; -template +template class MaxOutGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 788dbb2204109d..01fa01e3c6ed04 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -524,8 +524,8 @@ REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker, ops::MatMulV2GradOpMaker, ops::MatMulV2GradOpMaker); -DELCARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, - PT_INFER_META(phi::GeneralBinaryGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad, ops::MatMulV2OpDoubleGradMaker, ops::MatMulV2OpDoubleGradMaker, diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index c65af3129f3646..cdf204628b638f 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_power_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" namespace paddle { namespace operators { @@ -119,13 +122,3 @@ REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerGradOpMaker); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); - -REGISTER_OP_CPU_KERNEL( - matrix_power, - ops::MatrixPowerKernel, - ops::MatrixPowerKernel); - -REGISTER_OP_CPU_KERNEL( - matrix_power_grad, - ops::MatrixPowerGradKernel, - ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h deleted file mode 100644 index d2c67d80b4f5a5..00000000000000 --- a/paddle/fluid/operators/matrix_power_op.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
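The maxouting.cc and maxouting.cu hunks above replace the per-device class specializations of MaxOutFunctor / MaxOutGradFunctor with a single templated member definition plus explicit instantiations, so one body now serves both the legacy platform device contexts and the phi contexts instantiated at the bottom of each file. A minimal, self-contained illustration of that C++ pattern, with made-up names (ScaleFunctor, CpuCtx, GpuCtx stand in for the real functor and context types):

// One templated definition, many explicit instantiations.
struct CpuCtx {};
struct GpuCtx {};

template <typename DeviceContext, typename T>
class ScaleFunctor {
 public:
  void operator()(const DeviceContext& ctx, const T* in, T* out, int n, T k);
};

// Single shared body; no per-context specialization needed.
template <typename DeviceContext, typename T>
void ScaleFunctor<DeviceContext, T>::operator()(const DeviceContext& /*ctx*/,
                                                const T* in, T* out, int n,
                                                T k) {
  for (int i = 0; i < n; ++i) out[i] = in[i] * k;
}

// Explicit instantiations, analogous to instantiating the maxout functors for
// both the platform device contexts and the phi contexts.
template class ScaleFunctor<CpuCtx, float>;
template class ScaleFunctor<CpuCtx, double>;
template class ScaleFunctor<GpuCtx, float>;
template class ScaleFunctor<GpuCtx, double>;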
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct IdentityMatrixFunctor { - IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; - const int col = index % m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_; - T* output_; -}; - -template -void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, - const paddle::framework::ExecutionContext& ctx) { - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - T* out_data = Out->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, X->numel()); - - if (n == 0) { - // Out = Identity Matrix - IdentityMatrixFunctor functor(x_dims[x_ndim - 1], out_data); - for_range(functor); - return; - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - if (new_n == 1) { - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out); - return; - } - - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (new_n == 2) { - // Out = newX * newX - Out->mutable_data(ctx.GetPlace()); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 3) { - // Out = (newX * newX) * newX - // Note: C[i] matrices in MatMul must not overlap, i.e. the individual - // gemm operations must be computable independently; otherwise, - // undefined behavior is expected. 
- Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 4) { - // Out = (newX * newX) * (newX * newX) - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } - - // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN) - int bit = 0; - Tensor z = Tensor(X->dtype()); - bool out_inited = false; - Tensor temp_out = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor temp_z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - while (new_n > 0) { - bit = new_n & 0x1; - new_n >>= 1; - if (z.IsInitialized()) { - blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_z, static_cast(0)); - framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z); - } else { - z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z); - } - if (bit == 1) { - if (out_inited == true) { - blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_out, static_cast(0)); - framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out); - } else { - framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out); - out_inited = true; - } - } - } - return; -} - -template -class MatrixPowerKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - Tensor* Out = ctx.Output("Out"); - int n = ctx.Attr("n"); - - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], x_dims[x_ndim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) should be equal." 
- "X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_ndim - 2], x_dims[x_ndim - 1])); - - MatrixPowerFunction(X, n, Out, ctx); - } -}; - -template -void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, - const Tensor* dOut, const int n, Tensor* dX, - const paddle::framework::ExecutionContext& ctx) { - dX->mutable_data(ctx.GetPlace()); - const auto& x_dims = X->dims(); - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (n == 0) { - // \nabla X = O - phi::funcs::SetConstant zero; - zero(dev_ctx, dX, static_cast(0)); - return; - } else if (n == 1) { - // \nabla X = \nabla Out - framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX); - return; - } - - auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (n == -1) { - // \nabla X = Out^{T} * \nabla Out * Out^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast(1), dX, - static_cast(0)); - return; - } - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - // Use chain rule blow to compute \nabla newX^{n} - // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1}, - // Note that newX^{0} can be omitted - std::vector> tensor_list(new_n - 1); - tensor_list[0] = std::make_shared(new_x); - int index = 1; - while (index < new_n - 1) { - tensor_list[index] = std::make_shared( - ctx.AllocateTmpTensor(X->dims(), dev_ctx)); - blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc, - static_cast(1), tensor_list[index].get(), static_cast(0)); - index++; - } - - // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i} - // * \nabla Out - // * (newX^{T}^{n - i - 1}) - Tensor dx_new = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc, - static_cast(1), &dx_new, static_cast(0)); - Tensor da_an_minus1 = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc, - static_cast(1), &da_an_minus1, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), da_an_minus1.data(), - dx_new.data()); - int start = 0; - while (start < new_n - 2) { - Tensor a_da = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor a_da_a = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc, - static_cast(1), &a_da, static_cast(0)); - blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start], - trans_desc, static_cast(1), &a_da_a, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), a_da_a.data(), - dx_new.data()); - start++; - } - - if (n > 0) { - // \nabla X = \nabla newX - framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX); - } else { - // \nabla X = newX^{T} * \nabla newX * newX^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast(1), - dX, static_cast(0)); - } - return; -} - -template -class 
MatrixPowerGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - const Tensor* Out = ctx.Input("Out"); - const Tensor* dOut = ctx.Input(framework::GradVarName("Out")); - const int n = ctx.Attr("n"); - Tensor* dX = ctx.Output(framework::GradVarName("X")); - - MatrixPowerGradFunction(X, Out, dOut, n, dX, ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index bd9ebd29777def..e55369e0691ee5 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/maxout_op.h" #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { -using framework::Tensor; - class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -130,10 +130,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CPU_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h deleted file mode 100644 index 922998293943ed..00000000000000 --- a/paddle/fluid/operators/maxout_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
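The MatrixPowerFunction removed above handles |n| <= 4 with explicit MatMul chains and switches to exponentiation by squaring for larger exponents (a negative n is first reduced to a positive power of X^{-1} via the inverse functor), which is where the advertised O(logN) cost comes from. A scalar sketch of the same bit loop; the deleted code runs it on square matrices with blas.MatMul:

// Exponentiation by squaring, mirroring the `bit = new_n & 0x1` loop in the
// removed MatrixPowerFunction. Shown on scalars for brevity; n must be > 0
// here (the operator maps n < 0 to a positive power of the inverse).
double PowBySquaring(double x, int n) {
  double out = 1.0;  // the tensor version tracks an "out_inited" flag instead
                     // of materializing an identity matrix
  double z = x;      // z holds x^(2^k) at the start of iteration k
  while (n > 0) {
    if (n & 0x1) out *= z;  // multiply in the current power when the bit is set
    z *= z;
    n >>= 1;
  }
  return out;
}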
*/ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxOutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - math::MaxOutFunctor maxout_forward; - maxout_forward(context.template device_context(), *in_x, out, - groups, axis); - } -}; - -template -class MaxOutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3692ace8bb5a46..056620db5b9669 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -123,13 +123,10 @@ with the input Out(Inference). } // namespace operators } // namespace paddle +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -// FIXME(typhoonzero): types of T is for infernece data. -// label data is always int. -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel, - ops::AccuracyKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu deleted file mode 100644 index 6f19100fa9d37e..00000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void AccuracyCudaKernel(const int N, const int D, - const int64_t* Xdata, - const int64_t* labeldata, int* correct_data, - float* accuracy, int* total_data) { - int count = 0; - __shared__ int total[BlockSize]; - - // support only 1 block - for (int i = threadIdx.x; i < (N); i += BlockSize) { - for (int j = 0; j < D; ++j) { - if (Xdata[i * D + j] == labeldata[i]) { - ++count; - break; - } - } - } - total[threadIdx.x] = count; - __syncthreads(); - -// reduce the count with init value 0, and output accuracy. -#ifdef PADDLE_WITH_CUDA - int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); -#else - // HIP thrust::reduce not support __device__ - for (int s = BlockSize / 2; s > 0; s >>= 1) { - if (threadIdx.x < s) { - total[threadIdx.x] += total[threadIdx.x + s]; - } - __syncthreads(); - } - int result = total[0]; -#endif - if (threadIdx.x == 0) { - *correct_data = result; - *accuracy = static_cast(result) / static_cast(N); - *total_data = N; - } -} - -template -class AccuracyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - // FIXME(typhoonzero): only support indices currently - // if add support for output values, how to detect the data type? - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - int num_samples = static_cast(inference->dims()[0]); - size_t infer_width = inference->dims()[1]; - auto stream = ctx.cuda_device_context().stream(); - platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); - - if (num_samples == 0) { - return; - } - - AccuracyCudaKernel< - PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data, total_data); - } -}; - -} // namespace operators -} // namespace paddle - -// FIXME(typhoonzero): types of T is for inference data. 
-// label data is always int64 -REGISTER_OP_CUDA_KERNEL( - accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h deleted file mode 100644 index 94e5bf8257e67b..00000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AccuracyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - size_t num_samples = inference->dims()[0]; - size_t class_dim = inference->dims()[1]; - *accuracy_data = 0.0f; - - if (num_samples == 0) { - return; - } - - int num_correct = 0; - // assume inference is already the topk of the output - for (size_t i = 0; i < num_samples; ++i) { - PADDLE_ENFORCE_GE( - label_data[i], 0, - platform::errors::InvalidArgument( - "label of AccuracyOp must >= 0, But received label[%d] is %d", i, - label_data[i])); - for (size_t j = 0; j < class_dim; ++j) { - if (indices_data[i * class_dim + j] == label_data[i]) { - ++num_correct; - break; - } - } - } - - *correct_data = num_correct; - *total_data = num_samples; - *accuracy_data = - static_cast(num_correct) / static_cast(num_samples); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 2598d3b0277c94..1ce02ff4525c96 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
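Both deleted accuracy kernels (CUDA above, the CPU header below it) implement the same metric: a sample counts as correct if its label appears anywhere in its row of top-k indices, and accuracy = correct / total. A standalone sketch of that reduction; the function name and signature are illustrative, not the Paddle kernel API:

#include <cstdint>
#include <vector>

// indices: num_samples x class_dim row-major top-k indices.
// labels:  num_samples ground-truth labels.
// Returns accuracy in [0, 1] and reports the correct/total counts.
float TopKAccuracy(const std::vector<int64_t>& indices,
                   const std::vector<int64_t>& labels, int64_t num_samples,
                   int64_t class_dim, int* correct, int* total) {
  int num_correct = 0;
  for (int64_t i = 0; i < num_samples; ++i) {
    for (int64_t j = 0; j < class_dim; ++j) {
      if (indices[i * class_dim + j] == labels[i]) {
        ++num_correct;
        break;
      }
    }
  }
  *correct = num_correct;
  *total = static_cast<int>(num_samples);
  return num_samples == 0 ? 0.f
                          : static_cast<float>(num_correct) /
                                static_cast<float>(num_samples);
}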
*/ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index e83278f88b82a3..9f2ca4165f33a2 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index de71312d78df99..3cc1be4de8a82f 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -14,12 +14,14 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = paddle::framework::Tensor; template class AccuracyXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 54ecba08a82dce..f3ed98c3f4d1e4 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,70 +24,6 @@ class AucOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc"); - auto predict_dims = ctx->GetInputDim("Predict"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_GE( - predict_dims.size(), 2, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape size must be " - "greater_equal 2.", - predict_dims)); - auto predict_width = predict_dims[1]; - PADDLE_ENFORCE_NE( - phi::product(predict_dims), 0, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape can not involes 0.", - predict_dims)); - PADDLE_ENFORCE_NE( - phi::product(label_dims), 0, - platform::errors::InvalidArgument( - "The Input(Label) has not been initialized properly. 
The " - "shape of Input(Label) = [%s], the shape can not involes 0.", - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_LE(predict_width, 2, - platform::errors::InvalidArgument( - "Only support binary classification," - "prediction dims[1] should be 1 or 2")); - } - auto predict_height = ctx->GetInputDim("Predict")[0]; - auto label_height = ctx->GetInputDim("Label")[0]; - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(predict_height, label_height, - platform::errors::InvalidArgument( - "Out and Label should have same height.")); - } - - int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; - int slide_steps = ctx->Attrs().Get("slide_steps"); - - PADDLE_ENFORCE_GE( - num_pred_buckets, 1, - platform::errors::InvalidArgument("num_thresholds must larger than 1")); - PADDLE_ENFORCE_GE(slide_steps, 0, - platform::errors::InvalidArgument( - "slide_steps must be natural number")); - - ctx->SetOutputDim("AUC", {1}); - - if (slide_steps) { - ctx->SetOutputDim("StatPosOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - ctx->SetOutputDim("StatNegOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - } else { - ctx->SetOutputDim("StatPosOut", {1, num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {1, num_pred_buckets}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -145,4 +84,7 @@ There are two types of possible curves: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); +DECLARE_INFER_SHAPE_FUNCTOR(auc, AucInferShapeFunctor, + PD_INFER_META(phi::AucInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker, + AucInferShapeFunctor); diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index 780c6e7f153e7b..a3b764b0e1c46a 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -13,19 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; template -class ShapeMKLDNNKernel : public ShapeKernel { +class ShapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ShapeKernel::Compute(ctx); + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } auto* out = ctx.Output("Out"); out->set_layout(framework::DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 05cd264cf3ec9e..23428dd403e9b1 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -29,7 +29,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index c776cf2a7c792c..e9dadd5ec937cd 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -27,7 +27,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 3791fed23a84ff..916f02179b364b 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -27,7 +27,7 @@ USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 884521301750ce..6e3bd5e43c9c1d 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace fw = paddle::framework; namespace plat = paddle::platform; -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MLU); // relu diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 9de03582cbbf53..1fdaa153e3c27e 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } +/* static */ void MLUCnnl::Concat(const MLUDeviceContext& dev_ctx, + const int pack_num, const int axis, + const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc, + inputs, workspace_ptr, workspace_size, + output_desc, output)); +} + /* static */ void MLUCnnl::Div( const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t in0_desc, const void* in0, @@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_descs, output_ptrs)); } +/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx, int split_num, + int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc, + input_ptr, workspace_ptr, workspace_size, + output_descs, output_ptrs)); +} + /* static */ void MLUCnnl::GatherFunctor( const ExecutionContext& ctx, const int axis, const int batch_dims, const cnnlTensorDescriptor_t params_desc, const void* params, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 2a54a8392c7c5b..b55b10686e92e2 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -403,6 +403,11 @@ class MLUCnnl { const void* const inputs[], const cnnlTensorDescriptor_t output_desc, void* output); + static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num, + const int axis, const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, void* output); + static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); @@ -566,6 +571,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]); + static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]); + static void Scale(const ExecutionContext& 
ctx, const int axis, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t alpha_desc, const void* alpha, diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu index afb949d3374c62..2bacda8afb0eb3 100644 --- a/paddle/fluid/operators/mode_op.cu +++ b/paddle/fluid/operators/mode_op.cu @@ -24,7 +24,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index fe4609b3ad91e7..b309e1b87ef903 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -87,135 +87,6 @@ inline framework::DDim ComputeAndCheckShape( return out_dim; } -template -inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, - const framework::Tensor& matrix_a, - const framework::Tensor& matrix_b, - const framework::DDim& a_dim, - const framework::DDim& b_dim) { - auto place = ctx.GetPlace(); - auto blas = phi::funcs::GetBlas(ctx); - - framework::Tensor matrix_c; - framework::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]}); - matrix_c.Resize(c_dim); - matrix_c.mutable_data(place); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false); - const T alpha = static_cast(1.0); - blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); - return matrix_c; -} - -/** - * @brief Recursively calculate matrix multiplication according to the optimal - * order - * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] - * - * @param - * ins: the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * save_result: set true by backward - * results: save the intermediate result during backward - */ -template -inline framework::Tensor MatChainMul( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, const uint64_t j, - const bool save_result, std::vector* results) { - if (i == j) { - return *ins[i]; - } - - const auto A = MatChainMul(ctx, ins, ins_dims, order, i, - order[i * ins.size() + j], - save_result, results); - framework::DDim a_dim = A.dims(); - if (i == order[i * ins.size() + j]) { - a_dim = ins_dims[i]; - } - - const auto B = MatChainMul(ctx, ins, ins_dims, order, - order[i * ins.size() + j] + 1, j, - save_result, results); - framework::DDim b_dim = B.dims(); - if (j == order[i * ins.size() + j] + 1) { - b_dim = ins_dims[j]; - } - - auto result = MatMul(ctx, A, B, a_dim, b_dim); - if (save_result) { - (*results)[i * ins.size() + j] = result; - } - return result; -} - -/** - * @brief get the optimal order - */ -std::vector GetOrder(const std::vector& ins, - const std::vector& ins_dims) { - auto n = ins.size(); - // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) - std::vector p(n + 1); - for (uint64_t i = 0; i < n; i++) { - p[i] = ins_dims[i][0]; - } - p[n] = ins_dims[n - 1][1]; - - // m[i, j]: save the lowest cost for multiplying ins[i...j] - std::vector m(n * n, 0); - // define ins[i...j] means multiplying matrices from ins[i] to ins[j] - // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then - // 
multiply the resulting matrices is the optimal order for ins[i...j] - std::vector order(n * n); - for (uint64_t l = 1; l < n; l++) { - for (uint64_t i = 0; i < n - l; i++) { - auto j = i + l; - m[i * n + j] = 0xffffffff; - for (uint64_t k = i; k < j; k++) { - uint64_t q = - m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; - if (q < m[i * n + j]) { - m[i * n + j] = q; - order[i * n + j] = k; - } - } - } - } - return order; -} - -template -static inline framework::Tensor MultiDotMatChainOrder( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, const bool save_result, - std::vector* results) { - auto order = GetOrder(ins, ins_dims); - return MatChainMul(ctx, ins, ins_dims, order, 0, - ins.size() - 1, save_result, results); -} - -inline void GetDims(const std::vector& ins, - std::vector* ins_dims) { - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - (*ins_dims)[i] = ins[i]->dims(); - if (i == 0 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]}); - } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1}); - } - } -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -252,78 +123,6 @@ class MultiDotOp : public framework::OperatorWithKernel { } }; -/** - * 1. there are only 2 matrices: direct matrix multiplication A*B - * 2. there are only 3 matrices: calculate the cost of (A*B)*C and A*(B*C), - * choose the least cost order for calculation - * 3. more than 3 matrices: call MultiDotMatChainOrder - */ -template -class MultiDotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto blas = phi::funcs::GetBlas(ctx); - - auto n = ins.size(); - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - const T scale = static_cast(1.0); - if (n == 2) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ma, Nb}); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); - } else { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ka, Nc}); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(*ins[0], mat_dim_a, tmp_out, 
mat_dim_tmp, scale, out, T(0)); - } - } else { - std::vector results; - const auto tmp = MultiDotMatChainOrder( - ctx, ins, ins_dims, false, &results); - auto out_dim = out->dims(); - *out = tmp; - out->Resize(out_dim); - } - } -}; - class MultiDotOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -341,180 +140,6 @@ class MultiDotOpGrad : public framework::OperatorWithKernel { } }; -template -class MultiDotGradKernel : public framework::OpKernel { - public: - /** - * @brief calculate dA and dB - * dA = dout * transpose(B) - * dB = transpose(A) * dout - */ - void CalcGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, const framework::Tensor& A, - const framework::Tensor& B, const framework::DDim& dout_dim, - const framework::DDim& a_dim, const framework::DDim& b_dim, - framework::Tensor* dA, framework::Tensor* dB) const { - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true); - T alpha = static_cast(1.0); - auto blas = phi::funcs::GetBlas(ctx); - blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); - blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); - } - - /** - * @brief calculate multi matrix multiplication grad by a chain order - * @param - * dout: the grad of multi matrix multiplication out - * dx: the out grad of inputs - * ins: the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * results: the intermediate result of farward - */ - void MatChainMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, - std::vector* dx, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, - const uint64_t j, - const std::vector& results) const { - if (i == j) { - *((*dx)[i]) = dout; - return; - } - - const auto n = ins.size(); - const auto right = order[i * n + j]; - const auto left = order[i * n + j] + 1; - // get the multi result of left sub chain - const auto* A = &results[i * n + right]; - framework::DDim a_dim = A->dims(); - if (i == right) { - A = ins[i]; - a_dim = ins_dims[i]; - } - // get the multi result of right sub chain - const auto* B = &results[left * n + j]; - framework::DDim b_dim = B->dims(); - if (left == j) { - B = ins[j]; - b_dim = ins_dims[j]; - } - framework::Tensor dA, dB; - dA.Resize({dout_dim[0], b_dim[0]}); - dB.Resize({a_dim[1], dout_dim[1]}); - dA.mutable_data(ctx.GetPlace()); - dB.mutable_data(ctx.GetPlace()); - - CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB); - MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, - results); - MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, - results); - } - - void MultiDotGradMatChainOrder( - const framework::ExecutionContext& ctx, const framework::Tensor& dout, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - std::vector* dx) const { - auto order = GetOrder(ins, ins_dims); - auto n = ins.size(); - std::vector results(n * n); - MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true, - &results); - MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, - results); - } - - void Compute(const framework::ExecutionContext& ctx) const { - auto 
ins = ctx.MultiInput("X"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - - auto blas = phi::funcs::GetBlas(ctx); - auto place = ctx.GetPlace(); - - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - dx[i]->mutable_data(place); - } - - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - framework::DDim dout_dim = dout.dims(); - if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) { - dout_dim = phi::make_ddim({1, 1}); - } else if (ins[0]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({1, dout_dim[0]}); - } - } else if (ins[n - 1]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({dout_dim[0], 1}); - } - } - - T alpha = static_cast(1); - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - if (n == 2) { - CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1], - dx[0], dx[1]); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ma, Nb}); - tmp_out.mutable_data(place); - tmp_dout.Resize({mat_dim_dout.height_, Nb}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), - ins_dims[2], &tmp_dout, dx[2]); - CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], - ins_dims[1], dx[0], dx[1]); - } else { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ka, Nc}); - tmp_out.mutable_data(place); - tmp_dout.Resize({Ka, mat_dim_dout.width_}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], - tmp_dout.dims(), dx[0], &tmp_dout); - CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], - ins_dims[2], dx[1], dx[2]); - } - } else { - MultiDotGradMatChainOrder(ctx, dout, ins, dout_dim, ins_dims, &dx); - if (ins[n - 1]->dims().size() == 1) { - dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); - } - } - } -}; - template class MultiDotOpGradMaker : public framework::SingleGradOpMaker { public: @@ -552,25 +177,3 @@ REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CPU_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CUDA_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); -#endif diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 1143f9cb37aa54..0113f638b9a47d 100644 --- 
a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -53,8 +53,8 @@ class MultinomialOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, - PT_INFER_META(phi::MultinomialInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, + PD_INFER_META(phi::MultinomialInferMeta)); REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index ab9f10070fc60d..bf7222fc45c660 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -16,8 +16,11 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -42,33 +45,6 @@ class MVOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv"); - OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv"); - - auto dim_x = context->GetInputDim("X"); - auto dim_vec = context->GetInputDim("Vec"); - PADDLE_ENFORCE_EQ( - dim_x.size(), 2, - platform::errors::InvalidArgument( - "The rank of input X should be 2, but is %d", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_vec.size(), 1, - platform::errors::InvalidArgument( - "The rank of input Vec should be 1, but is %d", dim_vec.size())); - PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0], - platform::errors::InvalidArgument( - "X's second dimension is expected to be equal to " - "Vec's first dimension" - "but recieved X'shape = [%s], Vec's shape = [%s]", - dim_x, dim_vec)); - - framework::DDim dim_out = phi::make_ddim({dim_x[0]}); - - context->SetOutputDim("Out", dim_out); - context->ShareLoD("X", /*->*/ "Out"); - } }; template @@ -118,7 +94,11 @@ class MVOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(mv, MvInferShapeFunctor, + PD_INFER_META(phi::MvInferMeta)); + REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, - ops::MVOpGradMaker); + ops::MVOpGradMaker, + MvInferShapeFunctor); REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index f510c7bebec876..6c35ad29e9749d 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
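GetOrder, removed from multi_dot_op.cc above, is the classic matrix-chain-multiplication dynamic program: with ins[i] of shape (p[i], p[i+1]), m[i][j] is the cheapest scalar-multiplication count for the chain ins[i..j] and order[i][j] records the split k at which ins[i..k] and ins[k+1..j] are multiplied last. A self-contained sketch over the dimension vector p (plain C++, no Paddle types):

#include <cstdint>
#include <vector>

// p has n + 1 entries; matrix i has shape (p[i], p[i+1]).
// Returns the n x n split table; order[i * n + j] is the optimal split k.
std::vector<uint64_t> MatChainOrder(const std::vector<uint64_t>& p) {
  const uint64_t n = p.size() - 1;
  std::vector<uint64_t> m(n * n, 0);      // minimal cost for chain [i, j]
  std::vector<uint64_t> order(n * n, 0);  // argmin split for chain [i, j]
  for (uint64_t l = 1; l < n; ++l) {      // l = chain length - 1
    for (uint64_t i = 0; i + l < n; ++i) {
      const uint64_t j = i + l;
      m[i * n + j] = UINT64_MAX;
      for (uint64_t k = i; k < j; ++k) {
        const uint64_t q =
            m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1];
        if (q < m[i * n + j]) {
          m[i * n + j] = q;
          order[i * n + j] = k;
        }
      }
    }
  }
  return order;
}

For example, p = {10, 20, 5, 30} (three matrices) gives order[0 * 3 + 2] = 1, i.e. (ins[0] * ins[1]) * ins[2] at cost 2500 versus 9000 for the other parenthesization, which is exactly the cost1/cost2 comparison the removed three-matrix fast path performs.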
*/ -#include "paddle/fluid/operators/nll_loss_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -264,10 +264,3 @@ REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker, ops::NLLLossGradMaker, ops::NLLLossGradMaker); REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp); -REGISTER_OP_CPU_KERNEL( - nll_loss, ops::NLLLossOpKernel, - ops::NLLLossOpKernel); -REGISTER_OP_CPU_KERNEL( - nll_loss_grad, - ops::NLLLossGradOpKernel, - ops::NLLLossGradOpKernel); diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h deleted file mode 100644 index be6f4422d4ac6a..00000000000000 --- a/paddle/fluid/operators/nll_loss_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int64_t i = 0; i < batch_size; ++i) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "Label value is out of range. " - "Expected label value in range of [0, %d), but " - "received value is %d.", - n_classes, cur_label)); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight; - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int64_t i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= x_data[i * n_classes + cur_label] * cur_weight; - } - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[index] = -x_data[i * sample_size + cur_label * map_size + - h * in_dim3 + w] * - cur_weight; - } - } - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= - x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] * - cur_weight; - } - } - } - - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -class NLLLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* out = ctx.Output("Out"); - auto* total_weight = ctx.Output("Total_weight"); - auto reduction = ctx.Attr("reduction"); - auto ignore_index = ctx.Attr("ignore_index"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto weight_data = weight ? 
weight->data() : nullptr; - auto out_data = out->mutable_data(ctx.GetPlace()); - auto total_weight_data = total_weight->mutable_data(ctx.GetPlace()); - *total_weight_data = 0; - - auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_1D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_2D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, in_dim2, in_dim3, - reduction, ignore_index); - } - } -}; - -template -static void nll_loss_grad_1D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight; - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[i * n_classes + cur_label] /= total_weight_val; - } - } -} - -template -static void nll_loss_grad_2D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] = - -cur_weight * dout_data[index]; - } - } - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - const auto dx_index = - i * sample_size + cur_label * map_size + h * in_dim3 + w; - dx_data[dx_index] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[dx_index] /= total_weight_val; - } - } - } - } -} - -template -class NLLLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* total_weight = ctx.Input("Total_weight"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto ignore_index = ctx.Attr("ignore_index"); - auto reduction = ctx.Attr("reduction"); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto label_data = labels->data(); - auto weight_data = weight ? weight->data() : nullptr; - auto total_weight_data = total_weight->data(); - memset(dx_data, 0, dx->numel() * sizeof(T)); - - const auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_grad_1D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_grad_2D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, in_dim2, - in_dim3, reduction, ignore_index); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index ad7f93d73e902b..315831ddc0f290 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
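The removed nll_loss_1D / nll_loss_grad_1D pair above picks, per sample, the (optionally class-weighted) negative log-probability at its label, skips ignore_index, and for reduction == "mean" divides by the accumulated weight; the backward pass scatters -dout * weight back onto the picked positions. A condensed standalone sketch of the "sum"/"mean" forward path, with illustrative names and none of the Paddle error checks:

#include <cstdint>
#include <string>
#include <vector>

// x: batch_size x n_classes log-probabilities (row-major).
// Returns the reduced loss and writes the accumulated class weight.
double NllLoss1D(const std::vector<double>& x,
                 const std::vector<int64_t>& labels,
                 const std::vector<double>& class_weight,  // may be empty
                 int64_t n_classes, const std::string& reduction,
                 int64_t ignore_index, double* total_weight) {
  double loss = 0.0, weight_sum = 0.0;
  const int64_t batch_size = static_cast<int64_t>(labels.size());
  for (int64_t i = 0; i < batch_size; ++i) {
    const int64_t label = labels[i];
    if (label == ignore_index) continue;
    const double w = class_weight.empty() ? 1.0 : class_weight[label];
    weight_sum += w;
    loss -= x[i * n_classes + label] * w;
  }
  if (reduction == "mean" && weight_sum != 0.0) loss /= weight_sum;
  *total_weight = weight_sum;
  return loss;
}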
*/ -#include "paddle/fluid/operators/optimizers/adadelta_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -23,77 +26,6 @@ class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, - platform::errors::InvalidArgument( - "Input(Param) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, - platform::errors::InvalidArgument( - "Input(Grad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredGrad"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredGrad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredUpdate"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("ParamOut"), true, - platform::errors::InvalidArgument( - "Output(ParamOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredGradOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredUpdateOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.")); - - auto param_dim = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and grad input of AdadeltaOp should have same dimension.")); - PADDLE_ENFORCE_NE( - phi::product(ctx->GetInputDim("AvgSquaredGrad")), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable AvgSquaredGrad has not " - "been initialized. 
You may need to confirm if you put " - "exe.run(startup_program) after optimizer.minimize " - "function.")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), - platform::errors::InvalidArgument( - "Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), - platform::errors::InvalidArgument( - "Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("AvgSquaredGradOut", param_dim); - ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -149,7 +81,11 @@ param\_out = param + param\_update } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); -REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(adadelta, AdadeltaInferMetaFunctor, + PD_INFER_META(phi::AdadeltaInferMeta)); +REGISTER_OPERATOR( + adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdadeltaInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h deleted file mode 100644 index 85cfad35858bbe..00000000000000 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdadeltaOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto avg_squared_grad_out_tensor = - ctx.Output("AvgSquaredGradOut"); - auto avg_squared_update_out_tensor = - ctx.Output("AvgSquaredUpdateOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); - - T rho = static_cast(ctx.Attr("rho")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - // Squared gradient accumulator - auto avg_squared_grad = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredGrad")); - // Squared updates accumulator - auto avg_squared_update = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredUpdate")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto avg_squared_grad_out = - framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); - auto avg_squared_update_out = - framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto& place = *ctx.template device_context().eigen_device(); - - avg_squared_grad_out.device(place) = - rho * avg_squared_grad + (1 - rho) * grad.square(); - auto update = - -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon)) - .sqrt() * - grad; - avg_squared_update_out.device(place) = - rho * avg_squared_update + (1 - rho) * update.square(); - param_out.device(place) = param + update; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index a95a37c980c8c9..036839dd1300fe 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
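The element-wise update implemented by the deleted AdadeltaOpKernel above (and now provided by the phi kernel this PR registers) is: E[g^2] <- rho * E[g^2] + (1 - rho) * g^2; update = -sqrt((E[dx^2] + eps) / (E[g^2] + eps)) * g; E[dx^2] <- rho * E[dx^2] + (1 - rho) * update^2; param <- param + update. A per-element scalar sketch, standalone and illustrative only:

#include <cmath>

// One Adadelta step for a single parameter element, mirroring the
// Eigen expressions in the removed AdadeltaOpKernel.
void AdadeltaStep(double grad, double rho, double epsilon, double* param,
                  double* avg_squared_grad, double* avg_squared_update) {
  *avg_squared_grad = rho * *avg_squared_grad + (1 - rho) * grad * grad;
  const double update = -std::sqrt((*avg_squared_update + epsilon) /
                                   (*avg_squared_grad + epsilon)) *
                        grad;
  *avg_squared_update =
      rho * *avg_squared_update + (1 - rho) * update * update;
  *param += update;
}

The epsilon in both numerator and denominator keeps the ratio finite before either accumulator has built up, which is why the op adds it on both sides.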
*/ -#include "paddle/fluid/operators/optimizers/adamax_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -22,67 +25,6 @@ class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("InfNorm"), "Input", "InfNorm", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate", - "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Beta1Pow"), "Input", "Beta1Pow", "Adamax"); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut", - "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("InfNormOut"), "Output", "InfNormOut", - "Adamax"); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(phi::product(lr_dims), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), 1, - platform::errors::InvalidArgument( - "Learning rate should have 1 dimension")); - auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(phi::product(beta1_pow_dims), 1, - platform::errors::InvalidArgument( - "Beta1 power accumulator should have 1 dimension")); - auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and Grad input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment"), - platform::errors::InvalidArgument( - "Param and Moment input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("InfNorm"), - platform::errors::InvalidArgument( - "Param and InfNorm input of AdamaxOp should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("MomentOut", param_dims); - ctx->SetOutputDim("InfNormOut", param_dims); - } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -150,7 +92,11 @@ division by 0 error. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL( - adamax, ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +DECLARE_INFER_SHAPE_FUNCTOR(adamax, AdamaxInferMetaFunctor, + PD_INFER_META(phi::AdamaxInferMeta)); + +REGISTER_OPERATOR( + adamax, ops::AdamaxOp, ops::AdamaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamaxInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h deleted file mode 100644 index df0112448b1cbc..00000000000000 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdamaxOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment_out_tensor = ctx.Output("MomentOut"); - auto inf_norm_out_tensor = ctx.Output("InfNormOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - moment_out_tensor->mutable_data(ctx.GetPlace()); - inf_norm_out_tensor->mutable_data(ctx.GetPlace()); - - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto inf_norm = framework::EigenVector::Flatten( - *ctx.Input("InfNorm")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - auto beta1_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta1Pow")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto inf_norm_out = - framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto* place = ctx.template device_context().eigen_device(); - - moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(*place) = - grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); - 
auto lr_t = lr / (1 - beta1_pow); - Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(*place) = - param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 2a127d9ad1db0c..21ca26f49f653d 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -124,8 +124,8 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, - PT_INFER_META(phi::PixelShuffleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, + PD_INFER_META(phi::PixelShuffleInferMeta)); REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index 0cecbf0b9cb027..d5896c4105932e 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -87,8 +87,8 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, ops::PoissonOpInferVarType, diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index e0c24935b47509..d061f9ae056134 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -81,8 +81,12 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); } else { for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); + if ((!ctx->IsRuntime()) && (in_x_dims[i + 2] < 0)) { + output_shape.push_back(in_x_dims[i + 2]); + } else { + output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } } } ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc index 6b0d6f332bcae8..54e31845ad4bd5 100644 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
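// Illustrative, framework-free sketch of the per-element update the deleted
// AdamaxOpKernel computed with Eigen; beta1, beta2 and epsilon are the op
// attributes and beta1_pow is the Beta1Pow accumulator. The function name and
// std::vector storage are assumptions for this sketch, not Paddle API.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void adamax_step(std::vector<float>& param, const std::vector<float>& grad,
                 std::vector<float>& moment, std::vector<float>& inf_norm,
                 float lr, float beta1, float beta2, float epsilon,
                 float beta1_pow) {
  const float lr_t = lr / (1.0f - beta1_pow);  // bias-corrected learning rate
  for (size_t i = 0; i < param.size(); ++i) {
    moment[i] = beta1 * moment[i] + (1.0f - beta1) * grad[i];
    // infinity-norm accumulator; adding epsilon keeps the divisor away from 0
    inf_norm[i] = std::max(std::fabs(grad[i]), beta2 * inf_norm[i] + epsilon);
    param[i] -= lr_t * moment[i] / inf_norm[i];
  }
}

int main() {
  std::vector<float> p{1.0f}, g{0.3f}, m{0.0f}, u{0.0f};
  adamax_step(p, g, m, u, /*lr=*/0.01f, /*beta1=*/0.9f, /*beta2=*/0.999f,
              /*epsilon=*/1.0e-8f, /*beta1_pow=*/0.9f);
  std::printf("%f\n", p[0]);
  return 0;
}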
*/ -#include "paddle/fluid/operators/put_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -123,16 +124,3 @@ REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, paddle::operators::PutAlongAxisInplaceInferer); REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu deleted file mode 100644 index 5508023efad2c6..00000000000000 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/put_along_axis_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PutAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisCUDAKernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - const platform::DeviceContext &device_ctx = ctx.device_context(); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - 
gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel( - *result_grad, axis, *index, *value_grad, - ctx.device_context()); // the gradient of scatter is gather - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h deleted file mode 100644 index 38487f5ce28c9e..00000000000000 --- a/paddle/fluid/operators/put_along_axis_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PutAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisOpKernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - const platform::DeviceContext &device_ctx = ctx.device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce " - "op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpKernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_input_grad_kernel( - // Here passing an unused argument *result_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. 
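// Illustrative, framework-free sketch of the put_along_axis semantics the
// deleted CPU/CUDA kernels dispatch on (reduce_op is "add", "mul"/"multiply",
// or "assign"; the real kernels also branch on int32/int64 index dtype, and the
// backward pass uses gather because "the gradient of scatter is gather").
// Shown for a 1-D tensor along axis 0; names and storage are assumptions for
// this sketch, not Paddle API.
#include <cstdio>
#include <string>
#include <vector>

void put_along_axis_1d(std::vector<float>& out, const std::vector<int>& index,
                       const std::vector<float>& value,
                       const std::string& reduce_op) {
  for (size_t i = 0; i < index.size(); ++i) {
    if (reduce_op == "add") {
      out[index[i]] += value[i];
    } else if (reduce_op == "mul" || reduce_op == "multiply") {
      out[index[i]] *= value[i];
    } else {  // "assign" is the default reduce op
      out[index[i]] = value[i];
    }
  }
}

// Value gradient as a gather of the result gradient (simplified; the deleted
// grad kernels also adjust the input gradient via *_scatter_input_grad_kernel).
std::vector<float> value_grad_1d(const std::vector<float>& result_grad,
                                 const std::vector<int>& index) {
  std::vector<float> vg(index.size());
  for (size_t i = 0; i < index.size(); ++i) vg[i] = result_grad[index[i]];
  return vg;
}

int main() {
  std::vector<float> x{1.0f, 1.0f, 1.0f, 1.0f};
  put_along_axis_1d(x, {0, 2}, {5.0f, 7.0f}, "add");
  std::printf("%g %g %g %g\n", x[0], x[1], x[2], x[3]);  // 6 1 8 1
  auto vg = value_grad_1d({1.0f, 2.0f, 3.0f, 4.0f}, {0, 2});
  std::printf("%g %g\n", vg[0], vg[1]);  // 1 3
  return 0;
}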
- *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - cpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 24741efe426b18..c7e91ba35dee13 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 21c23a7f602a35..4b6759ea165edf 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -70,9 +70,25 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::MluEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); + } +#endif cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); + mlu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -256,6 +272,56 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + TensorVec &mlu = mlu_buffer_[i]; + if (mlu.empty()) { + mlu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + mlu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on MLU and CPU devices are not matched. 
" + "The number on MLU is %d, on CPU is %d", + mlu.size(), cpu.size())); + } + + std::vector mlu_ptrs; + mlu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + mlu[i].Resize(cpu[i].dims()); + mlu[i].set_layout(cpu[i].layout()); + mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetMLUDeviceId(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS( + cnPlaceNotifier(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto mlu_ptr = mlu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_mlu_place(cpu_place))) { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + } else { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + platform::MLUStreamSync(stream_.get()); + } + mlu[i].set_lod(cpu[i].lod()); + } + platform::MLUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -291,6 +357,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(cuda_buffer_[i]); } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); + } else if (platform::is_mlu_place(place_)) { + *out = std::move(mlu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 3d42486c6df881..f0f3b6b7f9fdfe 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" +#endif + namespace paddle { namespace operators { namespace reader { @@ -70,6 +75,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; + std::vector mlu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -82,6 +88,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_MLU + mluStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 28a8484f539fc9..18e444702fbb2c 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - PT_INFER_META(phi::RealAndImagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index cb438b4a805726..41df8e4a15f093 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ 
b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_max); -REGISTER_OP_CPU_KERNEL( - reduce_max, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMaxOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_max"; } + virtual std::string GetOpType() const { return "Reduce reduce_max"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ReduceMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_max, ops::ReduceOp, ReduceMaxOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMaxInferShapeFunctor); +REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_max_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 6157a3a925de51..4a18330913803f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -96,8 +96,8 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_mean"; } }; -DELCARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, - PT_INFER_META(phi::MeanRawInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index eb76eee1048890..160617695338a9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -36,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::funcs::TensorReduceImpl( + phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims, stream); + origin_reduce_dims); } } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 8ef0712dc7a757..6441d53239e955 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -102,8 +102,8 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_sum"; } }; -DELCARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, - PT_INFER_META(phi::ReduceInferMetaBase)); +DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, + PD_INFER_META(phi::SumRawInferMeta)); REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index b636184ae457ed..a473b54c1f8559 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -16,9 +16,9 @@ limitations under the License. 
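// Illustrative, framework-free sketch of what the reduce_max operator and its
// still-registered reduce_max_grad kernel compute, reduced here over a whole
// 1-D tensor (the real kernels generalize over reduce dims and keep_dim and use
// MaxOrMinGradFunctor). Names and storage are assumptions for this sketch, not
// Paddle API.
#include <cstdio>
#include <vector>

float reduce_max(const std::vector<float>& x) {
  float m = x[0];
  for (float v : x) m = v > m ? v : m;
  return m;
}

// d(max)/dx_i is 1 where x_i equals the max and 0 elsewhere, so the upstream
// gradient is routed only to the max position(s).
std::vector<float> reduce_max_grad(const std::vector<float>& x, float out,
                                   float out_grad) {
  std::vector<float> dx(x.size(), 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] == out) dx[i] = out_grad;
  }
  return dx;
}

int main() {
  std::vector<float> x{1.0f, 4.0f, 3.0f};
  const float y = reduce_max(x);
  const auto dx = reduce_max_grad(x, y, 1.0f);
  std::printf("max=%g dx=[%g %g %g]\n", y, dx[0], dx[1], dx[2]);  // max=4 dx=[0 1 0]
  return 0;
}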
*/ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/unique_op.h" @@ -36,6 +36,14 @@ using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; using TensorList = std::vector; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ inline bool is_##MODE_NAME(const framework::ExecutionContext& ctx) { \ const std::string& mode = ctx.Attr("mode"); \ diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index e4410b21b54132..cbf2b9152079e1 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -121,8 +121,8 @@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, ops::ScaleGradMaker, diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index b7be4cfb2a3950..0ae0e1500c1662 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -119,12 +119,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ScatterNdAddGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, - PT_INFER_META(phi::ScatterNdAddInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, + PD_INFER_META(phi::ScatterNdAddInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, ScatterNdAddGradInferShapeFunctor, - PT_INFER_META(phi::ScatterNdAddGradInferMeta)); + PD_INFER_META(phi::ScatterNdAddGradInferMeta)); REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker, ops::ScatterNdAddGradMaker, diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index fec003305fdc65..5f6b04cf59e0e3 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -103,11 +103,11 @@ DECLARE_INPLACE_OP_INFERER(ScatterInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, - PT_INFER_META(phi::ScatterInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, + PD_INFER_META(phi::ScatterInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, - PT_INFER_META(phi::ScatterGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, + PD_INFER_META(phi::ScatterGradInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 322cd97f01c3ad..9d4c8532a82c06 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/segment_pool_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -23,22 +26,6 @@ class SegmentPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool"); - OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds", - "SegmentPool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool"); - auto dims = ctx->GetInputDim("X"); - dims[0] = -1; - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pooltype") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds", - "SegmentPool"); - ctx->SetOutputDim("SummedIds", {-1, 1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -150,17 +137,11 @@ class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(segment_pool, SegmentPoolInferShapeFunctor, + PD_INFER_META(phi::SegmentPoolInferMeta)); + REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker, ops::SegmentPoolGradOpMaker, - ops::SegmentPoolGradOpMaker); + ops::SegmentPoolGradOpMaker, + SegmentPoolInferShapeFunctor); REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp); - -REGISTER_OP_CPU_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); - -REGISTER_OP_CPU_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu deleted file mode 100644 index e147e62a983540..00000000000000 --- a/paddle/fluid/operators/segment_pool_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); -REGISTER_OP_CUDA_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h deleted file mode 100644 index 2f5ef7f54f9888..00000000000000 --- a/paddle/fluid/operators/segment_pool_op.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { - auto* input = context.Input("X"); - auto* segment = context.Input("SegmentIds"); - auto* output = context.Output("Out"); - std::string pooltype = context.Attr("pooltype"); - Tensor* summed_ids = nullptr; - - int64_t num_indices = segment->numel(); - PADDLE_ENFORCE_EQ( - num_indices, input->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be the same size as dimension 0 of input X.")); - PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be 1-D tensor, or it's other " - "dimension size is 1. 
Segment_ids's shape is: [%s].", - segment->dims())); - - if (input->numel() == 0 || segment->numel() == 0) { - return; - } - - bool cpu_place = context.GetPlace().GetType() == phi::AllocationType::CPU; - if (cpu_place) { - auto dims = input->dims(); - auto* segment_ids = segment->data(); - dims[0] = static_cast(segment_ids[segment->numel() - 1] + 1); - PADDLE_ENFORCE_GT( - dims[0], 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", dims[0])); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, output, static_cast(0)); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!cpu_place) { - Tensor length; - length.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); - IndexT* length_data = length.data(); - const IndexT* segment_ids = segment->data(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - hipMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - cudaMemcpyDeviceToHost)); -#endif - - IndexT length_host = length_data[0]; - length_host++; - PADDLE_ENFORCE_GT( - length_host, 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", length_data[0])); - auto dims = input->dims(); - dims[0] = static_cast(length_host); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - T init_value = 0; - if (pooltype == "MAX") { - init_value = static_cast(-FLT_MAX); - } else if (pooltype == "MIN") { - init_value = static_cast(FLT_MAX); - } - phi::funcs::SetConstant setconst; - auto& dev_ctx = context.template device_context(); - setconst(dev_ctx, output, static_cast(init_value)); - // the gpu kernel of mean pool record the counts of segment_ids - if (pooltype == "MEAN") { - summed_ids = context.Output("SummedIds"); - summed_ids->Resize({dims[0], 1}); - summed_ids->mutable_data(context.GetPlace()); - setconst(dev_ctx, summed_ids, static_cast(1e-12)); - } - } -#endif - - SegmentPoolFunctor pool; - - pool(context.template device_context(), *input, *segment, - output, summed_ids, pooltype); -} - -template -class SegmentPoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* segment = context.Input("SegmentIds"); - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentKernelLaunchHelper(context); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentKernelLaunchHelper(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -template -class SegmentPoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Input("Out"); - auto* segment = context.Input("SegmentIds"); - auto* out_g = context.Input(framework::GradVarName("Out")); - auto* in_g = context.Output(framework::GradVarName("X")); - std::string pooltype = context.Attr("pooltype"); - - const Tensor* summed_ids = nullptr; - if (pooltype == "MEAN") { - summed_ids = context.Input("SummedIds"); - } - - in_g->mutable_data(context.GetPlace()); - 
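// Illustrative, framework-free sketch of the segment pooling computed by the
// deleted SegmentPool kernels: segment_ids are non-decreasing and the number of
// output rows is last_segment_id + 1, as in the deleted launch helper. Only SUM
// and MEAN are shown (the real op also supports MAX/MIN and records per-segment
// counts in SummedIds for MEAN). Names and storage are assumptions for this
// sketch, not Paddle API.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

std::vector<float> segment_pool_1d(const std::vector<float>& x,
                                   const std::vector<int64_t>& segment_ids,
                                   const std::string& pooltype) {
  const int64_t rows = segment_ids.back() + 1;
  std::vector<float> out(rows, 0.0f);
  std::vector<float> counts(rows, 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    out[segment_ids[i]] += x[i];       // SUM accumulation per segment
    counts[segment_ids[i]] += 1.0f;    // element count per segment
  }
  if (pooltype == "MEAN") {
    for (int64_t r = 0; r < rows; ++r) {
      if (counts[r] > 0.0f) out[r] /= counts[r];
    }
  }
  return out;
}

int main() {
  const auto out =
      segment_pool_1d({1.0f, 2.0f, 3.0f, 4.0f}, {0, 0, 1, 1}, "MEAN");
  std::printf("%g %g\n", out[0], out[1]);  // 1.5 3.5
  return 0;
}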
phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, in_g, static_cast(0)); - - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc index 0372a79b967a48..59c6e16535738b 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -120,8 +120,8 @@ class SeluGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 6c33ff52044b26..23c6a0133e1eda 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -184,9 +184,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); if (in_g || filter_g) { - int r = xpu::constant(xpu_context, col_data, col_numel, T(0)); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); - bool trans_a = false; bool trans_b = true; int m = out_g->dims()[0]; @@ -208,7 +205,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { const T* data_b = filter->data(); T* data_c = col_data; - r = xpu::fc_fusion( + int r = xpu::fc_fusion( xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b, nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, xpu::Activation_t::LINEAR); @@ -222,7 +219,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - xpu::constant(xpu_context, in_g->data(), in_g->numel(), T(0)); int r = xpu::sequence_context_projection_grad( xpu_context, in_g->data(), col_data, nullptr, lodx, sequence_width, @@ -232,8 +228,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { if (filter_g) { filter_g->mutable_data(context.GetPlace()); - xpu::constant(xpu_context, filter_g->data(), filter_g->numel(), - T(0)); int r = xpu::sequence_context_projection( xpu_context, in->data(), col_data, nullptr, lodx, sequence_width, diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index ec3e04e71faf0b..7d0d782b837c4c 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -241,13 +241,6 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - 
ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu index f9701b0acaac76..9f291a863c067a 100644 --- a/paddle/fluid/operators/set_value_op.cu +++ b/paddle/fluid/operators/set_value_op.cu @@ -16,13 +16,6 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OP_CUDA_KERNEL( set_value_grad, ops::SetValueGradKernel, diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 9dd727959202c6..4d459f8c01b159 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -121,201 +121,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } - -template -class SetValueKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const int rank = ctx.Input("Input")->dims().size(); - - // TODO(liym27): A more elegent code to do this. C++ has to make template - // integer as constant, but we had better have alternative writing in the - // future. - switch (rank) { - case 1: - SetValueCompute<1>(ctx); - break; - case 2: - SetValueCompute<2>(ctx); - break; - case 3: - SetValueCompute<3>(ctx); - break; - case 4: - SetValueCompute<4>(ctx); - break; - case 5: - SetValueCompute<5>(ctx); - break; - case 6: - SetValueCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", rank)); - } - } - - private: - template - void SetValueCompute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); 
- } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - auto place = ctx.GetPlace(); - auto& eigen_place = - *ctx.template device_context().eigen_device(); - - // Here copy data from input to avoid data loss at PE and Graph level. - // TODO(liym27): Speed up in the future version. - // - Q: Why don't call ShareDataWith to speed up? - // - A: Because it's not supported to ShareDataWith on OP's input and output - // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP - // - Q: Why don't delete Input, after all, the input and output are the same - // Tensor at program level? - // - A: If deleting Input, the graph will be complex, such as there will - // be two ops points to the output in graph: op1 -> output <- set_value. - // In this case, we have to find a way to handle the running order of - // set_value is what we want. - paddle::framework::TensorCopy(*in, place, out); - - Tensor slice_tensor(in->dtype()), pad_tensor(in->dtype()); - slice_tensor.mutable_data(slice_dims, place); - pad_tensor.mutable_data(in_dims, place); - - auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); - auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); - - // Step 1: Set the value of out at `_index` to zero - slice_e.device(eigen_place) = slice_e.constant(T(0)); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto strides_indices = Eigen::DSizes(); - - for (size_t i = 0; i < D; ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - if (starts[i] == ends[i]) { // slice is empty, data will not be changed - return; - } - } - - out_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 2: Set a tensor with the same shape as out tensor. And its data at - // '_index' is the same as value_tensor, and data out of '_index' to zero - - // - Step 2.1 Set slice tensor with value - - // NOTE(liym27): [ Why resize slice_tensor here? ] - // A: When do broadcasting on slice_tensor and value_tensor, the shape of - // slice_tensor should be decreased dims. - // e.g. - // x[:,0] = value_tensor - // x's shape = [3, 4], value_tensor's shape = [3] - // We get slice_dims = [3, 1], decrease_slice_dims = [3] - // If do broadcasting on Tensor with shape [3, 1] and [3], the result's - // shape is [3, 3], which cross the border; - // If do broadcasting on Tensor with shape [3] and [3], the result's shape - // is [3], which is right. 
- - slice_tensor.Resize(slice_dims_for_assign); - if (value_tensor != nullptr) { - CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims()); - // ElementwiseComputeEx can do broadcasting - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); - } else { - Tensor value_t(in->dtype()); - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, place); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); - } - slice_tensor.Resize(slice_dims); - - // - Step 2.2 Pad slice tensor with 0 - pad_e.device(eigen_place) = pad_e.constant(T(0)); - pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 3: Set out tensor with value_tensor - out_e.device(eigen_place) = out_e - pad_e; - } -}; - template class SetValueGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 599697059c4dcf..46d64333b608b7 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -174,6 +174,9 @@ class SetValueNPUKernel : public framework::OpKernel { .AddInput(std::move(index_indices)) .AddInput(val_temp) .AddOutput(out_temp) +#if (CANN_VERSION_CODE >= 504001) + .AddAttrs({{"use_locking", false}}) +#endif .Run(stream); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 5b7ccdde81097a..e2c8359beb1290 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -95,9 +93,3 @@ REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, - ops::ShapeKernel>, - ops::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu deleted file mode 100644 index c6e380a94f84db..00000000000000 --- a/paddle/fluid/operators/shape_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
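// Illustrative, framework-free 1-D sketch of the three-step trick the deleted
// SetValueKernel used so the assignment needs only strided-slice writes and an
// elementwise subtraction: (1) zero the target region of out, (2) build a pad
// tensor that is (0 - value) inside the region and 0 outside, (3) compute
// out = out - pad, which leaves value in the region and out unchanged
// elsewhere. Names and storage are assumptions for this sketch, not Paddle API.
#include <cstdio>
#include <vector>

void set_value_1d(std::vector<float>& out, int start, int end, int step,
                  const std::vector<float>& value) {
  std::vector<float> pad(out.size(), 0.0f);
  size_t v = 0;
  for (int i = start; i < end; i += step, ++v) {
    out[i] = 0.0f;             // step 1: zero the slice of out
    pad[i] = 0.0f - value[v];  // step 2: pad holds -value inside the slice
  }
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] -= pad[i];          // step 3: out = out - pad
  }
}

int main() {
  std::vector<float> x{1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
  set_value_1d(x, /*start=*/1, /*end=*/4, /*step=*/2, {9.0f, 9.0f});
  std::printf("%g %g %g %g %g\n", x[0], x[1], x[2], x[3], x[4]);  // 1 9 3 9 5
  return 0;
}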
*/ - -#include "paddle/fluid/operators/shape_op.h" -#include "paddle/fluid/platform/complex.h" - -REGISTER_OP_CUDA_KERNEL( - shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel>, - paddle::operators::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h deleted file mode 100644 index 39ebcca46a710e..00000000000000 --- a/paddle/fluid/operators/shape_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = phi::SelectedRows; - -template -class ShapeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_var = ctx.InputVar("Input"); - framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); - } else { - in_dims = in_var->Get().dims(); - } - auto* out_t = ctx.Output("Out"); - out_t->Resize({in_dims.size()}); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 7bff7b2d668347..f751ab41014c21 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc index 2e9092a6432538..a62d1b434e7643 100644 --- a/paddle/fluid/operators/shape_op_xpu.cc +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -10,12 +10,41 @@ * limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; + +template +class ShapeXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } + } +}; +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel); +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel); #endif diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 8e502fc04dbdb0..016ff54645b02e 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -15,7 +15,10 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,46 +29,6 @@ const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "SigmoidCrossEntropyWithLogitsOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension. 
But received: the shape of " - "Input(X) is [%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class SigmoidCrossEntropyWithLogitsGradOp @@ -201,12 +164,17 @@ DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR( + sigmoid_cross_entropy_with_logits, + SigmoidCrossEntropyWithLogitsInferShapeFunctor, + PD_INFER_META(phi::SigmoidCrossEntropyWithLogitsInferMeta)); REGISTER_OPERATOR( sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, - ops::SigmoidCrossEntropyWithLogitsInplaceInferer); + ops::SigmoidCrossEntropyWithLogitsInplaceInferer, + SigmoidCrossEntropyWithLogitsInferShapeFunctor); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index e2381c76f7e45a..ceb42dcf3e5921 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -60,8 +60,8 @@ class SignGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker, ops::SignGradMaker, diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index e584c1a4cce1e8..84b0f403be0389 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -44,8 +44,8 @@ Return the number of elements in the input. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, - PT_INFER_META(phi::SizeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, + PD_INFER_META(phi::SizeInferMeta)); REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 3bc55fafd81e18..3148b31a8322e2 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 956544c53609eb..d61f5aa3f634cd 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c92d468f3462c9..af29aac6b90528 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, auto& npu_ctx = reinterpret_cast(ctx); memory::Copy(npu_place, dst + i * dst_after, npu_place, src + i * src_after, sizeof(T) * size, npu_ctx.stream()); +#elif defined(PADDLE_WITH_MLU) + auto& mlu_place = place; + auto& mlu_ctx = reinterpret_cast(ctx); + memory::Copy(mlu_place, dst + i * dst_after, mlu_place, + src + i * src_after, sizeof(T) * size, mlu_ctx.stream()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Paddle is not compiled with GPU.")); diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc index 664f1031915e46..fa8a5e92712ec8 100644 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ b/paddle/fluid/operators/take_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/take_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -139,16 +140,3 @@ REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp, ops::TakeAlongAxisGradOpMaker); REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu deleted file mode 100644 index b6c62d497b379d..00000000000000 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/take_along_axis_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class TakeAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h deleted file mode 100644 index fc781dbddf2ad2..00000000000000 --- a/paddle/fluid/operators/take_along_axis_op.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class TakeAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index a7c7e33f58af6c..1de1b590a1311b 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/core/ddim.h" -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index d60976928e00cb..80c9935057cb5d 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -51,6 +51,19 @@ namespace operators { using Tensor = framework::Tensor; +inline void GetDims(const phi::DDim& dim, int axis, int* pre, int* n, + int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + struct SegmentOffsetIter { EIGEN_DEVICE_FUNC explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index 810afc901df57b..d1add111e1d24c 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -173,15 +174,3 @@ REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); - -REGISTER_OP_CPU_KERNEL(top_k_v2, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel) - -REGISTER_OP_CPU_KERNEL( - top_k_v2_grad, ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu deleted file mode 100644 index 84d8eef53bf72c..00000000000000 --- a/paddle/fluid/operators/top_k_v2_op.cu +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) 
\ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -template -class TopkV2OpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - // get the attributes - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - const bool& sorted = static_cast(ctx.Attr("sorted")); - const bool& largest = static_cast(ctx.Attr("largest")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto* k_t = ctx.Input("K"); - if (k_t) { - Tensor k_host; - framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); - k = k_host.data()[0]; - framework::DDim output_dims = output->dims(); - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - const auto& out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - // if get the topK from the last axis - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, input, input_width, input_height, k, output, - indices, largest)) { - // Successed, return. - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - // NOTE: pass lds and dim same to input width. - // NOTE: old matrix implementation of stride is different to eigen. - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - } else { - // if get topK not from the last axis, will tranpose the tensor and get - // TopK - - // first step, prepare the trans args for the tranpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = out_dims[trans[i]]; - } - // second step, tranpose the input - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - // third step, calcluate the topk - // allocate the tmp cuda memory for the tmp result - Tensor trans_ind; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - Tensor trans_out; - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind, largest)) { - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute( - ndims, dev_ctx, trans_out, output, trans); - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - } - } -}; - -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -template -class TopkV2OpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // get the real the axis and the k - if (axis < 0) axis += in_dims.size(); - const int& k = out_dims[axis]; - const int& raw_height = in_dims[axis]; - - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - int block_size = ComputeBlockSize(post * k); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - - // lanuch the cuda kernel to assign the grad - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, k); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - top_k_v2, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, float>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, double>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int64_t>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h deleted file mode 100644 index 
a808207476f3b9..00000000000000 --- a/paddle/fluid/operators/top_k_v2_op.h +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - The reason why we need the topk v2 is because the compatibility. We redefine - the NaN is maximum value - in the process of comparing. If do not add the topk v2, will affect the - inference result of model that traing - by the older version paddlepaddle. -*/ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n, - int* post) { - *pre = 1; - *post = 1; - *n = dim[axis]; - for (int i = 0; i < axis; ++i) { - (*pre) *= dim[i]; - } - for (int i = axis + 1; i < dim.size(); ++i) { - (*post) *= dim[i]; - } -} - -template -static void FullTopK(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - const int& k, const bool& largest, const bool& sorted) { - // when the k is small, will the partial sort - bool partial_sort_flag = (k * 64) < input_width; - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - // Eigen::DSizes flat2dims(input_height, input_width); - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [&largest](const std::pair& l, const std::pair& r) { - if (largest) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - } else { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - } - }); - } else { - // use the nth-element to get the K-larger or K-small element - if (largest) { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort(col_vec.begin(), col_vec.begin() + k - 1, - [&largest](const std::pair& l, - const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - } - } else { - std::nth_element( - col_vec.begin(), 
col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort( - col_vec.begin(), col_vec.begin() + k - 1, - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - } - } - for (Type j = 0; j < k; ++j) { - t_out[i * k + j] = col_vec[j].first; - t_indices[i * k + j] = col_vec[j].second; - } - } -} - -template -static void FullTopKAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data, - const int& k) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class TopkV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Get the top k elements of each row of input tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - const auto& sorted = static_cast(context.Attr("sorted")); - const auto& largest = static_cast(context.Attr("largest")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - // if K tensor is not null, will the use K tesnor as k - auto* k_t = context.Input("K"); - if (k_t) { - k = k_t->data()[0]; - framework::DDim output_dims = output->dims(); - // accroding to axis to set K value in the dim - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - const auto& out_dims = output->dims(); - if (axis + 1 == in_dims.size()) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - FullTopK(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k, largest, sorted); - } else { - // if the topk dims is not last dim, will tranpose and do topk - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - // get the trans input_dims, out_dims - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - for (size_t i = 0; i < trans.size(); i++) { - trans_out_dims[i] = out_dims[trans[i]]; 
- } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - // Allocate the temp tensor to the save the topk indices, values - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - Tensor tmp_indices; - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - // get the TopK value - FullTopK(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k, largest, sorted); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - } - } -}; - -template -class TopkV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - const size_t& k = out_dims[axis]; - - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis + 1 == in_dims.size()) { - // allocate the memory for the input_grad - - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - FullTopKAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data, k); - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - // transpose the out_grad, indices - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // Do transpose - TransCompute(ndims, dev_context, *out_grad, - &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - - // Assign the out_grad to tranpose input_grad - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - 
FullTopKAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out, k); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc index 5b8a6b3e754495..caaae02124c926 100644 --- a/paddle/fluid/operators/top_k_v2_op_mlu.cc +++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index e11070638834c4..dff5c2d3f39378 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc index 49daac2ff0da63..4d9c39be92eff0 100644 --- a/paddle/fluid/operators/top_k_v2_op_xpu.cc +++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/transpose_op.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index 63b914a31a86ae..0590b66f6f8688 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -107,8 +107,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TraceGradNoNeedBufferVarsInferer, "Input"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, - PT_INFER_META(phi::TraceInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, + PD_INFER_META(phi::TraceInferMeta)); REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker, ops::TraceGradOpMaker, ops::TraceGradOpMaker, diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index 5617d728a51dc1..fb39034c8e92c1 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
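Editor's sketch: the deleted TopkV2GradKernel above fills x_grad with zeros and then writes each out_grad entry back to the column recorded in Indices, following the "the gradient of gather is scatter" comment in the removed CUDA file. The self-contained snippet below reproduces that last-axis case with plain vectors; it is not Paddle code and the names are illustrative.

#include <cstddef>
#include <cstdint>
#include <vector>

// rows = product of all dims before the top-k axis, width = size of that axis.
std::vector<float> TopKGradLastAxis(const std::vector<float>& out_grad,
                                    const std::vector<int64_t>& indices,
                                    int rows, int width, int k) {
  // Counterpart of the memset(x_grad_data, 0, ...): elements outside the
  // selected top-k positions receive no gradient.
  std::vector<float> x_grad(static_cast<std::size_t>(rows) * width, 0.0f);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < k; ++j) {
      // Indices produced by top-k are unique within a row, so plain assignment
      // matches the FullTopKAssign behaviour shown above.
      x_grad[i * width + indices[i * k + j]] = out_grad[i * k + j];
    }
  }
  return x_grad;
}

// Example: rows=1, width=5, k=2, indices={4, 0}, out_grad={0.5, 0.25}
// yields x_grad = {0.25, 0, 0, 0, 0.5}.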
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index 9233917b0931b9..df84659a00f4c4 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/triangular_solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/solve_op.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,58 +25,6 @@ class TriangularSolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TriangularSolve"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE( - x_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor X's dimensions of TriangularSolveOp " - "should be >= 2. But received X's " - "dimensions = %d, X's shape = [%s]", - x_dims.size(), x_dims)); - - PADDLE_ENFORCE_GE( - y_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor Y's dimensions of TriangularSolveOp " - "should be >=2. 
But received Y's " - "dimensions = %d, Y's shape = [%s]", - y_dims.size(), y_dims)); - - PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_dims_n - 2], x_dims[x_dims_n - 1])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), - x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), - y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector y_broadcast_dims({expand_batch_portion}); - y_broadcast_dims.insert(y_broadcast_dims.end(), {y_dims_vec[y_dims_n - 2], - y_dims_vec[y_dims_n - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - ctx->SetOutputDim("Out", phi::make_ddim(y_broadcast_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( @@ -168,20 +119,15 @@ class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(triangular_solve, TriangularSolveInferShapeFunctor, + PD_INFER_META(phi::TriangularSolveInferMeta)); + REGISTER_OPERATOR(triangular_solve, ops::TriangularSolveOp, ops::TriangularSolveOpMaker, ops::TriangularSolveOpInferVarType, ops::TriangularSolveOpGradMaker, - ops::TriangularSolveOpGradMaker); + ops::TriangularSolveOpGradMaker, + TriangularSolveInferShapeFunctor); REGISTER_OPERATOR(triangular_solve_grad, ops::TriangularSolveGradOp); - -REGISTER_OP_CPU_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CPU_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu deleted file mode 100644 index 7df98517e84189..00000000000000 --- a/paddle/fluid/operators/triangular_solve_op.cu +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" - -namespace paddle { -namespace operators { - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CUDA_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h index 4e68add096ff28..315847b4d800e4 100644 --- a/paddle/fluid/operators/triangular_solve_op.h +++ b/paddle/fluid/operators/triangular_solve_op.h @@ -21,7 +21,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/complex_functors.h" namespace paddle { @@ -30,10 +29,10 @@ namespace operators { using Tensor = framework::Tensor; template -static void triangular_solve(const DeviceContext& context, const Tensor& x, - const Tensor& y, Tensor* out, bool upper, +static void triangular_solve(const DeviceContext &context, const Tensor &x, + const Tensor &y, Tensor *out, bool upper, bool transpose, bool unitriangular) { - // Tensor broadcast use eigen + // Tensor broadcast use eigen library std::vector x_bst_dims_vec; std::vector y_bst_dims_vec; std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y); @@ -64,15 +63,15 @@ static void triangular_solve(const DeviceContext& context, const Tensor& x, template class MatrixReduceSumFunctor { public: - void operator()(const Tensor& input, Tensor* output, - const framework::ExecutionContext& ctx); + void operator()(const Tensor &input, Tensor *output, + const framework::ExecutionContext &ctx); }; template class MatrixReduceSumFunctor { public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { + void operator()(const Tensor &in, Tensor *out, + const framework::ExecutionContext &ctx) { // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] // out_reduce_dim should be [0, 2] const std::vector in_dims = phi::vectorize(in.dims()); @@ -101,129 +100,5 @@ class MatrixReduceSumFunctor { } }; -template -class TriangularSolveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - const auto& dev_ctx = ctx.template device_context(); - triangular_solve(dev_ctx, *x, *y, out, upper, transpose, - unitriangular); - } -}; - -template -class TriangularSolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - const auto* out = ctx.Input("Out"); - const auto* dout = - ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - auto& dev_ctx = ctx.template device_context(); - - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(*x, *y); - - Tensor dy_bst(y->type()); - if (dy) { - dy->mutable_data(y->dims(), dev_ctx.GetPlace()); - dy_bst.Resize(phi::make_ddim(y_bst_dims_vec)); - dy_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate x's conjugate for complex - Tensor x_conj(x->type()); - platform::ForRange x_for_range(dev_ctx, x->numel()); - phi::funcs::ConjFunctor x_functor( - x->data(), x->numel(), - x_conj.mutable_data(x->dims(), dev_ctx.GetPlace())); - x_for_range(x_functor); - - // reuse forward to get dy_bst, and the result has been broadcated. 
- triangular_solve(dev_ctx, x_conj, *dout, &dy_bst, upper, - !transpose, unitriangular); - - if (dy_bst.dims() == dy->dims()) { - framework::TensorCopy(dy_bst, dev_ctx.GetPlace(), dev_ctx, dy); - } else { - MatrixReduceSumFunctor functor; - functor(dy_bst, dy, ctx); - dy->Resize(y->dims()); - } - } - - Tensor dx_bst(x->type()); - if (dx) { - dx->mutable_data(x->dims(), dev_ctx.GetPlace()); - dx_bst.Resize(phi::make_ddim(x_bst_dims_vec)); - dx_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate out's conjugate for complex - Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - - auto blas = phi::funcs::GetBlas(ctx); - if (transpose) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true); - blas.MatMul(out_conj, mat_dim_a, dy_bst, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } else { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true); - blas.MatMul(dy_bst, mat_dim_a, out_conj, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } - - Tensor dx_bst_upper(x->type()); - // get upper or lower triangular - dx_bst_upper.Resize(dx_bst.dims()); - dx_bst_upper.mutable_data(dev_ctx.GetPlace()); - - const auto& dims = dx_bst.dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, dx_bst.numel()); - TrilTriuCompute tril_triu_computer(dx_bst.data(), unitriangular, - !upper, H, W, - dx_bst_upper.data()); - x_for_range(tril_triu_computer); - - if (dx_bst_upper.dims() == dx->dims()) { - framework::TensorCopy(dx_bst_upper, dev_ctx.GetPlace(), dev_ctx, dx); - } else { - MatrixReduceSumFunctor functor; - functor(dx_bst_upper, dx, ctx); - dx->Resize(x->dims()); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc new file mode 100644 index 00000000000000..e36cbcf228cfbf --- /dev/null +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under +the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class TrilTriuXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + const auto* x_data = x->data(); + auto* out = context.Output("Out"); + auto* out_data = out->mutable_data(context.GetPlace()); + + const int diagonal = context.Attr("diagonal"); + const bool lower = context.Attr("lower"); + auto xshape = phi::vectorize(x->dims()); + auto& dev_ctx = context.template device_context(); + int r = 0; + if (lower) { + r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op"); + } else { + r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op"); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + tril_triu, ops::TrilTriuXPUKernel, + ops::TrilTriuXPUKernel); +#endif diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc index 54f4deac80a74e..b77775f5a8c094 100644 --- a/paddle/fluid/operators/trunc_op.cc +++ b/paddle/fluid/operators/trunc_op.cc @@ -69,8 +69,8 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 6eb7f922dfdbec..dc5a66dce16d69 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -17,8 +17,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/truncated_gaussian_random_op.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -27,26 +29,6 @@ class TruncatedGaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of TruncatedGaussianRandomOp should not be null.")); - auto shape = ctx->Attrs().Get>("shape"); - std::vector out_dim; - out_dim.reserve(shape.size()); - for (auto dim : shape) { - out_dim.push_back(static_cast(dim)); - } - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "the input shape of TruncatedGaussianRandomOp must be set, " - "But the rank of shape we received is %d", - shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(out_dim)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -99,6 +81,14 @@ Used to initialize tensors with truncated gaussian random generator. 
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
-                             ops::TruncatedGaussianRandomOp,
-                             ops::TruncatedGaussianRandomOpMaker);
+
+DECLARE_INFER_SHAPE_FUNCTOR(
+    truncated_gaussian_random, TruncatedGaussianRandomInferShapeFunctor,
+    PD_INFER_META(phi::TruncatedGaussianRandomInferMeta));
+
+REGISTER_OPERATOR(
+    truncated_gaussian_random, ops::TruncatedGaussianRandomOp,
+    ops::TruncatedGaussianRandomOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    TruncatedGaussianRandomInferShapeFunctor);
diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc
index c45b839d5b40bd..02fed3de6cef74 100644
--- a/paddle/fluid/operators/unfold_op.cc
+++ b/paddle/fluid/operators/unfold_op.cc
@@ -119,8 +119,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
-                            PT_INFER_META(phi::UnfoldInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
+                            PD_INFER_META(phi::UnfoldInferMeta));
 REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker,
                   ops::UnfoldGradMaker<paddle::framework::OpDesc>,
                   ops::UnfoldGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 5ab2004617810b..1be8f3387dbad8 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -236,7 +236,6 @@ register_unity_group(cc
     scatter_nd_add_op.cc
     scatter_op.cc
     seed_op.cc
-    segment_pool_op.cc
     select_input_op.cc
     select_output_op.cc)
 register_unity_group(cc
@@ -496,8 +495,7 @@ register_unity_group(cu
     scale_op.cu
     scatter_nd_add_op.cu
     scatter_op.cu
-    seed_op.cu
-    segment_pool_op.cu)
+    seed_op.cu)
 register_unity_group(cu
     roi_pool_op.cu
     selu_op.cu
diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
index 3e11c952d15f34..a8ced783744a96 100644
--- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc
+++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc
index bf1cdeed65a842..602376d54e0d2a 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cc
+++ b/paddle/fluid/operators/viterbi_decode_op.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/viterbi_decode_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode");
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input in ViterbiDecode must be 3. But "
-                          "received Input's rank is %d.",
-                          in_dims.size()));
-    auto length_dims = ctx->GetInputDim("Length");
-    PADDLE_ENFORCE_EQ(length_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "The rank of Length in ViterbiDecode must be 1. But "
-                          "received Length's rank is %d.",
-                          length_dims.size()));
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(
-        transition_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The rank of Transition in ViterbiDecode must be 2. But "
-            "received Transition's rank is %d.",
-            transition_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0], length_dims[0],
-          platform::errors::InvalidArgument(
-              "The batch size of Input and Length should be equal."));
-      PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0],
-                        platform::errors::InvalidArgument(
-                            "The number of tags of Input (%d) and Transition "
-                            "(%d) should be equal.",
-                            transition_dims[0], in_dims[2]));
-    }
-    ctx->SetOutputDim("Scores", length_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 namespace platform = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor,
+                            PD_INFER_META(phi::ViterbiDecodeInferMeta));
 REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp,
-                             ops::ViterbiDecodeOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    viterbi_decode, ops::ViterbiDecodeKernel,
-    ops::ViterbiDecodeKernel);
+                             ops::ViterbiDecodeOpMaker,
+                             ViterbiDecodeInferShapeFunctor);
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
deleted file mode 100644
index 68628fb2748c42..00000000000000
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_functor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/viterbi_decode_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)   \
-  case (1 << (log2_block_dim)): {                        \
-    constexpr auto kBlockDim = (1 << (log2_block_dim));  \
-    __VA_ARGS__;                                         \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)                 \
-  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__);   \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
-
-int64_t ComputeBlockSize(int64_t col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256)
-    return 512;
-  else if (col > 128)
-    return 256;
-  else if (col > 64)
-    return 128;
-  else if (col > 32)
-    return 64;
-  else if (col > 16)
-    return 32;
-  else if (col > 8)
-    return 16;
-  else
-    return 8;
-}
-
-template