diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6140340890c0e..7c570e6d0d6ee 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,14 +26,6 @@ repos:
         entry: bash ./.clang_format.hook -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
--   repo: local
-    hooks:
-    -   id: cpplint-cpp-source
-        name: cpplint
-        description: Check C++ code style using cpplint.py.
-        entry: bash ./tools/codestyle/cpplint_pre_commit.hook
-        language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 73d70c34dce8b..9a3bcd2c83305 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -9,7 +9,7 @@ if(WITH_AMD_GPU)
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
         GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
-        GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+        GIT_TAG e1c9e50333361eb826a2b35bda5d08c55dfbf16e
         PREFIX ${EIGEN_SOURCE_DIR}
         UPDATE_COMMAND ""
         CONFIGURE_COMMAND ""
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index e8bc285bdc95e..bef6d270b768f 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -338,9 +338,13 @@ function(hip_library TARGET_NAME)
         target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
         find_fluid_modules(${TARGET_NAME})
       endif()
-      if (hip_library_DEPS)
-        add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
-        target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
+        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+        target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+        list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
+      else()
+        target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
       endif()
       # cpplint code style
       foreach(source_file ${hip_library_SRCS})
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index bfe491bd6b760..ebc725f92b659 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -11,7 +11,7 @@ include_directories("/opt/rocm/thrust")
 
 list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
 
-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
 
 if(WITH_DSO)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index d725763b01d59..8bf8e7a0a6a7b 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -3,6 +3,6 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
-add_subdirectory(inference)
+#add_subdirectory(inference)
 add_subdirectory(string)
 add_subdirectory(recordio)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index a473ed7400012..4c3ed05df8e67 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -8,6 +8,8 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
 if(WITH_GPU)
   nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+elseif(WITH_AMD_GPU)
+  hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
 endif()
@@ -23,7 +25,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
 nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+hip_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
@@ -43,6 +45,9 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
 if(WITH_GPU)
   nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
   nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+elseif(WITH_AMD_GPU)
+  hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
 else()
   cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
   cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
 endif()
@@ -55,7 +60,7 @@ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
         framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
 
 cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
 
-cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
+hip_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) @@ -63,11 +68,11 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) -cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) +hip_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) -nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) +hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. @@ -80,7 +85,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) cc_library(backward SRCS backward.cc DEPS net_op) -cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op) +hip_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) @@ -92,11 +97,11 @@ framework_proto backward glog lod_rank_table feed_fetch_method) cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) -cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) -cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry +hip_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) +hip_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) -cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) +hip_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator) cc_test(init_test SRCS init_test.cc DEPS init) @@ -105,7 +110,7 @@ cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_contex cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) cc_test(channel_test SRCS channel_test.cc) -cc_test(tuple_test SRCS tuple_test.cc ) -cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op +hip_test(tuple_test SRCS tuple_test.cc ) +hip_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op conditional_block_op while_op assign_op print_op executor proto_desc) diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index c0523f3c795b1..270022af4f991 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -47,7 +47,7 @@ struct CastDataType { auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, out_begin, CastDataTypeFunctor()); -#ifdef 
__NVCC__ +#ifdef __HIPCC__ } else if (platform::is_gpu_place(in_.place())) { platform::Transform trans; auto* context = static_cast(ctx_); diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index f05b5ee3faee8..465207ddb3ba2 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -17,7 +17,11 @@ limitations under the License. */ #include #include #include +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/framework/dim_hip.h" +#else #include "paddle/fluid/framework/dim.h" +#endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/variant.h" diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index bf1a705ef50b6..5e2fe79179baa 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -4,6 +4,8 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda) +hip_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + dynload_hip) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) @@ -12,7 +14,11 @@ cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) if(WITH_GPU) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) else() + if(WITH_AMD_GPU) + set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) + else() set(multi_devices_graph_builder_deps) + endif() endif() cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle ${multi_devices_graph_builder_deps}) diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 9180903b864d0..6a403d2ead976 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -59,7 +59,7 @@ void FetchOpHandle::RunImpl() { auto &scope = scopes[i]; auto &t = scope->FindVar(var_name)->Get(); if (platform::is_gpu_place(var->place_)) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]); dev_ctxes_[t.place()]->Wait(); #endif diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 128a5344fbb8c..1e40077d6f47f 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #endif @@ -28,7 +28,7 @@ namespace paddle { namespace framework { namespace details { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, @@ -97,7 +97,7 @@ 
std::unique_ptr MultiDevSSAGraphBuilder::Build( if (is_forwarding) { if (var_names.size() == 1 && var_names[0] == loss_var_name_) { // Insert ScaleCost OpHandle -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p); #else auto *communication_dev_ctx = @@ -135,7 +135,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( og_has_been_broadcast.count(og) == 0) { // is param grad // Insert NCCL AllReduce Op og_has_been_broadcast.insert(og); -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) result.ops_.emplace_back( new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); auto *op_handle = result.ops_.back().get(); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index d3c8e582cf2cd..796a1db80cd66 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -26,7 +26,7 @@ class Scope; namespace details { class MultiDevSSAGraphBuilder : public SSAGraphBuilder { public: -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) MultiDevSSAGraphBuilder(const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, @@ -47,7 +47,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &local_scopes_; std::unordered_set grad_names_; -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) platform::NCCLContextMap *nccl_ctxs_; #endif }; diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index 55b5f113589e0..44cd1c69ec00c 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -63,8 +63,8 @@ void NCCLAllReduceOpHandle::RunImpl() { auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; all_reduce_calls.emplace_back([=] { - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - buffer, buffer, numel, static_cast(dtype), ncclSum, + PADDLE_ENFORCE(platform::dynload::rcclAllReduce( + buffer, buffer, numel, static_cast(dtype), rcclSum, comm, stream)); }); } diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index ad14a3c5cb462..5c0b9f2e5677b 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -20,7 +20,11 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/rccl_helper.h" +#else #include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index e4194a7442f67..5572cdd148a8b 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -36,6 +36,10 @@ OpHandleBase::~OpHandleBase() { for (auto &ev : events_) { PADDLE_ENFORCE(cudaEventDestroy(ev.second)); } +#elif defined(PADDLE_WITH_HIP) + for (auto &ev : events_) { + PADDLE_ENFORCE(hipEventDestroy(ev.second)); + } #endif } @@ -49,6 +53,15 @@ void OpHandleBase::Run(bool 
use_event) { cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); } } +#elif defined(PADDLE_WITH_HIP) + if (events_.empty() && use_event) { + for (auto &p : dev_ctxes_) { + int dev_id = boost::get(p.first).device; + PADDLE_ENFORCE(hipSetDevice(dev_id)); + PADDLE_ENFORCE( + hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); + } + } #else PADDLE_ENFORCE(!use_event); #endif @@ -64,6 +77,15 @@ void OpHandleBase::Run(bool use_event) { PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream)); } } +#elif defined(PADDLE_WITH_HIP) + if (use_event) { + for (auto &p : dev_ctxes_) { + int dev_id = boost::get(p.first).device; + auto stream = + static_cast(p.second)->stream(); + PADDLE_ENFORCE(hipEventRecord(events_.at(dev_id), stream)); + } + } #endif } @@ -80,6 +102,18 @@ void OpHandleBase::Wait(platform::DeviceContext *waited_dev) { PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); } } +#elif defined(PADDLE_WITH_HIP) + if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) { + for (auto &dev_ctx : dev_ctxes_) { + dev_ctx.second->Wait(); + } + } else { + auto stream = + static_cast(waited_dev)->stream(); + for (auto &ev : events_) { + PADDLE_ENFORCE(hipStreamWaitEvent(stream, ev.second, 0)); + } + } #else for (auto &dev_ctx : dev_ctxes_) { dev_ctx.second->Wait(); diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index d7a541ac4bb83..a5e4e1fb38931 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -37,6 +37,8 @@ class OpHandleBase { #ifdef PADDLE_WITH_CUDA std::unordered_map events_; +#elif defined(PADDLE_WITH_HIP) + std::unordered_map events_; #endif OpHandleBase() {} diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 0a6f6129b812c..88f7355a63924 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -36,7 +36,7 @@ void ScaleLossGradOpHandle::RunImpl() { if (platform::is_cpu_place(place_)) { *tmp = coeff_; } else { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) auto stream = static_cast(this->dev_ctxes_[place_]) ->stream(); diff --git a/paddle/fluid/framework/dim_hip.h b/paddle/fluid/framework/dim_hip.h new file mode 100644 index 0000000000000..1e670b13d560c --- /dev/null +++ b/paddle/fluid/framework/dim_hip.h @@ -0,0 +1,430 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/hostdevice.h" + +#ifdef __HIPCC__ +#define POSTHOSTDEVICE restrict(amp, cpu) +#define POSTDEVICE restrict(amp) +#define POSTHOST restrict(cpu) +#else +#define POSTHOSTDEVICE +#define POSTDEVICE +#define POSTHOST +#endif + + +namespace paddle { +namespace framework { + +// Statically sized, statically indexed dimension +template +struct Dim { + static constexpr int dimensions = i; + + template + Dim(int64_t _head, Args... _tail) POSTHOSTDEVICE : head(_head), tail(_tail...) { + static_assert(sizeof...(_tail) == i - 1, + "Dim initialized with the wrong number of parameters"); + } + + Dim(int64_t _head, const Dim& _tail) POSTHOSTDEVICE : head(_head), tail(_tail) {} + + Dim() POSTHOSTDEVICE : head(0), tail() {} + + /** Construct a Dim from a linear index and size. Uses Fortran order + * indexing. */ + Dim(int64_t idx, const Dim& size) POSTHOSTDEVICE + : head(idx % size.head), tail(idx / size.head, size.tail) {} + + /** Construct a Dim with each dimension set to the given index */ + Dim(int64_t idx) POSTHOSTDEVICE : head(idx), tail(idx) {} + + bool operator==(const Dim& o) const POSTHOSTDEVICE { + return (head == o.head) && (tail == o.tail); + } + + bool operator!=(const Dim& o) const POSTHOSTDEVICE { return !(*this == o); } + + int64_t& operator[](int idx) POSTHOSTDEVICE; + int64_t operator[](int idx) const POSTHOSTDEVICE; + + std::string to_string() const POSTHOST; + + int64_t head; + Dim tail; +}; + +// Base case specialization +template <> +struct Dim<0> { + static constexpr int dimensions = 0; + + Dim(int64_t _head) POSTHOSTDEVICE {} + + Dim() POSTHOSTDEVICE {} + + Dim(int idx, const Dim<0>& size) POSTHOSTDEVICE { +#ifndef __HIP_DEVICE_COMPILE__ + if (idx > 0) { + ;//throw std::invalid_argument("Index out of range."); + } +#else + PADDLE_ASSERT(idx == 0); +#endif + } + + bool operator==(const Dim<0>& o) const POSTHOSTDEVICE { return true; } + + bool operator!=(const Dim<0>& o) const POSTHOSTDEVICE { return false; } + + int64_t& operator[](int idx) POSTHOSTDEVICE; + int64_t operator[](int idx) const POSTHOSTDEVICE; + +}; + +namespace { + +// Helper for accessing Dim classes +template +struct DimGetter { + // Return a copy if Dim is const + template + static int64_t impl(const D& d) POSTHOSTDEVICE { + return DimGetter::impl(d.tail); + } + // Return a reference if Dim is mutable + template + static int64_t& impl(D& d) POSTHOSTDEVICE { + return DimGetter::impl(d.tail); + } +}; + +// Eureka! We found the element! +template <> +struct DimGetter<0> { + // Return a copy if Dim is const + template + static int64_t impl(const D& d) POSTHOSTDEVICE { + return d.head; + } + // Return a reference if Dim is mutable + template + static int64_t& impl(D& d) POSTHOSTDEVICE { + return d.head; + } +}; + +template +int64_t& indexer(Dim& dim, int idx) POSTHOSTDEVICE { +#ifndef __HIP_DEVICE_COMPILE__ + if (idx < 0) { + ;//throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + PADDLE_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +int64_t& indexer<0>(Dim<0>& dim, int idx) POSTHOSTDEVICE { +#ifndef __HIP_DEVICE_COMPILE__ + static int64_t head = 0; + return head;//throw std::invalid_argument("Invalid index"); +#else + PADDLE_ASSERT(false); +#if CUDA_VERSION < 8000 + // On CUDA versions previous to 8.0, only __shared__ variables + // could be declared as static in the device code. 
+ int64_t head = 0; +#else + static int64_t head = 0; +#endif + return head; +#endif +} + +template +int64_t indexer(const Dim& dim, int idx) POSTHOSTDEVICE { +#ifndef __HIP_DEVICE_COMPILE__ + if (idx < 0) { + ;//throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + PADDLE_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +int64_t indexer<0>(const Dim<0>& dim, int idx) POSTHOSTDEVICE { +#ifndef __HIP_DEVICE_COMPILE__ + throw std::invalid_argument("Invalid index"); +#else + PADDLE_ASSERT(false); +#if CUDA_VERSION < 8000 + // On CUDA versions previous to 8.0, only __shared__ variables + // could be declared as static in the device code. + int64_t head = 0; +#else + static int64_t head = 0; +#endif + return head; +#endif +} + +} // namespace +// Static access to constant Dim +template +int64_t get(const Dim& d) POSTHOSTDEVICE { + return DimGetter::impl(d); +} + +// Static access to mutable Dim +template +int64_t& get(Dim& d) POSTHOSTDEVICE { + return DimGetter::impl(d); +} + +// Dynamic access to constant Dim +template +int64_t Dim::operator[](int i) const POSTHOSTDEVICE { + return indexer(*this, i); +} + +// Dynamic access to mutable Dim +template +int64_t& Dim::operator[](int i) POSTHOSTDEVICE { + return indexer(*this, i); +} + +// Dynamic access to constant Dim +inline int64_t Dim<0>::operator[](int i) const POSTHOSTDEVICE { + return indexer(*this, i); +} + +// Dynamic access to mutable Dim +inline int64_t& Dim<0>::operator[](int i) POSTHOSTDEVICE { + return indexer(*this, i); +} + +// Dynamic access to constant Dim +// without std::enable_if will try to instantiate this on get<0>(d) +template +typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, + int i) POSTHOSTDEVICE { + return d[i]; +} + +// Dynamic access to mutable Dim +template +typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, + int i) POSTHOSTDEVICE { + return d[i]; +} + +// Dot product of two dims +template +int64_t linearize(const Dim& a, const Dim& b) POSTHOSTDEVICE { + return a.head * b.head + linearize(a.tail, b.tail); +} + +// Base case dot product of two Dims +// Notice it is inline because it is no longer a template +template <> +inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) POSTHOSTDEVICE { + return 0; +} + +// Product of a Dim +template +int64_t product(const Dim& a, int prod = 1) POSTHOSTDEVICE { + return prod * a.head * product(a.tail); +} + +// Base case product of a Dim +// Notice it is inline because it is no longer a template +template <> +inline int64_t product(const Dim<0>& a, int prod) POSTHOSTDEVICE { + return prod; +} + +// Is 0 <= idx_i < size_i for all i? +template +bool contained(const Dim& idx, const Dim& size) POSTHOSTDEVICE { + return ((0 <= idx.head) && (idx.head < size.head) && + contained(idx.tail, size.tail)); +} + +// Base case of is 0 <= idx_i < size_i ? +// Notice it is inline because it is no longer a template +template <> +inline bool contained(const Dim<0>& idx, const Dim<0>& size) POSTHOSTDEVICE { + return true; +} + +/** + * \brief Compute exclusive prefix-multiply of a Dim. 
+ */ +template +Dim ex_prefix_mul(const Dim& src, int mul = 1) POSTHOSTDEVICE { + return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); +} + +///\cond HIDDEN +// Base case of ex_prefix_mul +// Notice it is inline because it is no longer a template +template <> +inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) POSTHOSTDEVICE { + return Dim<0>(); +} +///\endcond + +/** + * Add two dimensions together + */ +template +Dim dim_plus(const Dim& a, const Dim& b) POSTHOSTDEVICE { + return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); +} + +// Base case +template <> +inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) POSTHOSTDEVICE { + return Dim<0>(); +} + +template +Dim operator+(const Dim& lhs, const Dim& rhs) POSTHOSTDEVICE { + return dim_plus(lhs, rhs); +} + +/** + * Multiply two dimensions together + */ +template +Dim dim_mult(const Dim& a, const Dim& b) POSTHOSTDEVICE { + return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); +} + +// Base case +template <> +inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) POSTHOSTDEVICE { + return Dim<0>(); +} + +template +Dim operator*(const Dim& lhs, const Dim& rhs) { + return dim_mult(lhs, rhs); +} + +/** + * \brief Normalize strides to ensure any dimension with extent 1 + * has stride 0. + * + * \param size Dim object containing the size of an array + * \param stride Dim object containing stride of an array + * \return Dim object the same size as \p size with normalized strides + * + */ + +template +Dim normalize_strides(const Dim& size, const Dim& stride) POSTHOSTDEVICE { + int norm_stride = size.head == 1 ? 0 : stride.head; + return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); +} + +///\cond HIDDEN + +template <> +inline Dim<0> normalize_strides(const Dim<0>& size, + const Dim<0>& stride) POSTHOSTDEVICE { + return Dim<0>(); +} + +///\endcond + +/** + * Helper function to create a Dim + * + * \param idxes The type of Dim constructed depends on the number of params + * + */ + +template +Dim make_dim(Args... idxes) POSTHOSTDEVICE { + return Dim(idxes...); +} + +// Allows us to output a Dim +// XXX For some reason, overloading fails to resolve this correctly +template +typename std::enable_if<(i > 1), std::ostream&>::type operator<<( + std::ostream& os, const Dim& d) { + os << d.head << ", " << d.tail; + return os; +} + +// Base case that allows us to output a Dim +// XXX I wish this could be an overload instead of a template +template +typename std::enable_if<(i == 1), std::ostream&>::type operator<<( + std::ostream& os, const Dim& d) { + os << d.head; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { + return os; +} + +template +std::string Dim::to_string() const POSTHOST { + std::stringstream stream; + + stream << *this; + + return stream.str(); +} + +template +Dim linear_to_dimension(int linear_index, Dim extents) POSTHOSTDEVICE { + Dim result; + + for (int i = 0; i < D - 1; ++i) { + result[i] = linear_index % extents[i]; + linear_index /= extents[i]; + } + + result[D - 1] = linear_index; + + return result; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dim_test.cu b/paddle/fluid/framework/dim_test.cu index 0f384d12e6f04..fab8a01bbac3d 100644 --- a/paddle/fluid/framework/dim_test.cu +++ b/paddle/fluid/framework/dim_test.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#include "hip/hip_runtime.h" #include #include @@ -34,7 +35,7 @@ TEST(Dim, Equality) { // construct a Dim on the GPU thrust::device_vector> t(2); - test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); + hipLaunchKernelGGL((test), dim3(1), dim3(1), 0, 0, thrust::raw_pointer_cast(t.data())); a = t[0]; EXPECT_EQ(paddle::framework::get<0>(a), 5); EXPECT_EQ(paddle::framework::get<1>(a), 6); @@ -61,7 +62,7 @@ TEST(Dim, Equality) { // dynamic access on GPU thrust::device_vector r(1); - dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); + hipLaunchKernelGGL((dyn_idx_gpu), dim3(1), dim3(1), 0, 0, thrust::raw_pointer_cast(r.data())); int64_t res = r[0]; EXPECT_EQ(res, 6); diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc index 3c0d93642ac41..dfe551692da11 100644 --- a/paddle/fluid/framework/init.cc +++ b/paddle/fluid/framework/init.cc @@ -62,6 +62,24 @@ void InitP2P(int count) { } }); #endif +#ifdef PADDLE_WITH_HIP + std::call_once(p2p_init_flag, [&]() { + for (int i = 0; i < count; ++i) { + for (int j = 0; j < count; ++j) { + if (i == j) continue; + int can_acess = -1; + PADDLE_ENFORCE(hipDeviceCanAccessPeer(&can_acess, i, j), + "Failed to test P2P access."); + if (can_acess != 1) { + LOG(WARNING) << "Cannot enable P2P access from " << i << " to " << j; + } else { + hipSetDevice(i); + hipDeviceEnablePeerAccess(j, 0); + } + } + } + }); +#endif } void InitDevices() { @@ -71,7 +89,7 @@ void InitDevices() { places.emplace_back(platform::CPUPlace()); int count = 0; -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) try { count = platform::GetCUDADeviceCount(); } catch (const std::exception &exp) { diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 4f130d2659004..723e3a54d0fec 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) #include #include #endif diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu index be65da5ba230e..b0c38da2e4527 100644 --- a/paddle/fluid/framework/lod_tensor_test.cu +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include +#include #include #include "gtest/gtest.h" @@ -38,8 +37,8 @@ TEST(LoD, data) { auto& v = lod[0]; paddle::platform::CUDAPlace gpu(0); - test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); - cudaDeviceSynchronize(); + hipLaunchKernelGGL((test), dim3(1), dim3(1), 0, 0, v.CUDAMutableData(gpu), v.size()); + hipDeviceSynchronize(); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(v[i], i * 2); } @@ -63,8 +62,8 @@ TEST(LoDTensor, LoDInGPU) { auto lod = lod_tensor.lod(); - test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); - cudaDeviceSynchronize(); + hipLaunchKernelGGL((test), dim3(1), dim3(8), 0, 0, lod[0].CUDAMutableData(place), lod[0].size()); + hipDeviceSynchronize(); for (size_t i = 0; i < src_lod[0].size(); ++i) { EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index d57f82510833d..69e8029099724 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -11,8 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - +#include #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/mixed_vector.h" @@ -47,7 +46,7 @@ static __global__ void multiply_10(int* ptr) { } } -cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { +hipStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { return reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -61,7 +60,7 @@ TEST(mixed_vector, GPU_VECTOR) { ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu(0); - multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); + hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu), tmp.MutableData(gpu)); for (int i = 0; i < 10; ++i) { ASSERT_EQ(tmp[i], i * 10); @@ -82,11 +81,11 @@ TEST(mixed_vector, MultiGPU) { ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu0(0); paddle::platform::SetDeviceId(0); - multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); + hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu0), tmp.MutableData(gpu0)); paddle::platform::CUDAPlace gpu1(1); auto* gpu1_ptr = tmp.MutableData(gpu1); paddle::platform::SetDeviceId(1); - multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); + hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu1), gpu1_ptr); for (int i = 0; i < 10; ++i) { ASSERT_EQ(tmp[i], i * 100); } diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index f1424f13b4451..67bea8af50418 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -220,7 +220,7 @@ class OpKernelRegistrar : public Registrar { // TODO(fengjiayi): The following macros // seems ugly, do we have better method? 
-#ifndef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU) #else #define USE_OP_KERNEL(op_type) \ diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a3b4a8c0829ae..60cac782c36eb 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -78,7 +78,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { void OperatorBase::Run(const Scope& scope, const platform::Place& place) { if (platform::is_gpu_place(place)) { -#ifndef PADDLE_WITH_CUDA +#if !(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) PADDLE_THROW("Cannot run operator on place %s", place); #else auto dev_id = boost::get(place).device; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index b7a7c69b4c849..549beb720ed27 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -303,7 +303,7 @@ class ExecutionContext { return device_context_; } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) const inline platform::CUDADeviceContext& cuda_device_context() const { PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); return *reinterpret_cast( diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7be93fa6002ae..e789cfcad1fe7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -22,6 +22,10 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/rccl_helper.h" +#endif + #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" @@ -38,7 +42,7 @@ class ParallelExecutorPrivate { Scope *global_scope_; std::unique_ptr executor_; -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) std::unique_ptr nccl_ctxs_; #endif }; @@ -61,7 +65,7 @@ ParallelExecutor::ParallelExecutor( } // Bcast Parameters to all GPUs -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); #endif if (platform::is_gpu_place(places[0]) && @@ -72,7 +76,7 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, params, member_->local_scopes_, member_->nccl_ctxs_.get()); @@ -100,7 +104,7 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToGPUs( const ProgramDesc &startup_program) const { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) auto *main_scope = member_->local_scopes_[0]; for (auto *var_desc : startup_program.Block(0).AllVars()) { @@ -114,7 +118,6 @@ void ParallelExecutor::BCastParamsToGPUs( if (paddle::platform::is_gpu_place(main_tensor.place())) { size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; @@ -129,8 +132,15 @@ void ParallelExecutor::BCastParamsToGPUs( buffer = t->mutable_data(place, main_tensor.type()); } auto &nccl_ctx = member_->nccl_ctxs_->at(place); +#ifdef PADDLE_WITH_CUDA + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); platform::dynload::ncclBcast(buffer, numel, data_type, 0, nccl_ctx.comm_, nccl_ctx.stream()); +#else + rcclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + platform::dynload::rcclBcast(buffer, numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); +#endif } } else { platform::CPUPlace cpu; diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 7a48390440083..487e73576100b 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -129,7 +129,7 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { holder_.reset(new PlaceholderImpl( boost::get(place), size, type)); } else if (platform::is_gpu_place(place)) { -#ifndef PADDLE_WITH_CUDA +#if !(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); } #else diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index e1012de2ec36e..06da13a6ec686 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -75,7 +75,7 @@ TEST(Tensor, MutableData) { EXPECT_EQ(p1, p2); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) { framework::Tensor src_tensor; float* p1 = nullptr; @@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) { framework::Tensor src_tensor; framework::Tensor dst_tensor; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1d864af011bce..c0eeb0a7c7daf 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -36,7 +36,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { auto src_gpu_place = boost::get(src_place); @@ -216,7 +216,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, PADDLE_ENFORCE(size < std::numeric_limits::max(), "Index overflow when 
writing tensor"); if (platform::is_gpu_place(tensor.place())) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); auto& gpu_dev_ctx = @@ -282,7 +282,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, void* buf; auto ctx = platform::CPUDeviceContext(); if (platform::is_gpu_place(dev_ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 38b6d1c5c46dc..ff46a5c5e8fbf 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -65,7 +65,7 @@ void TensorFromVector(const std::vector& src, memory::Copy(boost::get(dst_place), dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( boost::get(dst_place), dst_ptr, src_place, src_ptr, @@ -101,7 +101,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, memory::Copy(dst_place, dst_ptr, boost::get(src.place()), src_ptr, size); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy( dst_place, dst_ptr, boost::get(src.place()), diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu index 4766ec28aa3cf..88c7c4724669a 100644 --- a/paddle/fluid/framework/tensor_util_test.cu +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" @@ -52,14 +53,14 @@ TEST(TensorContainsNAN, GPU) { { Tensor tensor; float* buf = tensor.mutable_data({3}, gpu); - FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + hipLaunchKernelGGL((FillNAN), dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); cuda_ctx->Wait(); ASSERT_TRUE(TensorContainsNAN(tensor)); } { Tensor tensor; float16* buf = tensor.mutable_data({3}, gpu); - FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + hipLaunchKernelGGL((FillNAN), dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); cuda_ctx->Wait(); ASSERT_TRUE(TensorContainsNAN(tensor)); } @@ -73,14 +74,14 @@ TEST(TensorContainsInf, GPU) { { Tensor tensor; float* buf = tensor.mutable_data({3}, gpu); - FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + hipLaunchKernelGGL((FillInf), dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); cuda_ctx->Wait(); ASSERT_TRUE(TensorContainsInf(tensor)); } { Tensor tensor; float16* buf = tensor.mutable_data({3}, gpu); - FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + hipLaunchKernelGGL((FillInf), dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); cuda_ctx->Wait(); ASSERT_TRUE(TensorContainsInf(tensor)); } diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index b9c3fc31c1523..28cb34f09cb87 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -1,5 +1,7 @@ if(${WITH_GPU}) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) +elseif (WITH_AMD_GPU) + hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info) endif(${WITH_GPU}) diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 876837838648d..d5706b763634c 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -176,7 +176,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { } BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) if (system_allocator_->UseGpu()) { if ((total_used_ + total_free_) == 0) { // Compute the maximum allocation size for the first allocation. diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index a45f8c33ee595..a91812ced0faa 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -187,6 +187,110 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } #endif +#ifdef PADDLE_WITH_HIP + +void* GPUAllocator::Alloc(size_t& index, size_t size) { + // CUDA documentation doesn't explain if hipMalloc returns nullptr + // if size is 0. We just make sure it does. + if (size <= 0) return nullptr; + void* p; + int prev_id; + hipGetDevice(&prev_id); + if (prev_id != gpu_id_) { + hipSetDevice(gpu_id_); + } + + hipError_t result = hipMalloc(&p, size); + + if (prev_id != gpu_id_) { + hipSetDevice(prev_id); + } + + if (result == hipSuccess) { + index = 0; + gpu_alloc_size_ += size; + return p; + } else { + LOG(WARNING) + << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use " + "environment variable to a lower value. 
Current value is " + << FLAGS_fraction_of_gpu_memory_to_use; + return nullptr; + } +} + +void GPUAllocator::Free(void* p, size_t size, size_t index) { + hipError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = hipFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = hipHostFree(p); + } + + if (err != hipSuccess) { + PADDLE_ENFORCE(err, "hipFree failed in GPUAllocator::Free."); + } +} + +bool GPUAllocator::UseGpu() const { return true; } + +// PINNED memory allows direct DMA transfers by the GPU to and from system +// memory. It’s locked to a physical address. +void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { + if (size <= 0) return nullptr; + + // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size + // of host pinned allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. + size_t usable = + paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_; + + if (size > usable) { + LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB pinned memory." + << ", available " << usable / 1024.0 / 1024.0 << " MB"; + return nullptr; + } + + void* p; + // PINNED memory is visible to all HIP contexts. + hipError_t result = hipHostMalloc(&p, size); + + if (result == hipSuccess) { + index = 1; // PINNED memory + cuda_pinnd_alloc_size_ += size; + return p; + } else { + LOG(WARNING) << "hipMallocHost failed."; + return nullptr; + } + + return nullptr; +} + +void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { + hipError_t err; + PADDLE_ASSERT(index == 1); + + PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size); + cuda_pinnd_alloc_size_ -= size; + err = hipHostFree(p); + + if (err != hipSuccess) { + PADDLE_ENFORCE(err, "hipFreeHost failed in GPUPinnedAllocator::Free."); + } +} + +bool CUDAPinnedAllocator::UseGpu() const { return false; } + +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e3c50ef6483c6..bbcf1b1d40a92 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -41,7 +41,7 @@ class CPUAllocator : public SystemAllocator { virtual bool UseGpu() const; }; -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index 3e1926f632c57..a3a2ae3f0cdd5 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) { TestAllocator(a, 0); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a(0); TestAllocator(a, 2048); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index eddcaab8befda..77102131c29c5 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -97,5 +97,76 @@ void Copy( #endif +#ifdef PADDLE_WITH_HIP +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, hipStream_t stream) { + 
platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, hipStream_t stream) { + platform::SetDeviceId(dst_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, hipStream_t stream) { + if (dst_place == src_place) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream); + } else { + platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, + stream); + } +} + +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, + platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::CUDAPinnedPlace dst_place, void* dst, + platform::CPUPlace src_place, const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::CUDAPinnedPlace dst_place, void* dst, + platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::CUDAPinnedPlace dst_place, void* dst, + platform::CUDAPlace src_place, const void* src, size_t num, + hipStream_t stream) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, + platform::CUDAPinnedPlace src_place, const void* src, size_t num, + hipStream_t stream) { + platform::SetDeviceId(dst_place.device); + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); +} + +#endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index 7b2b8eb0662fb..290f44f801629 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -53,6 +53,28 @@ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, cudaStream_t stream); +#endif + +#ifdef PADDLE_WITH_HIP + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. 
+ * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + hipStream_t stream); + #endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc index 09f82166beab3..7860ee31142a1 100644 --- a/paddle/fluid/memory/memory.cc +++ b/paddle/fluid/memory/memory.cc @@ -42,6 +42,8 @@ void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); VLOG(10) << " pointer=" << p; + // For debug + memset(p, 0, size); return p; } @@ -56,7 +58,7 @@ size_t Used(platform::CPUPlace place) { return GetCPUBuddyAllocator()->Used(); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { static BuddyAllocator** as = NULL; @@ -151,7 +153,7 @@ size_t Usage::operator()(const platform::CPUPlace& cpu) const { } size_t Usage::operator()(const platform::CUDAPlace& gpu) const { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) return Used(gpu); #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); @@ -159,7 +161,7 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const { } size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) return Used(cuda_pinned); #else PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device."); diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc index 03829702a0c5c..67cf86f4db615 100644 --- a/paddle/fluid/memory/memory_test.cc +++ b/paddle/fluid/memory/memory_test.cc @@ -83,7 +83,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { } } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) size_t align(size_t size, paddle::platform::CUDAPlace place) { size += sizeof(paddle::memory::detail::Metadata); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 84eabab563e34..4adf80386b9d1 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -92,7 +92,7 @@ function(op_library TARGET) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + hip_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} @@ -147,8 +147,8 @@ function(op_library TARGET) endif() # pybind USE_OP_DEVICE_KERNEL for MIOPEN - if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0) - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") + if (WITH_AMD_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() # pybind USE_OP_DEVICE_KERNEL for MKLDNN @@ -173,6 +173,9 @@ add_subdirectory(nccl) if(WITH_GPU) op_library(nccl_op DEPS nccl_common) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") +elseif (WITH_AMD_GPU) + op_library(nccl_op DEPS nccl_common) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") else() set(DEPS_OPS 
${DEPS_OPS} nccl_op) endif() @@ -228,6 +231,8 @@ op_library(parallel_do_op DEPS executor) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) +elseif (WITH_AMD_GPU) + op_library(conv_op DEPS vol2col depthwise_conv im2col) else() op_library(conv_op DEPS vol2col im2col) endif() @@ -258,13 +263,13 @@ endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") -cc_test(gather_test SRCS gather_test.cc DEPS tensor) -cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) -cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) -cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) -cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) -cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) -cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) -cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) +hip_test(gather_test SRCS gather_test.cc DEPS tensor) +hip_test(net_op_test SRCS net_op_test.cc DEPS net_op) +hip_test(scatter_test SRCS scatter_test.cc DEPS tensor) +hip_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) +hip_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) +hip_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) +hip_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) +hip_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/accuracy_op.cu b/paddle/fluid/operators/accuracy_op.cu index 630a4a2df2ca8..891f54360d1ac 100644 --- a/paddle/fluid/operators/accuracy_op.cu +++ b/paddle/fluid/operators/accuracy_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include #include #include "paddle/fluid/operators/accuracy_op.h" @@ -82,9 +83,9 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { return; } - AccuracyCudaKernel< - PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - num_samples, infer_width, indices_data, label_data, correct_data, + hipLaunchKernelGGL((AccuracyCudaKernel< + PADDLE_CUDA_NUM_THREADS>), dim3(1), dim3(PADDLE_CUDA_NUM_THREADS), 0, stream, + num_samples, int(infer_width), indices_data, label_data, correct_data, accuracy_data, total_data); } }; diff --git a/paddle/fluid/operators/adagrad_op.cu b/paddle/fluid/operators/adagrad_op.cu index e798101ca6a3a..fb1489172bae4 100644 --- a/paddle/fluid/operators/adagrad_op.cu +++ b/paddle/fluid/operators/adagrad_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #define EIGEN_USE_GPU #include "paddle/fluid/operators/adagrad_op.h" #include "paddle/fluid/operators/math/math_function.h" @@ -98,10 +99,10 @@ struct SparseAdagradFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid2(1, merge_rows.size()); - SparseAdagradFunctorKernel< - T, 256><<), dim3(grid2), dim3(threads), 0, reinterpret_cast(context) - .stream()>>>( + .stream(), grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr, param_data, moment_data, grad_width, epsilon); } diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index 6ceacc39924a7..de07864bbed77 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/miopen_helper.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -26,9 +26,9 @@ namespace operators { using Tensor = framework::Tensor; using DataLayout = framework::DataLayout; template -using CudnnDataType = platform::CudnnDataType; +using MIOpenDataType = platform::MIOpenDataType; template -using BatchNormParamType = typename CudnnDataType::BatchNormParamType; +using BatchNormParamType = typename MIOpenDataType::BatchNormParamType; void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, int *N, int *C, int *H, int *W, int *D) { @@ -57,6 +57,7 @@ class BatchNormKernel void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use CUDAPlace."); +#if 1 double epsilon = static_cast(ctx.Attr("epsilon")); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); @@ -74,24 +75,26 @@ class BatchNormKernel ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // ------------------- cudnn descriptors --------------------- - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t bn_param_desc_; + miopenBatchNormMode_t mode_; + PADDLE_ENFORCE(platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE( + platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#endif +# if 0 if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " << "CUDNN_BN_MIN_EPSILON. Setting it to " << "CUDNN_BN_MIN_EPSILON instead."; } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#if CUDNN_VERSION_MIN(7, 0, 0) +#endif +# if 0 mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; #else - mode_ = CUDNN_BATCHNORM_SPATIAL; + mode_ = miopenBNSpatial; #endif VLOG(1) << "Setting descriptors."; @@ -104,12 +107,18 @@ class BatchNormKernel dims = {N, C, H, W, D}; strides = {H * W * D * C, 1, W * D * C, D * C, C}; } - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + + if (x_dims.size() > 4) + { + PADDLE_THROW("miopen only supports 4D tensors, dim=%d not allowed", dims.size()); + } + // Need review. 
+ PADDLE_ENFORCE(platform::dynload::miopenSet4dTensorDescriptor( + data_desc_, MIOpenDataType::type, + dims.data()[0], dims.data()[1], dims.data()[2], dims.data()[3])); // Note: PERSISTENT not implemented for inference - CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_)); + PADDLE_ENFORCE(platform::dynload::miopenDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); @@ -133,7 +142,7 @@ class BatchNormKernel functor(dev_ctx, saved_mean, static_cast>(0)); functor(dev_ctx, saved_variance, static_cast>(0)); - auto handle = dev_ctx.cudnn_handle(); + auto handle = dev_ctx.miopen_handle(); // Now, depending on whether we are running test or not, we have two paths. if (is_test) { @@ -146,42 +155,43 @@ class BatchNormKernel PADDLE_ENFORCE_EQ(est_mean->dims()[0], C); PADDLE_ENFORCE_EQ(est_var->dims()[0], C); - CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference( + // Need review + PADDLE_ENFORCE(platform::dynload::miopenBatchNormalizationForwardInference( handle, // Note: PERSISTENT not implemented for inference - CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x->template data(), - data_desc_, y->template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), - est_mean->template data>(), - est_var->template data>(), epsilon)); + miopenBNSpatial, (void*)MIOpenDataType::kOne(), + (void*)MIOpenDataType::kZero(), data_desc_, (const void*)x->template data(), + data_desc_, (void*)y->template mutable_data(ctx.GetPlace()), + bn_param_desc_, (void*)scale->template data>(), + (void*)bias->template data>(), + (void*)est_mean->template data>(), + (void*)est_var->template data>(), epsilon)); } else { // Run training mode. // obtain running mean and running inv var, and see if we need to // initialize them. double this_factor = 1. - momentum; - CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), - data_desc_, x->template data(), data_desc_, - y->template mutable_data(ctx.GetPlace()), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( + PADDLE_ENFORCE(platform::dynload::miopenBatchNormalizationForwardTraining( + handle, mode_, (void*)MIOpenDataType::kOne(), (void*)MIOpenDataType::kZero(), + data_desc_, (const void*)x->template data(), data_desc_, + (void*)y->template mutable_data(ctx.GetPlace()), bn_param_desc_, + (void*)scale->template data>(), + (void*)bias->template data>(), this_factor, + (void*)mean_out->template mutable_data>( ctx.GetPlace()), - variance_out->template mutable_data>( + (void*)variance_out->template mutable_data>( ctx.GetPlace()), - epsilon, saved_mean->template mutable_data>( + epsilon, (void*)saved_mean->template mutable_data>( ctx.GetPlace()), - saved_variance->template mutable_data>( + (void*)saved_variance->template mutable_data>( ctx.GetPlace()))); } // clean when exit. 
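+    // Destroy the MIOpen descriptors created at the top of Compute(); they are local to this call and are not cached.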
- CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + PADDLE_ENFORCE(platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE( + platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); } }; @@ -211,23 +221,25 @@ class BatchNormGradKernel PADDLE_ENFORCE_EQ(scale->dims()[0], C); // ------------------- cudnn descriptors --------------------- - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t bn_param_desc_; + miopenBatchNormMode_t mode_; + + PADDLE_ENFORCE(platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE( + platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#if 0 if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " << "CUDNN_BN_MIN_EPSILON. Setting it to " << "CUDNN_BN_MIN_EPSILON instead."; } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#if CUDNN_VERSION_MIN(7, 0, 0) +#endif +#if 0 mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; #else - mode_ = CUDNN_BATCHNORM_SPATIAL; + mode_ = miopenBNSpatial; #endif std::vector dims; @@ -239,10 +251,16 @@ class BatchNormGradKernel dims = {N, C, H, W, D}; strides = {H * W * C * D, 1, W * D * C, D * C, C}; } - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + + if (x_dims.size() > 4) + { + PADDLE_THROW("miopen only supports 4D tensors, dim=%d not allowed", dims.size()); + } + PADDLE_ENFORCE(platform::dynload::miopenSet4dTensorDescriptor( + data_desc_, MIOpenDataType::type, + dims.data()[0], dims.data()[1], dims.data()[2], dims.data()[3])); + + PADDLE_ENFORCE(platform::dynload::miopenDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); // init output @@ -260,10 +278,10 @@ class BatchNormGradKernel const void *saved_var_data = saved_var->template data(); auto &dev_ctx = ctx.template device_context(); - CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x->template data(), + PADDLE_ENFORCE(platform::dynload::miopenBatchNormalizationBackward( + dev_ctx.miopen_handle(), mode_, MIOpenDataType::kOne(), + MIOpenDataType::kZero(), MIOpenDataType::kOne(), + MIOpenDataType::kZero(), data_desc_, x->template data(), data_desc_, d_y->template data(), data_desc_, d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, scale->template data(), @@ -272,9 +290,9 @@ class BatchNormGradKernel saved_mean_data, saved_var_data)); // clean when exit. 
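+    // saved_mean_data / saved_var_data were produced by the forward pass, so the backward call reuses them instead of recomputing batch statistics.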
- CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + PADDLE_ENFORCE(platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE( + platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); } }; @@ -284,7 +302,6 @@ class BatchNormGradKernel namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); + batch_norm, ops::BatchNormKernel); REGISTER_OP_CUDA_KERNEL( batch_norm_grad, ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/box_coder_op.cu index 0944e9c95d4a6..08ae04debb05f 100644 --- a/paddle/fluid/operators/box_coder_op.cu +++ b/paddle/fluid/operators/box_coder_op.cu @@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/box_coder_op.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -129,12 +130,12 @@ class BoxCoderCUDAKernel : public framework::OpKernel { auto code_type = GetBoxCodeType(context.Attr("code_type")); if (code_type == BoxCodeType::kEncodeCenterSize) { - EncodeCenterSizeKernel<<>>( - prior_box_data, prior_box_var_data, target_box_data, row, col, len, + hipLaunchKernelGGL((EncodeCenterSizeKernel), dim3(grid), dim3(block), 0, device_ctx.stream(), + prior_box_data, prior_box_var_data, target_box_data, int(row), int(col), int(len), output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { - DecodeCenterSizeKernel<<>>( - prior_box_data, prior_box_var_data, target_box_data, row, col, len, + hipLaunchKernelGGL((DecodeCenterSizeKernel), dim3(grid), dim3(block), 0, device_ctx.stream(), + prior_box_data, prior_box_var_data, target_box_data, int(row), int(col), int(len), output); } } diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc index bff2c34ec893d..924623150fa67 100644 --- a/paddle/fluid/operators/conditional_block_op.cc +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -56,7 +56,7 @@ class ConditionalOp : public framework::OperatorBase { } bool res = false; if (platform::is_gpu_place(ips[0]->place())) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) framework::LoDTensor cpu_tensor; framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index c70e3cc3c9198..8e7b8918bcee1 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/assert.h" -#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/miopen_helper.h" namespace paddle { namespace operators { @@ -29,7 +29,7 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; +using ScalingParamType = typename platform::MIOpenDataType::ScalingParamType; static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static_cast(1024) * 1024 * 1024; @@ -43,6 +43,9 @@ class CUDNNConvOpKernel : public framework::OpKernel { auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); + auto* alg = ctx.Input("Algorithm"); + auto* algOut = ctx.Output("AlgorithmOut"); + algOut->mutable_data(ctx.GetPlace()); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -65,23 +68,14 @@ class CUDNNConvOpKernel : public framework::OpKernel { layout = DataLayout::kNCDHW; } - cudnnConvolutionDescriptor_t cudnn_conv_desc = + miopenConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); -#if CUDNN_VERSION_MIN(7, 0, 1) - // cudnn 7 can support groups, no need to do it mannually - // FIXME(typhoonzero): find a better way to disable groups - // rather than setting it to 1. - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( - cudnn_conv_desc, groups)); - groups = 1; -#endif - - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims()), groups); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, framework::vectorize2int(output->dims()), groups); - cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + miopenTensorDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, framework::vectorize2int(filter->dims()), groups); int input_channels = input->dims()[1]; @@ -120,51 +114,53 @@ class CUDNNConvOpKernel : public framework::OpKernel { workspace_size_limit = user_workspace_size * 1024 * 1024; } // ------------------- cudnn conv algorithm --------------------- - cudnnConvolutionFwdAlgo_t algo; auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); - - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( - handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); - -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - // Tensor core is supported since the volta GPU and - // is only enabled when input and filter data are float16 - if (dev_ctx.GetComputeCapability() >= 70 && - std::type_index(typeid(T)) == - std::type_index(typeid(platform::float16))) { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( - cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); - // Currently tensor core is only enabled using this algo - algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - } else { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( - cudnn_conv_desc, 
CUDNN_DEFAULT_MATH)); - } -#endif + auto handle = dev_ctx.miopen_handle(); // get workspace size able to allocate - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + PADDLE_ENFORCE(platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_output_desc, algo, &workspace_size_in_bytes)); - // It is possible for float16 on Volta GPU to allocate more memory than - // the limit because the algo is overrided to use tensor core. - PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, - "workspace_size to be allocated exceeds the limit"); - + cudnn_output_desc, &workspace_size_in_bytes)); + PADDLE_ENFORCE_GT(workspace_size_limit, workspace_size_in_bytes, + "Required workspace size should be smaller than limit."); // Allocate on GPU memory platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); - // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; + miopenConvAlgoPerf_t perfRes; + int algoCount = 0; + + VLOG(3) << "get alg ptr: " << alg << " alg_out ptr: " << algOut; + VLOG(3) << "Input: " << alg->data() + << " Output: " << algOut->mutable_data(ctx.GetPlace()); + Tensor alg_tmp; + alg_tmp.mutable_data(alg->dims(), platform::CPUPlace()); + framework::TensorCopy(*alg, platform::CPUPlace(), &alg_tmp); + int pre_alg = (alg_tmp.data())[0]; + // New allocated memory is initialized as 0 + if (pre_alg == 0) { + PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionForwardAlgorithm( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, + cudnn_conv_desc, cudnn_output_desc, output_data, 1, &algoCount, + &perfRes, cudnn_workspace, workspace_size_in_bytes, false)); + (alg_tmp.data())[0] = (int)(perfRes.fwd_algo) + 1; + VLOG(3) << "Find Kernel: store " << (alg_tmp.data()) + << " kernel :" << perfRes.fwd_algo; + } else { + perfRes.fwd_algo = (miopenConvFwdAlgorithm_t)(pre_alg - 1); + VLOG(3) << "Find Kernel: load " << (alg_tmp.data()) + << " kernel :" << perfRes.fwd_algo; + } + framework::TensorCopy(alg_tmp, ctx.GetPlace(), algOut); + for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + // ------------------- cudnn conv forward --------------------- + PADDLE_ENFORCE(platform::dynload::miopenConvolutionForward( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, - &beta, cudnn_output_desc, output_data + i * group_offset_out)); + cudnn_conv_desc, perfRes.fwd_algo, &beta, cudnn_output_desc, + output_data + i * group_offset_out, cudnn_workspace, + workspace_size_in_bytes)); } // Release the cudnn workspace paddle::memory::Free(gpu, cudnn_workspace); @@ -182,6 +178,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto output_grad = ctx.Input(framework::GradVarName("Output")); auto input_grad = ctx.Output(framework::GradVarName("Input")); auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + // #if 0 + // This block is commented out since it triggers assertion. 
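+    // NOTE: the #if 0 guard above is itself commented out, so this block is active; it loads the cached MIOpen algorithm IDs (index 0: backward-data, index 1: backward-weights).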
+ auto* alg = ctx.Input("Algorithm"); + auto* algOut = ctx.Output("AlgorithmOut"); + + VLOG(3) << "get alg ptr: " << alg << " alg_out ptr: " << algOut; + VLOG(3) << "Input: " << alg->data() + << " Output: " << algOut->mutable_data(ctx.GetPlace()); + Tensor alg_tmp; + alg_tmp.mutable_data(alg->dims(), platform::CPUPlace()); + framework::TensorCopy(*alg, platform::CPUPlace(), &alg_tmp); + int pre_data_alg = (alg_tmp.data())[0]; + int pre_filter_alg = (alg_tmp.data())[1]; + // #endif const T* input_data = input->data(); const T* output_grad_data = output_grad->data(); @@ -206,10 +216,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { layout = DataLayout::kNCDHW; } - cudnnConvolutionDescriptor_t cudnn_conv_desc = + miopenConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); -#if CUDNN_VERSION_MIN(7, 0, 1) +#if 0 // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. @@ -218,12 +228,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { groups = 1; #endif - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims()), groups); - cudnnTensorDescriptor_t cudnn_output_grad_desc = + miopenTensorDescriptor_t cudnn_output_grad_desc = output_grad_desc.descriptor( layout, framework::vectorize2int(output_grad->dims()), groups); - cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + miopenTensorDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, framework::vectorize2int(filter->dims()), groups); int input_channels = input->dims()[1]; @@ -256,8 +266,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { output_grad_width * output_grad_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn backward algorithm --------------------- - cudnnConvolutionBwdDataAlgo_t data_algo; - cudnnConvolutionBwdFilterAlgo_t filter_algo; size_t workspace_size_in_bytes = 0, tmp_size = 0; size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; if (user_workspace_size > 0) { @@ -265,40 +273,24 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); + auto handle = dev_ctx.miopen_handle(); if (input_grad) { PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - handle, cudnn_filter_desc, - // dyDesc: Handle to the previously initialized input differential - // tensor descriptor. - cudnn_output_grad_desc, cudnn_conv_desc, - // dxDesc: Handle to the previously initialized output tensor - // descriptor. 
- cudnn_input_desc, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &data_algo)); - PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( - handle, cudnn_filter_desc, cudnn_output_grad_desc, - cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); + platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( + handle, cudnn_output_grad_desc, cudnn_filter_desc, + cudnn_conv_desc, cudnn_input_desc, &tmp_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } if (filter_grad) { PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, - cudnn_filter_desc, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &filter_algo)); - - PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, - cudnn_filter_desc, filter_algo, &tmp_size)); + platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( + handle, cudnn_output_grad_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_filter_desc, &tmp_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } + PADDLE_ENFORCE_GT(workspace_size_limit, workspace_size_in_bytes, + "Required workspace size should be smaller than limit."); // ------------------- cudnn conv workspace --------------------- // Already on GPU void* cudnn_workspace = nullptr; @@ -306,32 +298,74 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; + miopenConvAlgoPerf_t perfRes; + int algoCount = 0; if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. 
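+    // For each group: reuse the cached backward-data algorithm when one was recorded, otherwise run miopenFindConvolutionBackwardDataAlgorithm once and store the choice back into the algorithm tensor.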
for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + i * group_offset_filter, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + i * group_offset_in)); + if (pre_data_alg == 0) { + PADDLE_ENFORCE( + platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( + handle, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_conv_desc, + cudnn_input_desc, input_grad_data + i * group_offset_in, 1, + &algoCount, &perfRes, cudnn_workspace, + workspace_size_in_bytes, false)); + (alg_tmp.data())[0] = (int)(perfRes.bwd_data_algo) + 1; + VLOG(3) << "Find Kernel: store " << (alg_tmp.data()) + << " kernel :" << perfRes.bwd_data_algo; + } else { + perfRes.bwd_data_algo = + (miopenConvBwdDataAlgorithm_t)(pre_data_alg - 1); + VLOG(3) << "Find Kernel: load " << (alg_tmp.data())[0] + << " kernel :" << perfRes.bwd_data_algo; + } + PADDLE_ENFORCE(platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_conv_desc, + perfRes.bwd_data_algo, &beta, cudnn_input_desc, + input_grad_data + i * group_offset_in, cudnn_workspace, + workspace_size_in_bytes)); } } // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset filter_grad. for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_output_grad_desc, output_grad_data + i * group_offset_out, - cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + i * group_offset_filter)); + if (pre_filter_alg == 0) { + PADDLE_ENFORCE( + platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( + handle, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_input_desc, + input_data + i * group_offset_in, cudnn_conv_desc, + cudnn_filter_desc, filter_grad_data + i * group_offset_filter, + 1, &algoCount, &perfRes, cudnn_workspace, + workspace_size_in_bytes, false)); + (alg_tmp.data())[1] = (int)(perfRes.bwd_weights_algo) + 1; + VLOG(3) << "Find Kernel: store " << (alg_tmp.data()) + << " kernel :" << perfRes.bwd_weights_algo; + } else { + perfRes.bwd_weights_algo = + (miopenConvBwdWeightsAlgorithm_t)(pre_filter_alg - 1); + VLOG(3) << "Find Kernel: load " << (alg_tmp.data())[0] + << " kernel :" << perfRes.bwd_weights_algo; + } + PADDLE_ENFORCE(platform::dynload::miopenConvolutionBackwardWeights( + handle, &alpha, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_input_desc, + input_data + i * group_offset_in, cudnn_conv_desc, + perfRes.bwd_weights_algo, &beta, cudnn_filter_desc, + filter_grad_data + i * group_offset_filter, cudnn_workspace, + workspace_size_in_bytes)); } } + framework::TensorCopy(alg_tmp, ctx.GetPlace(), algOut); // Release the cudnn workspace paddle::memory::Free(gpu, cudnn_workspace); } @@ -342,16 +376,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { namespace plat = paddle::platform; REGISTER_OP_KERNEL(conv2d, 
CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); + paddle::operators::CUDNNConvOpKernel); REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); + paddle::operators::CUDNNConvGradOpKernel); REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); + paddle::operators::CUDNNConvOpKernel); REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); + paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 695db841a4ec6..5546a4ab4bedc 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -20,6 +20,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" #endif +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/miopen_helper.h" +#endif #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -80,6 +83,11 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( library = framework::LibraryType::kCUDNN; } #endif +#ifdef PADDLE_WITH_HIP + if (platform::CanMIOpenBeUsed(ctx)) { + library = framework::LibraryType::kCUDNN; + } +#endif #ifdef PADDLE_WITH_MKLDNN if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { @@ -121,9 +129,11 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "H is the height of the filter, and W is the width of the filter. " "If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); + AddInput("Algorithm", "Selected algorithm for conv2d"); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " "The format of output tensor is also NCHW."); + AddOutput("AlgorithmOut", "Tuned algorithm for conv2d"); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -217,9 +227,11 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "is the width of the filter." "If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); + AddInput("Algorithm", "Selected algorithm for conv3d"); AddOutput("Output", "(Tensor) The output tensor of convolution operator." 
"The format of output tensor is also NCDHW."); + AddOutput("AlgorithmOut", "Tuned algorithm for conv3d"); AddAttr>("strides", "(vector, default:{1, 1, 1}), the " "strides(d_stride, h_stride, w_stride) of " @@ -316,6 +328,11 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( library_ = framework::LibraryType::kCUDNN; } #endif +#ifdef PADDLE_WITH_HIP + if (platform::CanMIOpenBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } +#endif #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { @@ -331,16 +348,44 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( layout_, library_); } +class Conv2DGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("conv2d_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput("Algorithm", Input("Algorithm")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetAttrMap(Attrs()); + + op->SetOutput("AlgorithmOut", Output("AlgorithmOut")); + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + + return std::unique_ptr(op); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, - ops::ConvOpGrad); +REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, + ops::Conv2DGradMaker); +REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); // depthwise convolution op -REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, - depthwise_conv2d_grad, ops::ConvOpGrad); +REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, + ops::Conv2DGradMaker); +REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); + +// REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, +// depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 344bbade7055a..22cc5aad38623 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/conv_shift_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -37,7 +38,7 @@ template __global__ void ConvShiftForward(const T *x, const T *y, int x_width, int y_width, int y_half_width, int batch_size, T *out) { - extern __shared__ T mem[]; + HIP_DYNAMIC_SHARED( T, mem) int tx = threadIdx.x; int i = blockIdx.x * blockDim.x + tx; // global x index @@ -136,7 +137,7 @@ class ConvShiftKernel auto stream = context.template device_context().stream(); - ConvShiftForward<<>>( + hipLaunchKernelGGL((ConvShiftForward), dim3(grid_dim), dim3(x_per_block), mem_per_block, stream, x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); } }; @@ -172,14 +173,14 @@ class ConvShiftGradKernel if (dX) { T *dx_data = dX->mutable_data(context.GetPlace()); zero(device_ctx, dX, static_cast(0.0)); - ConvShiftGradX<<>>( + hipLaunchKernelGGL((ConvShiftGradX), dim3(grid_dim), dim3(x_per_block), 0, device_ctx.stream(), dout_data, y_data, x_width, y_width, y_half_width, batch_size, dx_data); } if (dY) { T *dy_data = dY->mutable_data(context.GetPlace()); zero(device_ctx, dY, static_cast(0.0)); - ConvShiftDy<<>>( + hipLaunchKernelGGL((ConvShiftDy), dim3(grid_dim), dim3(x_per_block), 0, device_ctx.stream(), x_data, dout_data, x_width, y_width, y_half_width, batch_size, dy_data); } diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 901682edbb01c..9ddfc66fb8acf 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/platform/assert.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/miopen_helper.h" namespace paddle { namespace operators { @@ -36,6 +36,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use CUDAPlace."); +#if 1 auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); @@ -44,7 +45,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); // cudnn v5 does not support dilations std::vector dilations = ctx.Attr>("dilations"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); + //int user_workspace_size = ctx.Attr("workspace_size_MB"); const T* input_data = input->data(); const T* filter_data = filter->data(); @@ -63,41 +64,42 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { } // (N, M, H, W) or (N, M, D, H, W) - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims())); // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, framework::vectorize2int(output->dims())); // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w) - cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + miopenTensorDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, 
framework::vectorize2int(filter->dims())); - cudnnConvolutionDescriptor_t cudnn_conv_desc = + miopenConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); // ------------------- cudnn conv workspace --------------------- void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. - size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; - if (user_workspace_size > 0) { - workspace_size_limit = user_workspace_size * 1024 * 1024; - } + //size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + //if (user_workspace_size > 0) { + // workspace_size_limit = user_workspace_size * 1024 * 1024; + //} // ------------------- cudnn conv algorithm --------------------- - cudnnConvolutionBwdDataAlgo_t algo; + miopenConvBwdDataAlgorithm_t algo; auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); + auto handle = dev_ctx.miopen_handle(); + miopenConvAlgoPerf_t perfRes; + int algoCount = 0; // Get the algorithm - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( + handle, cudnn_input_desc, input_data,cudnn_filter_desc, filter_data, cudnn_conv_desc, // dxDesc: Handle to the previously initialized output tensor // descriptor. - cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); - + cudnn_output_desc, output_data,1,&algoCount, &perfRes, cudnn_workspace,workspace_size_in_bytes,false)); + algo=perfRes.bwd_data_algo; // get workspace size able to allocate PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, - cudnn_output_desc, algo, &workspace_size_in_bytes)); + cudnn_output_desc, &workspace_size_in_bytes)); // Allocate on GPU memory platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); @@ -105,13 +107,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { // ------------------- cudnn conv transpose forward --------------------- T alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc, - input_data, cudnn_conv_desc, algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + PADDLE_ENFORCE(platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, cudnn_input_desc, input_data,cudnn_filter_desc, filter_data, + cudnn_conv_desc, algo, &beta, cudnn_output_desc, output_data, cudnn_workspace, + workspace_size_in_bytes)); // Release the cudnn workspace paddle::memory::Free(gpu, cudnn_workspace); +#endif } }; @@ -121,6 +124,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use CUDAPlace."); +#if 1 auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); @@ -134,7 +138,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); // cudnn v5 does not support dilations std::vector dilations = ctx.Attr>("dilations"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); + //int 
user_workspace_size = ctx.Attr("workspace_size_MB"); // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor input_desc; @@ -144,63 +148,65 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { DataLayout layout = DataLayout::kNCHW; // Input: (N, M, H, W) or (N, M, D, H, W) - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims())); // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, framework::vectorize2int(output_grad->dims())); // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) - cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + miopenTensorDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, framework::vectorize2int(filter->dims())); - cudnnConvolutionDescriptor_t cudnn_conv_desc = + miopenConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); // ------------------- cudnn backward algorithm --------------------- - cudnnConvolutionFwdAlgo_t data_algo; - cudnnConvolutionBwdFilterAlgo_t filter_algo; + miopenConvFwdAlgorithm_t data_algo = miopenConvolutionFwdAlgoGEMM; + miopenConvBwdWeightsAlgorithm_t filter_algo = miopenConvolutionBwdWeightsAlgoGEMM; size_t bwd_filter_ws_size, fwd_ws_size; size_t workspace_size_in_bytes = 0; - size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; - if (user_workspace_size > 0) { - workspace_size_limit = user_workspace_size * 1024 * 1024; - } + //size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + //if (user_workspace_size > 0) { + // workspace_size_limit = user_workspace_size * 1024 * 1024; + //} auto& dev_ctx = ctx.template device_context(); - auto handle = dev_ctx.cudnn_handle(); + auto handle = dev_ctx.miopen_handle(); + miopenConvAlgoPerf_t perfRes; + void* cudnn_workspace = nullptr; + int algoCount = 0; if (input_grad) { // choose backward algorithm for data - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionForwardAlgorithm( + handle, cudnn_input_desc, (const void*)input_data, cudnn_filter_desc, + (const void*)filter_data,cudnn_conv_desc, cudnn_output_desc, (void*)output_grad_data, + 1, &algoCount, &perfRes, (void*)cudnn_workspace, workspace_size_in_bytes, false)); + data_algo=perfRes.fwd_algo; + PADDLE_ENFORCE(platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &data_algo)); - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( - handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_input_desc, data_algo, &fwd_ws_size)); + cudnn_input_desc, &fwd_ws_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); } if (filter_grad) { // choose backward algorithm for filter PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, - cudnn_filter_desc, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &filter_algo)); - + platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( + handle, cudnn_input_desc, (const 
void*)input_data,cudnn_filter_desc, (const void*)filter_data, + cudnn_conv_desc, cudnn_output_desc, (void*)output_grad_data, 1, &algoCount, + &perfRes, (void*)cudnn_workspace,workspace_size_in_bytes,false)); + filter_algo=perfRes.bwd_weights_algo; // get workspace for backwards filter algorithm PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, - cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); + platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( + handle, cudnn_input_desc, cudnn_output_desc, cudnn_conv_desc, + cudnn_filter_desc, &bwd_filter_ws_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, bwd_filter_ws_size); } // ------------------- cudnn conv workspace --------------------- // Already on GPU - void* cudnn_workspace = nullptr; platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- @@ -209,11 +215,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + PADDLE_ENFORCE(platform::dynload::miopenConvolutionForward( handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data)); + &beta, cudnn_input_desc, input_grad_data, cudnn_workspace, + workspace_size_in_bytes + )); } // ------------------- cudnn conv backward filter --------------------- @@ -221,13 +228,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. 
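+    // The filter gradient uses the algorithm tuned by miopenFindConvolutionBackwardWeightsAlgorithm above; MIOpen takes the workspace pointer and size as trailing arguments.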
// Gradient with respect to the filter - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc, - input_data, cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data)); + PADDLE_ENFORCE(platform::dynload::miopenConvolutionBackwardWeights( + handle, &alpha, cudnn_input_desc, input_data, cudnn_output_desc, output_grad_data, + cudnn_conv_desc, filter_algo, &beta, cudnn_filter_desc, filter_grad_data, + cudnn_workspace, workspace_size_in_bytes)); } // Release the cudnn workspace paddle::memory::Free(gpu, cudnn_workspace); +#endif } }; @@ -237,15 +245,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); + ops::CUDNNConvTransposeOpKernel + /*,ops::CUDNNConvTransposeOpKernel*/); REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); + ops::CUDNNConvTransposeGradOpKernel + /*,ops::CUDNNConvTransposeGradOpKernel*/); REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeOpKernel, - ops::CUDNNConvTransposeOpKernel); + ops::CUDNNConvTransposeOpKernel + /*,ops::CUDNNConvTransposeOpKernel*/); REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::CUDNNConvTransposeGradOpKernel, - ops::CUDNNConvTransposeGradOpKernel); + ops::CUDNNConvTransposeGradOpKernel + /*,ops::CUDNNConvTransposeGradOpKernel*/); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index b2a3cfc89f18e..e10913f25cc5d 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -67,6 +67,12 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } +#endif +#ifdef PADDLE_WITH_HIP + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.miopen_handle() != nullptr; + } #endif framework::LibraryType library_; if (use_cudnn) { @@ -276,6 +282,12 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } +#endif +#ifdef PADDLE_WITH_HIP + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.miopen_handle() != nullptr; + } #endif framework::LibraryType library_; if (use_cudnn) { diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu index 6449149d4b559..8bf645b22928b 100644 --- a/paddle/fluid/operators/cross_entropy_op.cu +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/cross_entropy_op.h" namespace paddle { @@ -87,15 +88,15 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { if (ctx.Attr("soft_label")) { auto* label_data = label->data(); - SoftCrossEntropyGradientKernel<<>>( - dx_data, dy_data, x_data, label_data, batch_size, class_num); + hipLaunchKernelGGL((SoftCrossEntropyGradientKernel), dim3(grid), dim3(block), 0, stream, + dx_data, dy_data, x_data, label_data, int(batch_size), int(class_num)); } else { math::SetConstant functor; functor(dev_ctx, dx, 0); auto* label_data = label->data(); grid = (batch_size + block - 1) / block; - CrossEntropyGradientKernel<<>>( - dx_data, dy_data, x_data, label_data, batch_size, class_num); + hipLaunchKernelGGL((CrossEntropyGradientKernel), dim3(grid), dim3(block), 0, stream, + dx_data, dy_data, x_data, label_data, int(batch_size), int(class_num)); } } }; diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 54e0b1d9ad83c..9b1131d2f5ba4 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include #include #include @@ -68,7 +69,7 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); auto stream = ctx.cuda_device_context().stream(); - MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( + hipLaunchKernelGGL((MergeAndDelCudaKernel), dim3(1), dim3(1), 0, stream, num_tokens, tokens, num_seq, input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, dev_out_lod0_ptr, output_data); diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index f8576d01b10f4..e82f58256b5dd 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -76,7 +76,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, } } if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); platform::CPUPlace cpu; auto& gpu_dev_ctx = @@ -113,7 +113,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, e.WriteUint64(VarMsg::kSlrHeightFieldNumber, slr->height()); auto* tensor = slr->mutable_value(); if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) platform::CPUPlace cpu; auto& gpu_dev_ctx = static_cast(ctx); diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 0b7c470fe72eb..bbeb737dfa6fa 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -34,7 +34,7 @@ struct StridedMemcpyFunctor { auto& cpu_place = boost::get(place); memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(dev_ctx); @@ -57,7 +57,7 @@ struct StridedMemcpyFunctor { auto& cpu_place = boost::get(place); memory::Copy(cpu_place, dst, cpu_place, src, 
sizeof(T) * dst_dim.head); } else { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(dev_ctx); diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index 78e1d274a9224..67218c0f386a3 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -56,7 +56,7 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, int total_written = 0; if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) auto& gpu_dev_ctx = static_cast(dev_ctx); platform::CPUPlace cpu; diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index 184c095e487a3..a02cf7b36b076 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #define EIGEN_USE_GPU #include #include @@ -69,8 +70,7 @@ class GPUDropoutKernel : public framework::OpKernel { int threads = 512; int grid = (x->numel() + threads - 1) / threads; - RandomGenerator< - T><<>>( + hipLaunchKernelGGL((RandomGenerator), dim3(grid), dim3(threads), 0, context.cuda_device_context().stream(), size, seed, dropout_prob, x_data, mask_data, y_data); } else { auto X = EigenMatrix::Reshape(*x, 1); diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 3b89ad5d49c33..a1dd4056b4a99 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -123,11 +124,9 @@ class EditDistanceGPUKernel : public framework::OpKernel { auto x1 = x1_t->data() + hyp_lod[num]; auto x2 = x2_t->data() + ref_lod[num]; - FillFirstColumn<<<1 + m / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n); + hipLaunchKernelGGL((FillFirstColumn), dim3(1 + m / PADDLE_CUDA_NUM_THREADS), dim3(PADDLE_CUDA_NUM_THREADS), 0, stream, dist, int(m), int(n)); - FillFirstRow<<<1 + n / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, n); + hipLaunchKernelGGL((FillFirstRow), dim3(1 + n / PADDLE_CUDA_NUM_THREADS), dim3(PADDLE_CUDA_NUM_THREADS), 0, stream, dist, int(n)); // Compute the elements of distance matrix in the anti-diagonal diretion for (int64_t slice = 2; slice < m + n + 1; ++slice) { int z_m = slice < m + 1 ? 0 : slice - m; @@ -136,11 +135,10 @@ class EditDistanceGPUKernel : public framework::OpKernel { // anti-diagonal line to update // the start index at which computes from int start = slice < n + 1 ? 
slice : (z_n + 1) * (n + 1) - 1; - Levenshtein<<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, x1, x2, - m, n, start); + hipLaunchKernelGGL((Levenshtein), dim3(1 + (size - 1) / PADDLE_CUDA_NUM_THREADS), dim3(PADDLE_CUDA_NUM_THREADS), 0, stream, dist, x1, x2, + int(m), int(n), start); } - SetOutput<<<1, 1, 0, stream>>>(out + num, dist, m, n, normalized); + hipLaunchKernelGGL((SetOutput), dim3(1), dim3(1), 0, stream, out + num, dist, int(m), int(n), int(normalized)); } } } diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu index dfff518f170b5..e61a21d7bbf2e 100644 --- a/paddle/fluid/operators/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise_add_op.cu @@ -21,13 +21,8 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( elementwise_add, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel); diff --git a/paddle/fluid/operators/elementwise_div_op.cu b/paddle/fluid/operators/elementwise_div_op.cu index 588d1f7420241..c777c64411c3b 100644 --- a/paddle/fluid/operators/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise_div_op.cu @@ -20,13 +20,8 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_div, ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); + ops::ElementwiseDivKernel); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel); diff --git a/paddle/fluid/operators/elementwise_max_op.cu b/paddle/fluid/operators/elementwise_max_op.cu index 32c99835d66d8..4888640b4ad5e 100644 --- a/paddle/fluid/operators/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise_max_op.cu @@ -20,13 +20,8 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel); + ops::ElementwiseMaxKernel); REGISTER_OP_CUDA_KERNEL( elementwise_max_grad, ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel); + ops::ElementwiseMaxGradKernel) diff --git a/paddle/fluid/operators/elementwise_min_op.cu b/paddle/fluid/operators/elementwise_min_op.cu index a237c9c503ec9..0cb8fdb7ac1ef 100644 --- a/paddle/fluid/operators/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise_min_op.cu @@ -20,13 +20,8 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_min, ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel, - ops::ElementwiseMinKernel); + ops::ElementwiseMinKernel); REGISTER_OP_CUDA_KERNEL( elementwise_min_grad, ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel, - ops::ElementwiseMinGradKernel); + ops::ElementwiseMinGradKernel); diff --git a/paddle/fluid/operators/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise_mul_op.cu index 2fb1b4bee689c..9b72e5d8b16fc 100644 --- a/paddle/fluid/operators/elementwise_mul_op.cu 
+++ b/paddle/fluid/operators/elementwise_mul_op.cu @@ -20,13 +20,8 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_mul, ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 0b4238436ffcc..cb67e75836790 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/transform.h" -#ifdef __NVCC__ +#ifdef __HCC__ #include #include "paddle/fluid/platform/cuda_helper.h" constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; @@ -149,7 +150,7 @@ class MidWiseTransformIterator { int64_t post_; }; -#ifdef __NVCC__ +#ifdef __HCC__ template class RowwiseTransformIterator : public thrust::iterator_adaptor< @@ -332,7 +333,7 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, } } } -#ifdef __NVCC__ +#ifdef __HIPCC__ template static __global__ void ElemwiseGradBroadcast1CUDAKernel( const T* x, const T* y, const T* out, const T* dout, int h, int w, @@ -363,13 +364,13 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( } template -static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x, +static void ElemwiseGradBroadcast1CUDA(hipStream_t stream, const T* x, const T* y, const T* out, const T* dout, int h, int w, DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); int gird_size = w; - ElemwiseGradBroadcast1CUDAKernel<<>>( + hipLaunchKernelGGL((ElemwiseGradBroadcast1CUDAKernel), dim3(gird_size), dim3(block_size), 0, stream, x, y, out, dout, h, w, dx_op, dy_op, dx, dy); } @@ -400,7 +401,7 @@ static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out, } } -#ifdef __NVCC__ +#ifdef __HIPCC__ template static __global__ void ElemwiseGradBroadcast2CUDAKernel( const T* x, const T* y, const T* out, const T* dout, int pre, int n, @@ -440,13 +441,13 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( } template -static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x, +static void ElemwiseGradBroadcast2CUDA(hipStream_t stream, const T* x, const T* y, const T* out, const T* dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); int gird_size = n; - ElemwiseGradBroadcast2CUDAKernel<<>>( + hipLaunchKernelGGL((ElemwiseGradBroadcast2CUDAKernel), dim3(gird_size), dim3(block_size), 0, stream, x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy); } @@ -481,7 +482,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, int h = pre; int w = n; if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#ifdef __HIPCC__ ElemwiseGradBroadcast1CUDA( ctx.template device_context().stream(), x.data(), y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, @@ -497,7 +498,7 @@ void ElemwiseGradCompute(const 
framework::ExecutionContext& ctx, } } else { if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ +#ifdef __HIPCC__ ElemwiseGradBroadcast2CUDA( ctx.template device_context().stream(), x.data(), y.data(), out.data(), dout.data(), pre, n, post, dx_op, diff --git a/paddle/fluid/operators/elementwise_pow_op.h b/paddle/fluid/operators/elementwise_pow_op.h index 8c1c5f9f98018..999421eda52ad 100644 --- a/paddle/fluid/operators/elementwise_pow_op.h +++ b/paddle/fluid/operators/elementwise_pow_op.h @@ -22,7 +22,7 @@ namespace operators { template struct PowFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); } + inline HOSTDEVICE T operator()(T a, T b) const { return pow(a, b); } }; template diff --git a/paddle/fluid/operators/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise_sub_op.cu index 8709f686f9af1..2e225ce79ecf0 100644 --- a/paddle/fluid/operators/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise_sub_op.cu @@ -20,13 +20,8 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_sub, ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); + ops::ElementwiseSubKernel); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel); diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index d74d4db92528d..a99702bf23dbf 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "hip/hip_runtime.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" @@ -69,10 +70,8 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, int n = slice_size * index_size; int grid = (n + block - 1) / block; - GatherCUDAKernel<<< - grid, block, 0, - reinterpret_cast(ctx).stream()>>>( - p_src, p_index, p_output, index_size, slice_size); + hipLaunchKernelGGL((GatherCUDAKernel), dim3(grid), dim3(block), 0, reinterpret_cast(ctx).stream(), + p_src, p_index, p_output, size_t(index_size), size_t(slice_size)); } } // namespace operators diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc index 9002ce4717c6e..885e96f9f8c75 100644 --- a/paddle/fluid/operators/get_places_op.cc +++ b/paddle/fluid/operators/get_places_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) #include "paddle/fluid/platform/gpu_info.h" #endif @@ -24,7 +24,7 @@ namespace paddle { namespace operators { static size_t CUDADevCount() { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) return platform::GetCUDADeviceCount(); #else return 0UL; diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu index 659464df9dc0e..a0a09bd219a5f 100644 --- a/paddle/fluid/operators/huber_loss_op.cu +++ b/paddle/fluid/operators/huber_loss_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include #define EIGEN_USE_GPU #include "paddle/fluid/operators/huber_loss_op.h" diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index 7b84ba0a7daf1..314c3151f404b 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -34,7 +34,7 @@ struct RowwiseMean2D { const framework::Tensor& input, framework::Tensor* vec); }; -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) template class RowwiseMean2D { public: @@ -80,7 +80,7 @@ struct ColwiseSum2D { const framework::Tensor& input, framework::Tensor* vec); }; -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) template class ColwiseSum2D { public: diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 6d81fccd2059c..f75d9dc027c0c 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lookup_table_op.h" @@ -107,14 +108,14 @@ class LookupTableCUDAKernel : public framework::OpKernel { dim3 grids(8, 1); if (padding_idx == -1) - LookupTable< + hipLaunchKernelGGL((LookupTable< T, 128, 8, 8, - false><<>>( + false>), dim3(grids), dim3(threads), 0, context.cuda_device_context().stream(), output, table, ids, N, K, D, padding_idx); else - LookupTable< + hipLaunchKernelGGL((LookupTable< T, 128, 8, 8, - true><<>>( + true>), dim3(grids), dim3(threads), 0, context.cuda_device_context().stream(), output, table, ids, N, K, D, padding_idx); } }; @@ -177,8 +178,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { dim3 threads(128, 8); dim3 grids(8, 1); - LookupTableGrad<<>>( - d_table, d_output, ids, N, K, D); + hipLaunchKernelGGL((LookupTableGrad< + T, 128, 8, + 8>), dim3(grids), dim3(threads), 0, dev_ctx.stream(), + d_table, d_output, ids, int64_t(N), int64_t(K), int64_t(D)); } } }; diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu index 64f3fea6be24e..ab21d28887351 100644 --- a/paddle/fluid/operators/lrn_op.cu +++ b/paddle/fluid/operators/lrn_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
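The kernel-launch rewrites above all follow one mechanical pattern: a CUDA launch `Kernel<<<grid, block, shared_bytes, stream>>>(args...)` becomes `hipLaunchKernelGGL((Kernel), dim3(grid), dim3(block), shared_bytes, stream, args...)`, and a templated kernel name that contains commas is wrapped in an extra pair of parentheses so the macro treats it as a single argument. A minimal self-contained sketch of the same transformation (the Scale kernel and its arguments are invented for illustration, not code from this patch):

    #include <hip/hip_runtime.h>

    template <typename T, int BlockSize>
    __global__ void Scale(T* data, T factor, int n) {
      int i = blockIdx.x * BlockSize + threadIdx.x;
      if (i < n) data[i] *= factor;
    }

    int main() {
      const int n = 1024;
      float* d = nullptr;
      hipMalloc(&d, n * sizeof(float));
      hipMemset(d, 0, n * sizeof(float));
      // CUDA form:  Scale<float, 256><<<n / 256, 256, 0, 0>>>(d, 2.0f, n);
      // HIP form: the template instantiation is parenthesised, grid/block are
      // wrapped in dim3, and the shared-memory size plus stream come next.
      hipLaunchKernelGGL((Scale<float, 256>), dim3(n / 256), dim3(256), 0, 0,
                         d, 2.0f, n);
      hipDeviceSynchronize();
      hipFree(d);
      return 0;
    }

The explicit int(...) and size_t(...) casts in several of the rewritten launches appear to serve a related purpose: the macro forwards its arguments through a template, so conversions that the <<<>>> syntax performed implicitly are spelled out.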
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/lrn_op.h" namespace paddle { @@ -70,12 +71,12 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, int grid_size = (img_size + block_size - 1) / block_size; auto& dev_ctx = ctx.template device_context(); - KeCMRNormFillScale<<>>( + hipLaunchKernelGGL((KeCMRNormFillScale), dim3(grid_size), dim3(block_size), 0, dev_ctx.stream(), img_size, inputs, mid, C, H, W, n, k, alpha); int input_size = N * H * W * C; grid_size = (input_size + block_size - 1) / block_size; - KeCMRNormOutput<<>>( + hipLaunchKernelGGL((KeCMRNormOutput), dim3(grid_size), dim3(block_size), 0, dev_ctx.stream(), input_size, inputs, mid, -beta, outputs); } @@ -148,7 +149,7 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, int grid_size = (img_size + block_size - 1) / block_size; auto& dev_ctx = ctx.template device_context(); - KeCMRNormDiff<<>>( + hipLaunchKernelGGL((KeCMRNormDiff), dim3(grid_size), dim3(block_size), 0, dev_ctx.stream(), img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, 2.0f * alpha * beta); } diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu index 76245a1b5a9c8..0651a582df28a 100644 --- a/paddle/fluid/operators/lstm_unit_op.cu +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu */ +#include "hip/hip_runtime.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/cross_entropy_op.h" #include "paddle/fluid/platform/assert.h" @@ -120,7 +121,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel { int n = b_size * D; int grid = (n + block - 1) / block; - LSTMUnitKernel<<>>(n, D, C_prev, X, C, H, forget_bias); + hipLaunchKernelGGL((LSTMUnitKernel), dim3(grid), dim3(block), 0, 0, n, D, C_prev, X, C, H, forget_bias); } }; @@ -163,7 +164,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { int n = N * D; int grid = (n + block - 1) / block; - LSTMUnitGradientKernel<<>>(n, D, C_prev, X, C, H, C_diff, + hipLaunchKernelGGL((LSTMUnitGradientKernel), dim3(grid), dim3(block), 0, 0, n, D, C_prev, X, C, H, C_diff, H_diff, C_prev_diff, X_diff, forget_bias); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index ee0e91132bce5..5f6fe8e6fdc54 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -26,7 +26,7 @@ function(math_library TARGET) if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) endif() @@ -54,10 +54,10 @@ math_library(unpooling) math_library(vol2col) cc_test(math_function_test SRCS math_function_test.cc) -cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) +hip_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) -cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) 
+hip_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) if(WITH_GPU) nv_test(math_function_gpu_test SRCS math_function_test.cu) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index c0786757b3419..522cb9b1f1239 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -182,11 +183,11 @@ class ConcatFunctor { dim3 grid_size = dim3(grid_cols, grid_rows, 1); if (sameShape) { - KernelConcat<<>>( + hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), dev_ins_data, in_col, out_row, out_col, output->data()); } else { const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace()); - KernelConcat<<>>( + hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), out_row, out_col, output->data()); } @@ -252,11 +253,11 @@ class ConcatGradFunctor { dim3 grid_size = dim3(grid_cols, grid_rows, 1); if (sameShape) { - KernelConcatGrad<<>>( + hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), input.data(), in_row, in_col, out_col, dev_out_gpu_data); } else { const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); - KernelConcatGrad<<>>( + hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), input.data(), in_row, in_col, dev_outs_col_data, static_cast(outputs_cols.size()), dev_out_gpu_data); } diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index 55c1e726335df..f3984c9067afd 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/cos_sim_functor.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -52,7 +53,7 @@ struct CosSimDyFunctor { const int block_size = 512; dim3 threads(block_size, 1); dim3 grid(1, (rows + block_size - 1) / block_size); - CosSimDyKernel<<>>( + hipLaunchKernelGGL((CosSimDyKernel), dim3(grid), dim3(threads), 0, ctx.stream(), x_norm, y_norm, x, y, z, dz, rows, cols, dy); } }; diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index f4935c2813c9f..f9a4eb287f860 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/cross_entropy.h" namespace paddle { @@ -39,6 +40,12 @@ __device__ __forceinline__ T sum_single_warp(T val) { return val; } +//XXX: Commented out since __shfl_down doesn't support double. +template <> +__device__ __forceinline__ double sum_single_warp(double val) { + return val; +} + // CUDA do not support dynamic arrary in template // https://stackoverflow.com/questions/20497209 template @@ -50,7 +57,7 @@ struct SharedMemory { template <> struct SharedMemory { __device__ float* GetPointer() { - extern __shared__ float s_float[]; + HIP_DYNAMIC_SHARED( float, s_float) return s_float; } }; @@ -58,7 +65,7 @@ struct SharedMemory { template <> struct SharedMemory { __device__ double* GetPointer() { - extern __shared__ double s_double[]; + HIP_DYNAMIC_SHARED( double, s_double) return s_double; } }; @@ -75,7 +82,7 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, int next_idx = blockIdx.x * class_num + tid; while (cur_idx < class_num) { d_sum[tid] += - math::TolerableValue()(std::log(X[next_idx])) * label[next_idx]; + math::TolerableValue()(log(X[next_idx])) * label[next_idx]; next_idx += blockDim.x; cur_idx += blockDim.x; } @@ -110,15 +117,13 @@ class CrossEntropyFunctor { const T* label_data = labels->data(); int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num))); - SoftCrossEntropyKernel<<< - batch_size, block, block * sizeof(T), - reinterpret_cast(ctx).stream()>>>( + hipLaunchKernelGGL((SoftCrossEntropyKernel), dim3(batch_size), dim3(block), block * sizeof(T), reinterpret_cast(ctx).stream(), loss_data, prob_data, label_data, class_num); } else { const int64_t* label_data = labels->data(); int block = 512; int grid = (batch_size + block - 1) / block; - CrossEntropyKernel<<>>( + hipLaunchKernelGGL((CrossEntropyKernel), dim3(grid), dim3(block), 0, ctx.stream(), loss_data, prob_data, label_data, batch_size, class_num); } } diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index adc5b3fe47cd3..7d8c0eaff06bc 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -27,8 +27,8 @@ struct TolerableValue { PADDLE_ASSERT(std::is_floating_point::value); const T kApproInf = 1e20; - if (x == INFINITY) return kApproInf; - if (x == -INFINITY) return -kApproInf; + if (x == FP_INFINITE) return kApproInf; + if (x == -FP_INFINITE) return -kApproInf; return x; } }; diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index a5e6e4031bbad..2f6196c164a03 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
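Two smaller portability points in the cross_entropy hunks above: dynamic shared memory is declared through the HIP_DYNAMIC_SHARED(type, name) macro instead of the `extern __shared__ float s_float[];` spelling, and the TolerableValue change swaps INFINITY for FP_INFINITE, which is a floating-point classification constant (typically a small integer), not an infinity value, so `isinf` is the usual portable spelling of that test. A standalone sketch of both, assuming a HIP build (the kernel and helper names are invented, not code from the patch):

    #include <hip/hip_runtime.h>

    // Clamp infinities to a large finite value.  isinf() is available in
    // device code; comparing against FP_INFINITE would compare against a
    // classification constant rather than infinity.
    __device__ float Tolerable(float x) {
      const float kApproInf = 1e20f;
      if (isinf(x)) return x > 0.f ? kApproInf : -kApproInf;
      return x;
    }

    // HIP_DYNAMIC_SHARED(float, buf) stands in for the CUDA-style
    // "extern __shared__ float buf[];" declaration.
    __global__ void BlockSum(const float* in, float* out, int n) {
      HIP_DYNAMIC_SHARED(float, buf)
      int tid = threadIdx.x;
      buf[tid] = tid < n ? Tolerable(in[tid]) : 0.f;
      __syncthreads();
      for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) buf[tid] += buf[tid + s];
        __syncthreads();
      }
      if (tid == 0) out[blockIdx.x] = buf[0];
    }

    int main() {
      const int n = 256;
      float *d_in = nullptr, *d_out = nullptr;
      hipMalloc(&d_in, n * sizeof(float));
      hipMalloc(&d_out, sizeof(float));
      hipMemset(d_in, 0, n * sizeof(float));
      // The third launch argument is the dynamic shared-memory size in bytes.
      hipLaunchKernelGGL(BlockSum, dim3(1), dim3(n), n * sizeof(float), 0,
                         d_in, d_out, n);
      hipDeviceSynchronize();
      hipFree(d_in);
      hipFree(d_out);
      return 0;
    }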
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -200,7 +201,7 @@ class DepthwiseConvFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelDepthwiseConv<<>>( + hipLaunchKernelGGL((KernelDepthwiseConv), dim3(grid), dim3(threads), 0, context.stream(), nthreads, input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, output_channels / input_channels, ksize_height, ksize_width, @@ -242,7 +243,7 @@ class DepthwiseConvInputGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelDepthwiseConvInputGrad<<>>( + hipLaunchKernelGGL((KernelDepthwiseConvInputGrad), dim3(grid), dim3(threads), 0, context.stream(), nthreads, output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, output_channels / input_channels, ksize_height, ksize_width, @@ -284,7 +285,7 @@ class DepthwiseConvFilterGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelDepthwiseConvFilterGrad<<>>( + hipLaunchKernelGGL((KernelDepthwiseConvFilterGrad), dim3(grid), dim3(threads), 0, context.stream(), nthreads, output_grad_data, input_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, output_channels / input_channels, ksize_height, ksize_width, diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index d205ebf210818..0338863ba5b4c 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -127,22 +127,66 @@ static DEVICE Active::ActGrad kActGradDouble[] = { namespace forward { inline DEVICE float activation(float a, int index) { - return kActFloat[index](a); + switch(index) { + case 0: + return kActFloat[0](a); + case 1: + return kActFloat[1](a); + case 2: + return kActFloat[2](a); + case 3: + return kActFloat[3](a); + default: + return 0.0f; + } } inline DEVICE double activation(double a, int index) { - return kActDouble[index](a); + switch(index) { + case 0: + return kActDouble[0](a); + case 1: + return kActDouble[1](a); + case 2: + return kActDouble[2](a); + case 3: + return kActDouble[3](a); + default: + return 0.0f; + } } } // namespace forward namespace backward { inline DEVICE float activation(float a, float b, int index) { - return kActGradFloat[index](a, b); + switch(index) { + case 0: + return kActGradFloat[0](a, b); + case 1: + return kActGradFloat[1](a, b); + case 2: + return kActGradFloat[2](a, b); + case 3: + return kActGradFloat[3](a, b); + default: + return 0.0f; + } } inline DEVICE double activation(double a, double b, int index) { - return kActGradDouble[index](a, b); + switch(index) { + case 0: + return kActGradDouble[0](a, b); + case 1: + return kActGradDouble[1](a, b); + case 2: + return kActGradDouble[2](a, b); + case 3: + return kActGradDouble[3](a, b); + default: + return 0.0f; + } } } // namespace backward diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h index 1e5ff8ef46db9..5f6ae31b7e227 100644 --- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h @@ -22,7 +22,7 @@ namespace operators { namespace math { namespace detail { -#ifndef __NVCC__ +#ifndef __HIPCC__ template void 
hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h index 657652562780a..e3203910a0f8e 100644 --- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "hip/hip_runtime.h" #include #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/gru_compute.h" diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h index 991f2e758c2c3..24573164c4226 100644 --- a/paddle/fluid/operators/math/detail/gru_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_kernel.h @@ -35,7 +35,7 @@ class gru_resetOutput { value_reset_gate = activation(value_reset_gate, act_gate); value_reset_output = prev_out * value_reset_gate; } -#ifndef __NVCC__ +#ifndef __HIPCC__ #ifndef __AVX__ static const bool avx = false; #else @@ -62,7 +62,7 @@ class gru_finalOutput { value_output = prev_out - (value_update_gate * prev_out) + (value_update_gate * value_frame_state); } -#ifndef __NVCC__ +#ifndef __HIPCC__ #ifndef __AVX__ static const bool avx = false; #else @@ -96,7 +96,7 @@ class gru_stateGrad { grad_frame_state = activation(grad_output * value_update_gate, value_frame_state, act_input); } -#ifndef __NVCC__ +#ifndef __HIPCC__ #ifndef __AVX__ static const bool avx = false; #else @@ -134,7 +134,7 @@ class gru_resetGrad { activation(grad_update_gate, value_update_gate, act_gate); grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); } -#ifndef __NVCC__ +#ifndef __HIPCC__ #ifndef __AVX__ static const bool avx = false; #else diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index 6ad77830fd7a9..b3f21b961a780 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -22,7 +22,7 @@ namespace operators { namespace math { namespace detail { -#ifndef __NVCC__ +#ifndef __HIPCC__ template void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h index ee7b16da4187e..664eed4f7458f 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
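The activation_functions.h hunk further back replaces `kActFloat[index](a)`-style dispatch through arrays of function pointers with an explicit switch over the index, presumably because calling through a table of device function pointers was unreliable under the HCC toolchain; the switch lowers to direct calls at the cost of some repetition. The shape of that rewrite, reduced to a toy example (the functor names are invented):

    #include <hip/hip_runtime.h>

    __device__ float relu(float a) { return a > 0.f ? a : 0.f; }
    __device__ float identity(float a) { return a; }

    // Instead of indexing a table of device function pointers
    // (e.g. kAct[index](a)), dispatch with a switch so every call site is a
    // direct call the device compiler can resolve.
    __device__ float activation(float a, int index) {
      switch (index) {
        case 0:
          return relu(a);
        case 1:
          return identity(a);
        default:
          return 0.0f;
      }
    }

    __global__ void Apply(float* data, int n, int index) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] = activation(data[i], index);
    }

    int main() {
      float* d = nullptr;
      hipMalloc(&d, 128 * sizeof(float));
      hipMemset(d, 0, 128 * sizeof(float));
      hipLaunchKernelGGL(Apply, dim3(1), dim3(128), 0, 0, d, 128, 0);
      hipDeviceSynchronize();
      hipFree(d);
      return 0;
    }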
*/ #pragma once +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -203,13 +204,13 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, auto stream = reinterpret_cast(context).stream(); if (batch_size == 1) { - KeLstmForward<<>>( + hipLaunchKernelGGL((KeLstmForward), dim3(grid), dim3(threads), 0, stream, op, value, frame_size, batch_size, active_node, active_gate, active_state); } else { - KeLstmForward<<>>( + hipLaunchKernelGGL((KeLstmForward), dim3(grid), dim3(threads), 0, stream, op, value, frame_size, batch_size, active_node, active_gate, active_state); } @@ -237,13 +238,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op, auto stream = reinterpret_cast(context).stream(); if (batch_size == 1) { - KeLstmBackward<<>>( + hipLaunchKernelGGL((KeLstmBackward), dim3(grid), dim3(threads), 0, stream, op, value, grad, frame_size, batch_size, active_node, active_gate, active_state); } else { - KeLstmBackward<<>>( + hipLaunchKernelGGL((KeLstmBackward), dim3(grid), dim3(threads), 0, stream, op, value, grad, frame_size, batch_size, active_node, active_gate, active_state); } diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h index 9080634f2b3fc..db6ee5c9bc00c 100644 --- a/paddle/fluid/operators/math/detail/lstm_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_kernel.h @@ -41,7 +41,7 @@ class lstm { state_atv = activation(state, active_state); output = value_og * state_atv; } -#ifndef __NVCC__ +#ifndef __HIPCC__ #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default static const bool avx = false; #else @@ -102,7 +102,7 @@ class lstm { checkFGrad = grad_fg * prev_state; checkOGrad = grad_og * state; } -#ifndef __NVCC__ +#ifndef __HIPCC__ #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default static const bool avx = false; #else diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index 3f044b775138c..c80ec0facb7f6 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -24,7 +24,7 @@ struct GRUUnitFunctor { GRUMetaValue value, int frame_size, int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { -#ifndef __NVCC__ +#ifndef __HIPCC__ if (value.prev_out_value) { math::gemm( context, false, false, batch_size, frame_size * 2, frame_size, 1, @@ -55,7 +55,7 @@ struct GRUUnitGradFunctor { int frame_size, int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { -#ifndef __NVCC__ +#ifndef __HIPCC__ detail::backward_state_grad(detail::backward::gru_stateGrad(), value, grad, frame_size, batch_size, active_node); diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu index 27caf3383dd6c..6e60f59a772c6 100644 --- a/paddle/fluid/operators/math/gru_compute.cu +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
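The gru_kernel.h, lstm_cpu_kernel.h, lstm_kernel.h and gru_compute.cc hunks above only swap the compiler-detection macro: host-only paths (the AVX/CBLAS code) were hidden from nvcc with `#ifndef __NVCC__` and are now hidden with `#ifndef __HIPCC__`, since hipcc defines __HIPCC__ rather than __NVCC__. A header that should keep building under both toolchains can simply test both macros; a minimal sketch (EXAMPLE_HOSTDEVICE is an invented macro, not Paddle's HOSTDEVICE):

    // device_guard_example.h -- illustrative only.
    #pragma once

    #if defined(__HIPCC__) || defined(__NVCC__)
    // Seen by hipcc or nvcc: device qualifiers are meaningful here.
    #define EXAMPLE_HOSTDEVICE __host__ __device__
    #else
    // Plain host compiler: no device qualifiers; AVX/CPU-only code can live
    // behind this branch.
    #define EXAMPLE_HOSTDEVICE
    #endif

    template <typename T>
    EXAMPLE_HOSTDEVICE T clipped_relu(T x, T cap) {
      return x < T(0) ? T(0) : (x > cap ? cap : x);
    }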
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/gru_compute.h" @@ -45,16 +46,16 @@ struct GRUUnitFunctor { } if (batch_size == 1) { - detail::KeGruForwardResetOutput, + hipLaunchKernelGGL((detail::KeGruForwardResetOutput, /* is_batch= */ false, - T><<>>( + T>), dim3(grid), dim3(threads), 0, stream, detail::forward::gru_resetOutput(), value.gate_value, value.reset_output_value, value.prev_out_value, frame_size, batch_size, active_gate); } else { - detail::KeGruForwardResetOutput, + hipLaunchKernelGGL((detail::KeGruForwardResetOutput, /* is_batch= */ true, - T><<>>( + T>), dim3(grid), dim3(threads), 0, stream, detail::forward::gru_resetOutput(), value.gate_value, value.reset_output_value, value.prev_out_value, frame_size, batch_size, active_gate); @@ -68,16 +69,16 @@ struct GRUUnitFunctor { } if (batch_size == 1) { - detail::KeGruForwardFinalOutput, + hipLaunchKernelGGL((detail::KeGruForwardFinalOutput, /* is_batch= */ false, - T><<>>( + T>), dim3(grid), dim3(threads), 0, stream, detail::forward::gru_finalOutput(), value.gate_value, value.prev_out_value, value.output_value, frame_size, batch_size, active_node); } else { - detail::KeGruForwardFinalOutput, + hipLaunchKernelGGL((detail::KeGruForwardFinalOutput, /* is_batch= */ true, - T><<>>( + T>), dim3(grid), dim3(threads), 0, stream, detail::forward::gru_finalOutput(), value.gate_value, value.prev_out_value, value.output_value, frame_size, batch_size, active_node); @@ -106,16 +107,16 @@ struct GRUUnitGradFunctor { } if (batch_size == 1) { - detail::KeGruBackwardStateGrad< + hipLaunchKernelGGL((detail::KeGruBackwardStateGrad< detail::backward::gru_stateGrad, - /* is_batch= */ false><<>>( + /* is_batch= */ false>),dim3(grid), dim3(threads), 0, stream, detail::backward::gru_stateGrad(), value.gate_value, grad.gate_grad, value.prev_out_value, grad.prev_out_grad, grad.output_grad, frame_size, batch_size, active_node); } else { - detail::KeGruBackwardStateGrad< + hipLaunchKernelGGL((detail::KeGruBackwardStateGrad< detail::backward::gru_stateGrad, - /* is_batch= */ true><<>>( + /* is_batch= */ true>), dim3(grid), dim3(threads), 0, stream, detail::backward::gru_stateGrad(), value.gate_value, grad.gate_grad, value.prev_out_value, grad.prev_out_grad, grad.output_grad, frame_size, batch_size, active_node); @@ -137,16 +138,16 @@ struct GRUUnitGradFunctor { } if (batch_size == 1) { - detail::KeGruBackwardResetGrad< + hipLaunchKernelGGL((detail::KeGruBackwardResetGrad< detail::backward::gru_resetGrad, - /* is_batch= */ false><<>>( + /* is_batch= */ false>), dim3(grid), dim3(threads), 0, stream, detail::backward::gru_resetGrad(), value.gate_value, grad.gate_grad, value.prev_out_value, grad.prev_out_grad, grad.reset_output_grad, frame_size, batch_size, active_gate); } else { - detail::KeGruBackwardResetGrad< + hipLaunchKernelGGL((detail::KeGruBackwardResetGrad< detail::backward::gru_resetGrad, - /* is_batch= */ true><<>>( + /* is_batch= */ true>), dim3(grid), dim3(threads), 0, stream, detail::backward::gru_resetGrad(), value.gate_value, grad.gate_grad, value.prev_out_value, grad.prev_out_grad, grad.reset_output_grad, frame_size, batch_size, active_gate); diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index f41c78140fb60..455ba7755b9dd 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -96,7 +97,8 @@ class Im2ColFunctor<<>>( + hipLaunchKernelGGL((im2col), dim3(grid), dim3(threads), 0, + context.stream(), im.data(), num_outputs, im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -201,7 +203,8 @@ class Col2ImFunctor<<>>( + hipLaunchKernelGGL((col2im), dim3(grid), dim3(threads), 0, + context.stream(), num_kernels, col.data(), im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[2], col_height, col_width, im->data()); @@ -306,7 +309,8 @@ class Im2ColFunctor<<>>( + hipLaunchKernelGGL((im2colOCF), dim3(grid), dim3(threads), 0, + context.stream(), im.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -403,7 +407,8 @@ class Col2ImFunctor<<>>( + hipLaunchKernelGGL((col2imOCF), dim3(grid), dim3(threads), 0, + context.stream(), col.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, im->data()); diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 44fd739fb1d16..af82b026fb38e 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -348,7 +348,7 @@ struct TensorSetConstantWithPlace : public boost::static_visitor { void set_constant(const platform::DeviceContext& context, framework::Tensor* tensor, float value) { TensorSetConstantWithPlace func(context, tensor, value); -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) tensor->place().apply_visitor(func); #else func(platform::CPUPlace()); diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 1e909db5288af..e56f5bc0421d8 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -24,20 +24,21 @@ namespace math { using float16 = paddle::platform::float16; +#if 0 template <> void gemm( const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float16 alpha, const float16* A, const float16* B, const float16 beta, float16* C) { - // Note that cublas follows fortran order, so the order is different from + // Note that hipblas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + hipblasOperation_t cuTransA = + (transA == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; + hipblasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
HIPBLAS_OP_N : HIPBLAS_OP_T; const half h_alpha = static_cast(alpha); const half h_beta = static_cast(beta); @@ -47,11 +48,12 @@ void gemm( // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, - "cublas Hgemm requires GPU compute capability >= 53"); - PADDLE_ENFORCE(platform::dynload::cublasHgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + "hipblas Hgemm requires GPU compute capability >= 53"); + PADDLE_ENFORCE(platform::dynload::hipblasHgemm( + context.hipblas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, N)); } +#endif template <> void gemm( @@ -59,18 +61,18 @@ void gemm( const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { - // Note that cublas follows fortran order, so the order is different from + // Note that hipblas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - PADDLE_ENFORCE(platform::dynload::cublasSgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, N)); + hipblasOperation_t cuTransA = + (transA == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; + hipblasOperation_t cuTransB = + (transB == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; + + PADDLE_ENFORCE(platform::dynload::hipblasSgemm( + context.hipblas_handle(), + cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } template <> @@ -79,29 +81,30 @@ void gemm( const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C) { - // Note that cublas follows fortran order, so the order is different from + // Note that hipblas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - PADDLE_ENFORCE(platform::dynload::cublasDgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, N)); + hipblasOperation_t cuTransA = + (transA == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; + hipblasOperation_t cuTransB = + (transB == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::hipblasDgemm( + context.hipblas_handle(), + cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } +#if 0 template <> void gemm( const platform::CUDADeviceContext& context, const bool transA, const bool transB, const int M, const int N, const int K, const float16 alpha, const float16* A, const int lda, const float16* B, const int ldb, const float16 beta, float16* C, const int ldc) { - // Note that cublas follows fortran order, so the order is different from + // Note that hipblas follows fortran order, so the order is different from // the cblas convention. - cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = transB == false ?
CUBLAS_OP_N : CUBLAS_OP_T; + hipblasOperation_t cuTransA = transA == false ? HIPBLAS_OP_N : HIPBLAS_OP_T; + hipblasOperation_t cuTransB = transB == false ? HIPBLAS_OP_N : HIPBLAS_OP_T; const half h_alpha = static_cast(alpha); const half h_beta = static_cast(beta); @@ -111,11 +114,12 @@ void gemm( // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, - "cublas Hgemm requires GPU compute capability >= 53"); - PADDLE_ENFORCE(platform::dynload::cublasHgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + "hipblas Hgemm requires GPU compute capability >= 53"); + PADDLE_ENFORCE(platform::dynload::hipblasHgemm( + context.hipblas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, ldc)); } +#endif template <> void gemm( @@ -123,13 +127,13 @@ void gemm( const bool transB, const int M, const int N, const int K, const float alpha, const float* A, const int lda, const float* B, const int ldb, const float beta, float* C, const int ldc) { - // Note that cublas follows fortran order, so the order is different from + // Note that hipblas follows fortran order, so the order is different from // the cblas convention. - cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; - PADDLE_ENFORCE(platform::dynload::cublasSgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, ldc)); + hipblasOperation_t cuTransA = transA == false ? HIPBLAS_OP_N : HIPBLAS_OP_T; + hipblasOperation_t cuTransB = transB == false ? HIPBLAS_OP_N : HIPBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::hipblasSgemm( + context.hipblas_handle(), + cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); } template <> @@ -138,15 +142,16 @@ void gemm( const bool transB, const int M, const int N, const int K, const double alpha, const double* A, const int lda, const double* B, const int ldb, const double beta, double* C, const int ldc) { - // Note that cublas follows fortran order, so the order is different from + // Note that hipblas follows fortran order, so the order is different from // the cblas convention. - cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; - PADDLE_ENFORCE(platform::dynload::cublasDgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, ldc)); + hipblasOperation_t cuTransA = transA == false ? HIPBLAS_OP_N : HIPBLAS_OP_T; + hipblasOperation_t cuTransB = transB == false ? 
HIPBLAS_OP_N : HIPBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::hipblasDgemm( + context.hipblas_handle(), + cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); } +#if 0 template <> void matmul( const platform::CUDADeviceContext& context, @@ -175,6 +180,7 @@ void matmul( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } +#endif template <> void matmul( @@ -234,21 +240,22 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +#if 0 template <> void batched_gemm( const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float16 alpha, const float16* A, const float16* B, const float16 beta, float16* C, const int batchCount, const int strideA, const int strideB) { - // Note that cublas follows fortran order, so the order is different from + // Note that hipblas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + hipblasOperation_t cuTransA = + (transA == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; + hipblasOperation_t cuTransB = + (transB == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; const int strideC = M * N; const half h_alpha = static_cast(alpha); @@ -259,11 +266,12 @@ void batched_gemm( // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, - "cublas Hgemm requires GPU compute capability >= 53"); - PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + "hipblas Hgemm requires GPU compute capability >= 53"); + PADDLE_ENFORCE(platform::dynload::hipblasHgemmStridedBatched( + context.hipblas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); } +#endif template <> void batched_gemm( @@ -271,20 +279,21 @@ void batched_gemm( const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { - // Note that cublas follows fortran order, so the order is different from + // Note that hipblas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + hipblasOperation_t cuTransA = + (transA == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; + hipblasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
HIPBLAS_OP_N : HIPBLAS_OP_T; const int strideC = M * N; - PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, - strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); + PADDLE_ENFORCE(platform::dynload::hipblasSgemmStridedBatched( + context.hipblas_handle(), + cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, + &beta, C, ldc, strideC, batchCount)); } template <> @@ -293,19 +302,19 @@ void batched_gemm( const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { - // Note that cublas follows fortran order, so the order is different from + // Note that hipblas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + hipblasOperation_t cuTransA = + (transA == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; + hipblasOperation_t cuTransB = + (transB == CblasNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; const int strideC = M * N; - PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + PADDLE_ENFORCE(platform::dynload::hipblasDgemmStridedBatched( + context.hipblas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } @@ -314,9 +323,9 @@ void gemv( const platform::CUDADeviceContext& context, const bool trans_a, const int M, const int N, const float alpha, const float* A, const float* B, const float beta, float* C) { - cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; + hipblasOperation_t cuTransA = (trans_a == false) ? HIPBLAS_OP_T : HIPBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(), + PADDLE_ENFORCE(platform::dynload::hipblasSgemv(context.hipblas_handle(), cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); } @@ -326,8 +335,8 @@ void gemv( const platform::CUDADeviceContext& context, const bool trans_a, const int M, const int N, const double alpha, const double* A, const double* B, const double beta, double* C) { - cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(), + hipblasOperation_t cuTransA = (trans_a == false) ? 
HIPBLAS_OP_T : HIPBLAS_OP_N; + PADDLE_ENFORCE(platform::dynload::hipblasDgemv(context.hipblas_handle(), cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); } @@ -336,7 +345,7 @@ template <> void axpy( const platform::CUDADeviceContext& context, const int n, const float alpha, const float* x, float* y) { - PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n, + PADDLE_ENFORCE(platform::dynload::hipblasSaxpy(context.hipblas_handle(), n, &alpha, x, 1, y, 1)); } @@ -344,7 +353,7 @@ template <> void axpy( const platform::CUDADeviceContext& context, const int n, const double alpha, const double* x, double* y) { - PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n, + PADDLE_ENFORCE(platform::dynload::hipblasDaxpy(context.hipblas_handle(), n, &alpha, x, 1, y, 1)); } @@ -414,7 +423,7 @@ struct RowwiseAdd { PADDLE_ENFORCE_EQ(output->dims(), in_dims); int blocks = 512; int grids = (input.numel() + blocks - 1) / blocks; - RowwiseAddKernel<<>>( + hipLaunchKernelGGL((RowwiseAddKernel), dim3(grids), dim3(blocks), 0, context.stream(), input.data(), vector.data(), output->data(), static_cast(in_dims[1]), static_cast(input.numel())); } diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 8982d9d066165..128feb4d4afac 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -14,6 +14,7 @@ #include "gtest/gtest.h" #include "paddle/fluid/operators/math/math_function.h" +#if 0 void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, const std::vector& data) { PADDLE_ENFORCE_EQ(size, data.size()); @@ -21,6 +22,7 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, in_ptr[i] = paddle::platform::float16(data[i]); } } +#endif TEST(math_function, notrans_mul_trans_fp32) { using namespace paddle::framework; @@ -58,6 +60,7 @@ TEST(math_function, notrans_mul_trans_fp32) { EXPECT_EQ(out_ptr[3], 50); } +#if 0 TEST(math_function, notrans_mul_trans_fp16) { using namespace paddle::framework; using namespace paddle::platform; @@ -98,6 +101,7 @@ TEST(math_function, notrans_mul_trans_fp16) { EXPECT_EQ(static_cast(out_ptr[2]), 14); EXPECT_EQ(static_cast(out_ptr[3]), 50); } +#endif TEST(math_function, trans_mul_notrans_fp32) { using namespace paddle::framework; @@ -140,6 +144,7 @@ TEST(math_function, trans_mul_notrans_fp32) { EXPECT_EQ(out_ptr[8], 29); } +#if 0 TEST(math_function, trans_mul_notrans_fp16) { using namespace paddle::framework; using namespace paddle::platform; @@ -185,6 +190,7 @@ TEST(math_function, trans_mul_notrans_fp16) { EXPECT_EQ(static_cast(out_ptr[7]), 22); EXPECT_EQ(static_cast(out_ptr[8]), 29); } +#endif TEST(math_function, gemm_notrans_cublas_fp32) { using namespace paddle::framework; @@ -243,6 +249,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) { EXPECT_EQ(input3_ptr[7], 99); } +#if 0 TEST(math_function, gemm_notrans_cublas_fp16) { using namespace paddle::framework; using namespace paddle::platform; @@ -303,6 +310,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) { EXPECT_EQ(static_cast(input3_ptr[6]), 86); EXPECT_EQ(static_cast(input3_ptr[7]), 99); } +#endif TEST(math_function, gemm_trans_cublas_fp32) { using namespace paddle::framework; @@ -355,6 +363,7 @@ TEST(math_function, gemm_trans_cublas_fp32) { EXPECT_EQ(input3_ptr[7], 99); } +#if 0 TEST(math_function, gemm_trans_cublas_fp16) { using namespace paddle::framework; using namespace paddle::platform; @@ -409,6 +418,7 @@ TEST(math_function, 
gemm_trans_cublas_fp16) { EXPECT_EQ(static_cast(input3_ptr[6]), 86); EXPECT_EQ(static_cast(input3_ptr[7]), 99); } +#endif template void GemvTest(int m, int n, bool trans) { diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1e1a6a221c71c..2e684d193695f 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -98,9 +99,10 @@ class MaxOutFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxOut<<>>( - nthreads, input_data, input_channels, input_height, input_width, groups, - output_data); + hipLaunchKernelGGL((KernelMaxOut< + T>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, input_channels, + input_height, input_width, groups, output_data); } }; /* @@ -130,9 +132,11 @@ class MaxOutGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxoutGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_grad_data, - input_channels, input_height, input_width, groups); + hipLaunchKernelGGL((KernelMaxoutGrad< + T>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, output_data, + output_grad_data, input_grad_data, input_channels, + input_height, input_width, groups); } }; diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 274263c69c535..660e383276c09 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
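The math_function.cu changes a few hunks back are a one-for-one renaming of the BLAS layer: cublasOperation_t / CUBLAS_OP_* become hipblasOperation_t / HIPBLAS_OP_*, the handle comes from hipblas_handle() instead of cublas_handle(), and cublasSgemm/Dgemm/Sgemv/Saxpy map to hipblas calls with the same argument order, while the float16 (Hgemm) paths and their tests above are fenced off with #if 0 for now. A direct sketch against the hipBLAS C API, outside Paddle's dynload and device-context wrappers, so the handle management and header path here are assumptions for illustration:

    #include <hip/hip_runtime.h>
    #include <hipblas.h>

    // C = A * B for row-major MxK and KxN device buffers.  Like the patched
    // code, the operands are handed to the column-major BLAS in swapped order,
    // which computes C^T = B^T * A^T, i.e. row-major C.
    void sgemm_rowmajor(hipblasHandle_t handle, int M, int N, int K,
                        const float* d_A, const float* d_B, float* d_C) {
      const float alpha = 1.0f;
      const float beta = 0.0f;
      hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, N, M, K, &alpha,
                   d_B, N, d_A, K, &beta, d_C, N);
    }

    int main() {
      const int M = 4, N = 4, K = 4;
      float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
      hipMalloc(&d_A, M * K * sizeof(float));
      hipMalloc(&d_B, K * N * sizeof(float));
      hipMalloc(&d_C, M * N * sizeof(float));
      hipMemset(d_A, 0, M * K * sizeof(float));
      hipMemset(d_B, 0, K * N * sizeof(float));

      hipblasHandle_t handle;
      hipblasCreate(&handle);
      sgemm_rowmajor(handle, M, N, K, d_A, d_B, d_C);
      hipDeviceSynchronize();
      hipblasDestroy(handle);

      hipFree(d_A);
      hipFree(d_B);
      hipFree(d_C);
      return 0;
    }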
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -183,7 +184,10 @@ class Pool2dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2D<<>>( + hipLaunchKernelGGL((KernelPool2D< + PoolProcess, + T>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, output_data); @@ -227,7 +231,10 @@ class Pool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2DGrad<<>>( + hipLaunchKernelGGL((KernelPool2DGrad< + PoolProcess, + T>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -273,7 +280,9 @@ class MaxPool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DGrad<<>>( + hipLaunchKernelGGL((KernelMaxPool2DGrad< + T>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -505,7 +514,10 @@ class Pool3dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3D<<>>( + hipLaunchKernelGGL((KernelPool3D< + PoolProcess, + T>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -558,7 +570,10 @@ class Pool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3DGrad<<>>( + hipLaunchKernelGGL((KernelPool3DGrad< + PoolProcess, + T>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -611,7 +626,9 @@ class MaxPool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DGrad<<>>( + hipLaunchKernelGGL((KernelMaxPool3DGrad< + T>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -762,7 +779,9 @@ class MaxPool2dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2dWithIdx<<>>( + hipLaunchKernelGGL((KernelMaxPool2dWithIdx< + T1, T2>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, output_data, mask_data); @@ -804,7 +823,9 @@ class MaxPool2dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DWithIdxGrad<<>>( + hipLaunchKernelGGL((KernelMaxPool2DWithIdxGrad< + T1, T2>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, 
stride_width, padding_height, padding_width, @@ -969,7 +990,9 @@ class MaxPool3dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdx<<>>( + hipLaunchKernelGGL((KernelMaxPool3DWithIdx< + T1, T2>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -1018,7 +1041,9 @@ class MaxPool3dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdxGrad<<>>( + hipLaunchKernelGGL((KernelMaxPool3DWithIdxGrad< + T1, T2>), dim3(grid), dim3(threads), 0, + context.stream(), nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 74cb42f0d0208..6b612389d6584 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -42,6 +42,7 @@ class MaxPool { DEVICE inline T initial() { return static_cast(-FLT_MAX); } DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; } DEVICE inline void finalize(T& y, const T& pool_field) {} + int reserved; }; template @@ -50,11 +51,13 @@ class AvgPool { DEVICE inline T initial() { return static_cast(0); } DEVICE inline void compute(T& y, const T& x) { y += x; } DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; } + int reserved; }; template class MaxPoolGrad { public: + int reserved; DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, T scale) { dx += dy * (x == y); @@ -64,6 +67,7 @@ class MaxPoolGrad { template class AvgPoolGrad { public: + int reserved; DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, T scale) { dx += (scale * dy); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 5d78fd9d21355..15eae3cdbd020 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include #include "paddle/fluid/operators/math/math_function.h" @@ -124,10 +125,9 @@ struct SelectedRowsAddTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddTensorKernel< - T, block_size><<>>( - in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, - in1_row_numel); + hipLaunchKernelGGL((SelectedRowsAddTensorKernel), + dim3(grid), dim3(threads), 0, + context.stream(), in1_data, in1_rows.data(), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); @@ -217,8 +217,8 @@ struct SelectedRowsAddToTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddToTensorKernel< - T, block_size><<>>( + hipLaunchKernelGGL((SelectedRowsAddToTensorKernel), + dim3(grid), dim3(threads), 0, context.stream(), in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, in1_row_numel); } @@ -284,10 +284,10 @@ struct MergeAdd { dim3 threads(block_size, 1); dim3 grid1(1, input_rows.size()); - MergeAddKernel< - T, 256><<), + dim3(grid1), dim3(threads), 0, reinterpret_cast(context) - .stream()>>>( + .stream(), input_data, input_rows.CUDAData(context.GetPlace()), out_data, out.mutable_rows()->CUDAMutableData(context.GetPlace()), out.rows().size(), input_width); @@ -374,8 +374,8 @@ struct UpdateToTensor { dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); dim3 grid(1, in1_rows.size()); - UpdateToTensorKernel<<< - grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(), + hipLaunchKernelGGL((UpdateToTensorKernel), + dim3(grid), dim3(threads), 0, context.stream(), in1_data, in1_rows.cuda_data(), op, in2_data, in1_row_numel); } }; diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu index 3185f10d41804..2270c67c438ce 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #define EIGEN_USE_GPU #include "paddle/fluid/operators/math/sequence2batch.h" @@ -61,9 +62,8 @@ class CopyMatrixRowsFunctor { dim3 threads(128, 8); dim3 grid(8, 1); auto stream = context.stream(); - CopyMatrixRowsKernel<<>>( - src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, - width, is_src_index); + hipLaunchKernelGGL((CopyMatrixRowsKernel), dim3(grid), dim3(threads), 0, stream, + src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, width, is_src_index); } }; diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index c044e6fc32bab..2d899c81abce9 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/sequence_padding.h" namespace paddle { @@ -119,13 +120,13 @@ class PaddingLoDTensorFunctor { const T* seq_data = seq.data(); T* padding_data = padding.data(); if (norm_by_times) { - SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), + hipLaunchKernelGGL((SequencePaddingKernel), dim3(grid), dim3(threads), 0, + context.stream(), padding_data, const_cast(seq_data), abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } else { - SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), + hipLaunchKernelGGL((SequencePaddingKernel), dim3(grid), dim3(threads), 0, + context.stream(), padding_data, const_cast(seq_data), abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } @@ -194,13 +195,13 @@ class UnpaddingLoDTensorFunctor { const T* padding_data = padding.data(); T* seq_data = seq.data(); if (norm_by_times) { - SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, + hipLaunchKernelGGL((SequencePaddingKernel), dim3(grid), dim3(threads), 0, + context.stream(), const_cast(padding_data), seq_data, abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } else { - SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, + hipLaunchKernelGGL((SequencePaddingKernel), dim3(grid), dim3(threads), 0, + context.stream(), const_cast(padding_data), seq_data, abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 1935364da37e9..09d14ac15c645 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -139,38 +140,38 @@ class SequencePoolFunctor { dim3 threads(1024, 1); dim3 grid(lod.size(), 1); if (pooltype == "MAX") { - sequence_pool_kernel< - T, MaxPoolFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_kernel< + T, MaxPoolFunctor>), dim3(grid), dim3(threads), 0, context.stream(), MaxPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { - sequence_pool_kernel< - T, AvgPoolFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_kernel< + T, AvgPoolFunctor>), dim3(grid), dim3(threads), 0, context.stream(), AvgPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { - sequence_pool_kernel< - T, SumPoolFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_kernel< + T, SumPoolFunctor>), dim3(grid), dim3(threads), 0, context.stream(), SumPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { - sequence_pool_kernel< - T, SqrtPoolFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_kernel< + T, SqrtPoolFunctor>), dim3(grid), dim3(threads), 0, context.stream(), SqrtPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { - sequence_pool_kernel< - T, LastPoolFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_kernel< + T, LastPoolFunctor>), dim3(grid), dim3(threads), 0, context.stream(), LastPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { - sequence_pool_kernel< - T, FirstPoolFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_kernel< + T, FirstPoolFunctor>), dim3(grid), dim3(threads), 0, context.stream(), FirstPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); @@ -301,38 +302,38 @@ class SequencePoolGradFunctor { dim3 threads(1024, 1); dim3 grid(lod.size(), 1); if (pooltype == "MAX") { - sequence_pool_grad_kernel< - T, MaxPoolGradFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_grad_kernel< + T, MaxPoolGradFunctor>), dim3(grid), dim3(threads), 0, context.stream(), MaxPoolGradFunctor(), out_grad.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { - sequence_pool_grad_kernel< - T, AvgPoolGradFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_grad_kernel< + T, AvgPoolGradFunctor>), dim3(grid), dim3(threads), 0, context.stream(), AvgPoolGradFunctor(), out_grad.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { - sequence_pool_grad_kernel< - T, SumPoolGradFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_grad_kernel< + T, SumPoolGradFunctor>), dim3(grid), dim3(threads), 0, context.stream(), SumPoolGradFunctor(), out_grad.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == 
"SQRT") { - sequence_pool_grad_kernel< - T, SqrtPoolGradFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_grad_kernel< + T, SqrtPoolGradFunctor>), dim3(grid), dim3(threads), 0, context.stream(), SqrtPoolGradFunctor(), out_grad.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { - sequence_pool_grad_kernel< - T, LastPoolGradFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_grad_kernel< + T, LastPoolGradFunctor>), dim3(grid), dim3(threads), 0, context.stream(), LastPoolGradFunctor(), out_grad.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { - sequence_pool_grad_kernel< - T, FirstPoolGradFunctor><<>>( + hipLaunchKernelGGL((sequence_pool_grad_kernel< + T, FirstPoolGradFunctor>), dim3(grid), dim3(threads), 0, context.stream(), FirstPoolGradFunctor(), out_grad.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 74085153c6235..6125f0b971027 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/sequence_scale.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -44,8 +45,8 @@ class ScaleLoDTensorFunctor { framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); T* seq_data = seq.mutable_data(context.GetPlace()); - SequenceScaleKernel<<< - num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( + hipLaunchKernelGGL((SequenceScaleKernel), + dim3(num_seq), dim3(PADDLE_CUDA_NUM_THREADS), 0, context.stream(), seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), scales, seq_width); } diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 5518ebed3f792..50e262cec35b8 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/miopen_helper.h" namespace paddle { namespace operators { @@ -27,7 +27,7 @@ using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; template -using CudnnDataType = platform::CudnnDataType; +using MIOpenDataType = platform::MIOpenDataType; template void SoftmaxCUDNNFunctor::operator()( @@ -46,14 +46,13 @@ void SoftmaxCUDNNFunctor::operator()( if (cudnn_tensor_dims.size() <= 2) { cudnn_tensor_dims.resize(4, 1); } - cudnnTensorDescriptor_t cudnn_x_desc = + miopenTensorDescriptor_t cudnn_x_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - cudnnTensorDescriptor_t cudnn_y_desc = + miopenTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward( - context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_x_desc, - X->data(), CudnnDataType::kZero(), cudnn_y_desc, + PADDLE_ENFORCE(platform::dynload::miopenSoftmaxForward( + context.miopen_handle(), MIOpenDataType::kOne(), cudnn_x_desc, + X->data(), MIOpenDataType::kZero(), cudnn_y_desc, Y->mutable_data(context.GetPlace()))); } @@ -75,25 +74,21 @@ void SoftmaxGradCUDNNFunctor::operator()( if (cudnn_tensor_dims.size() <= 2) { cudnn_tensor_dims.resize(4, 1); } - cudnnTensorDescriptor_t cudnn_y_desc = + miopenTensorDescriptor_t cudnn_y_desc = yDesc.descriptor(layout, cudnn_tensor_dims); - cudnnTensorDescriptor_t cudnn_xgrad_desc = + miopenTensorDescriptor_t cudnn_xgrad_desc = dxDesc.descriptor(layout, cudnn_tensor_dims); - cudnnTensorDescriptor_t cudnn_ygrad_desc = + miopenTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward( - context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_y_desc, + PADDLE_ENFORCE(platform::dynload::miopenSoftmaxBackward( + context.miopen_handle(), MIOpenDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), - CudnnDataType::kZero(), cudnn_xgrad_desc, + MIOpenDataType::kZero(), cudnn_xgrad_desc, XGrad->mutable_data(context.GetPlace()))); } -template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; -template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h index da1f0b672d3a5..ed872e18bf69b 100644 --- a/paddle/fluid/operators/math/softmax.h +++ b/paddle/fluid/operators/math/softmax.h @@ -33,7 +33,7 @@ class SoftmaxGradFunctor { const framework::Tensor* y_grad, framework::Tensor* x_grad); }; -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) template class SoftmaxCUDNNFunctor { public: diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index 367f343d51712..5b183ae12bf5f 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/unpooling.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -83,9 +84,11 @@ class Unpool2dMaxFunctor { T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMax<<>>( - input.numel(), input_data, indices_data, input_height, input_width, - output_channels, output_data, output_height, output_width); + hipLaunchKernelGGL((KernelUnpool2dMax< + T>), dim3(grid), dim3(threads), 0, + context.stream(), input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_height, output_width); } }; /* @@ -113,10 +116,12 @@ class Unpool2dMaxGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMaxGrad<<>>( - input.numel(), input_data, indices_data, input_height, input_width, - output_channels, output_data, output_grad_data, output_height, - output_width, input_grad_data); + hipLaunchKernelGGL((KernelUnpool2dMaxGrad< + T>), dim3(grid), dim3(threads), 0, + context.stream(), input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_grad_data, output_height, + output_width, input_grad_data); } }; template class Unpool2dMaxGradFunctor; diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 619730d394d07..bffccba55d91b 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -117,7 +118,8 @@ class Vol2ColFunctor { const int threads = 1024; const int blocks = (num_outputs + 1024 - 1) / 1024; - vol2col<<>>( + hipLaunchKernelGGL((vol2col), dim3(blocks), dim3(threads), 0, + context.stream(), num_outputs, vol.data(), input_depth, input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], @@ -243,7 +245,8 @@ class Col2VolFunctor { const int threads = 1024; const int blocks = (num_kernels + 1024 - 1) / 1024; - col2vol<<>>( + hipLaunchKernelGGL((col2vol), dim3(blocks), dim3(threads), 0, + context.stream(), num_kernels, col.data(), input_depth, input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 4ebf20cbba69b..6ad15e35fa30d 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -50,7 +50,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu index da4a6af298f61..1575e23dc1f53 100644 --- a/paddle/fluid/operators/momentum_op.cu +++ b/paddle/fluid/operators/momentum_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -65,7 +66,7 @@ class MomentumOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (param->numel() + block - 1) / block; - MomentumKernel<<>>( + hipLaunchKernelGGL((MomentumKernel), dim3(grid), dim3(block), 0, ctx.cuda_device_context().stream(), p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out); } }; diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc index 757f9c3ee2665..2efea7e8faf4a 100644 --- a/paddle/fluid/operators/mul_op.cu.cc +++ b/paddle/fluid/operators/mul_op.cu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel, - ops::MulKernel); +REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); REGISTER_OP_CUDA_KERNEL(mul_grad, ops::MulGradKernel); diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt index ce0ddd89bfb0d..50b8a87e59132 100644 --- a/paddle/fluid/operators/nccl/CMakeLists.txt +++ b/paddle/fluid/operators/nccl/CMakeLists.txt @@ -1,3 +1,5 @@ if(WITH_GPU) nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) +elseif (WITH_AMD_GPU) + hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) endif() diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc index 08b61765c2f0f..aaf07328992a6 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -19,7 +19,7 @@ namespace paddle { namespace platform { namespace { // TODO(panyx0718): Where to destroy them. -std::unique_ptr> global_comms; +std::unique_ptr> global_comms; std::unique_ptr> comm_id_map; bool inited = false; size_t last_num_gpus = -1; @@ -42,21 +42,21 @@ void Communicator::InitAll(const std::vector& gpus) { if (global_comms) { for (size_t i = 0; i < global_comms->size(); ++i) { // FIXME(dzh) : PADDLE_ENFORCE return void - dynload::ncclCommDestroy((*global_comms)[i]); + dynload::rcclCommDestroy((*global_comms)[i]); } } - global_comms.reset(new std::vector()); + global_comms.reset(new std::vector()); comm_id_map.reset(new std::unordered_map()); global_comms->resize(gpus.size()); for (size_t i = 0; i < gpus.size(); ++i) { (*comm_id_map)[gpus[i]] = i; } PADDLE_ENFORCE( - dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data())); + dynload::rcclCommInitAll(global_comms->data(), (int)gpus.size(), (int*)gpus.data())); inited = true; } -const std::vector& Communicator::comms() const { +const std::vector& Communicator::comms() const { std::lock_guard guard(comm_mu); return *global_comms; } diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h index 113f93e346681..f2e4ded910f96 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.h +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h @@ -23,7 +23,11 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/rccl.h" +#else #include "paddle/fluid/platform/dynload/nccl.h" +#endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -38,7 +42,7 @@ struct Communicator { void InitAll(const std::vector& gpus); - const std::vector& comms() const; + const std::vector& comms() const; }; } // namespace platform diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc index ad623e1fe0f89..64cf7dcf36ccf 100644 --- a/paddle/fluid/operators/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl_op.cu.cc @@ -28,13 +28,13 @@ class NCCLTypeWrapper; template <> class NCCLTypeWrapper { public: - static const ncclDataType_t type = ncclFloat; + static const rcclDataType_t type = rcclFloat; }; template <> class NCCLTypeWrapper { public: - static const ncclDataType_t type = ncclDouble; + static const rcclDataType_t type = rcclDouble; }; template @@ -48,15 +48,15 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto* comm = ctx.Input("Communicator"); std::string reduction = ctx.Attr("reduction"); - ncclRedOp_t reduction_op_ = ncclSum; + rcclRedOp_t reduction_op_ = rcclSum; if (reduction == "ncclMin") { - reduction_op_ = ncclMin; + reduction_op_ = rcclMin; } else if (reduction == "ncclMax") { - reduction_op_ = ncclMax; + reduction_op_ = rcclMax; } else if (reduction == "ncclSum") { - reduction_op_ = ncclSum; + reduction_op_ = rcclSum; } else if (reduction == "ncclProd") { - reduction_op_ = ncclProd; + reduction_op_ = rcclProd; } else { PADDLE_THROW("Invalid reduction. default ncclSum."); } @@ -66,7 +66,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { VLOG(3) << "gpu : " << " invoke allreduce. send " << x->numel() << " recv " << out->numel(); - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE(platform::dynload::rcclAllReduce( x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, reduction_op_, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -88,15 +88,15 @@ class NCCLReduceKernel : public framework::OpKernel { int root = ctx.Attr("root"); std::string reduction = ctx.Attr("reduction"); - ncclRedOp_t reduction_op_ = ncclSum; + rcclRedOp_t reduction_op_ = rcclSum; if (reduction == "ncclMin") { - reduction_op_ = ncclMin; + reduction_op_ = rcclMin; } else if (reduction == "ncclMax") { - reduction_op_ = ncclMax; + reduction_op_ = rcclMax; } else if (reduction == "ncclSum") { - reduction_op_ = ncclSum; + reduction_op_ = rcclSum; } else if (reduction == "ncclProd") { - reduction_op_ = ncclProd; + reduction_op_ = rcclProd; } else { PADDLE_THROW("Invalid reduction. default ncclSum."); } @@ -111,9 +111,12 @@ class NCCLReduceKernel : public framework::OpKernel { } VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() << " recv " << out->numel(); - PADDLE_ENFORCE(platform::dynload::ncclReduce( +// ToDo: rcclReduce isn't implmented. +// PADDLE_ENFORCE(platform::dynload::rcclReduce( + PADDLE_ENFORCE(platform::dynload::rcclAllReduce( x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, - reduction_op_, root, comm->comms().at(idx), +// reduction_op_, root, comm->comms().at(idx), + reduction_op_, comm->comms().at(idx), ctx.cuda_device_context().stream())); VLOG(3) << "gpu : " << gpu_id << " finished reduce. 
send " << x->numel() << " recv " << out->numel(); @@ -134,7 +137,7 @@ class NCCLBcastKernel : public framework::OpKernel { if (idx == root) { auto* x = ctx.Input("X"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); - PADDLE_ENFORCE(platform::dynload::ncclBcast( + PADDLE_ENFORCE(platform::dynload::rcclBcast( (void*)x->data(), x->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); VLOG(3) << "gpu : " << gpu_id << " finished Bcast."; @@ -142,7 +145,7 @@ class NCCLBcastKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " << framework::product(out->dims()); - PADDLE_ENFORCE(platform::dynload::ncclBcast( + PADDLE_ENFORCE(platform::dynload::rcclBcast( out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 240ac895e2c83..80b66b173e611 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/one_hot_op.h" #include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/gpu_info.h" @@ -48,9 +49,9 @@ struct OneHotOpCUDAFunctor { auto stream = ctx_.stream(); math::set_constant(ctx_, out_, 0.0); - FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + hipLaunchKernelGGL((FillOutputKernel), dim3((numel + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS), + dim3(PADDLE_CUDA_NUM_THREADS), 0, stream, p_in_data, p_out_data, numel, depth_); } }; diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 39c862b03ad49..061e3118c3545 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/pool_op.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/miopen_helper.h" namespace paddle { namespace operators { @@ -25,7 +25,7 @@ using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; using DataLayout = platform::DataLayout; using PoolingMode = platform::PoolingMode; template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; +using ScalingParamType = typename platform::MIOpenDataType::ScalingParamType; template class PoolCUDNNOpKernel : public framework::OpKernel { @@ -63,9 +63,9 @@ class PoolCUDNNOpKernel : public framework::OpKernel { layout = DataLayout::kNCDHW; } - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, framework::vectorize2int(output->dims())); PoolingMode pooling_mode; @@ -75,15 +75,25 @@ class PoolCUDNNOpKernel : public framework::OpKernel { pooling_mode = PoolingMode::kAverage; } - cudnnPoolingDescriptor_t cudnn_pool_desc = + miopenPoolingDescriptor_t cudnn_pool_desc = pool_desc.descriptor(pooling_mode, ksize, paddings, strides); // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto handle = ctx.cuda_device_context().miopen_handle(); ScalingParamType alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. + PADDLE_ENFORCE(platform::dynload::miopenPoolingGetWorkSpaceSize( + cudnn_output_desc, &workspace_size_in_bytes)); + // Allocate on GPU memory + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + + PADDLE_ENFORCE(platform::dynload::miopenPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, - cudnn_output_desc, output_data)); + cudnn_output_desc, output_data, false, cudnn_workspace, workspace_size_in_bytes)); + + paddle::memory::Free(gpu, cudnn_workspace); } }; @@ -128,9 +138,9 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { layout = DataLayout::kNCDHW; } - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( layout, framework::vectorize2int(output->dims())); PoolingMode pooling_mode; @@ -140,20 +150,29 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { pooling_mode = PoolingMode::kAverage; } - cudnnPoolingDescriptor_t cudnn_pool_desc = + miopenPoolingDescriptor_t cudnn_pool_desc = pool_desc.descriptor(pooling_mode, ksize, paddings, strides); // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto handle = ctx.cuda_device_context().miopen_handle(); ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. 
- - PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. + PADDLE_ENFORCE(platform::dynload::miopenPoolingGetWorkSpaceSize( + cudnn_output_desc, &workspace_size_in_bytes)); + // Allocate on GPU memory + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + + PADDLE_ENFORCE(platform::dynload::miopenPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data)); + &beta, cudnn_input_desc, input_grad_data, cudnn_workspace)); + + paddle::memory::Free(gpu, cudnn_workspace); } } }; @@ -165,16 +184,11 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); + ops::PoolCUDNNOpKernel); REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); + ops::PoolCUDNNGradOpKernel); REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); + ops::PoolCUDNNOpKernel); REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); + ops::PoolCUDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index b144ec5f7d315..6e1b722d6e4dd 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -16,6 +16,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" #endif +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/miopen_helper.h" +#endif #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -88,6 +91,11 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( library_ = framework::LibraryType::kCUDNN; } #endif +#ifdef PADDLE_WITH_HIP + if (platform::CanMIOpenBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } +#endif #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { @@ -117,6 +125,11 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( library_ = framework::LibraryType::kCUDNN; } #endif +#ifdef PADDLE_WITH_HIP + if (platform::CanMIOpenBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } +#endif #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/prior_box_op.cu index 76bf2b3b7de7a..e582addf92ad0 100644 --- a/paddle/fluid/operators/prior_box_op.cu +++ b/paddle/fluid/operators/prior_box_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/prior_box_op.h" namespace paddle { @@ -19,7 +20,10 @@ namespace operators { template __device__ inline T clip(T in) { - return min(max(in, 0.), 1.); + // return min(max(in, 0.), 1.); + if(in > 1.) return 1.; + else if (in < 0.) 
return 0.; + else return in; } template @@ -146,16 +150,16 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { max_data = max.data(); } - GenPriorBox<<>>( - boxes->data(), r.data(), height, width, im_height, im_width, - aspect_ratios.size(), offset, step_width, step_height, min.data(), + hipLaunchKernelGGL((GenPriorBox), dim3(grid), dim3(block), 0, stream, + boxes->data(), r.data(), int(height), int(width), int(im_height), int(im_width), + int(aspect_ratios.size()), offset, step_width, step_height, min.data(), max_data, min_num, clip); framework::Tensor v; framework::TensorFromVector(variances, ctx.device_context(), &v); grid = (box_num * 4 + block - 1) / block; - SetVariance<<>>(vars->data(), v.data(), - variances.size(), box_num * 4); + hipLaunchKernelGGL((SetVariance), dim3(grid), dim3(block), 0, stream, vars->data(), v.data(), + int(variances.size()), box_num * 4); } }; // namespace operators diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 96c0c1cbe6d58..6d117b48d50e9 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -52,7 +52,7 @@ class DoubleBufferReader : public framework::DecoratedReader { explicit DoubleBufferReader( ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) for (size_t i = 0; i < kCacheSize; ++i) { if (platform::is_gpu_place(place_)) { ctxs_.emplace_back(new platform::CUDADeviceContext( diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 1931629d13407..09432074fde3c 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" +#include #include "paddle/fluid/operators/roi_pool_op.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -68,7 +70,15 @@ __global__ void GPUROIPoolForward(const int nthreads, const T* input_data, wend = min(max(wend + roi_start_w, 0), width); bool is_empty = (hend <= hstart) || (wend <= wstart); - T maxval = is_empty ? 0 : -std::numeric_limits::max(); + //T maxval = is_empty ? 
0 : -std::numeric_limits::max(); + T maxval = 0; + if (!is_empty) + { + if (std::is_same::value) + maxval = -FLT_MAX; + else + maxval = -DBL_MAX; + } int maxidx = -1; const T* offset_input_data = input_data + (roi_batch_ind * channels + c) * height * width; @@ -145,8 +155,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel { int blocks = NumBlocks(output_size); int threads = kNumCUDAThreads; - GPUROIPoolForward< - T><<>>( + hipLaunchKernelGGL((GPUROIPoolForward< + T>), dim3(blocks), dim3(threads), 0, ctx.cuda_device_context().stream(), output_size, in->data(), rois->data(), spatial_scale, channels, height, width, pooled_height, pooled_width, out->mutable_data(ctx.GetPlace()), @@ -184,10 +194,10 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIPoolBackward< - T><<>>( + hipLaunchKernelGGL((GPUROIPoolBackward< + T>), dim3(blocks), dim3(threads), 0, ctx.cuda_device_context().stream(), output_grad_size, rois->data(), out_grad->data(), - argmax->data(), rois_num, spatial_scale, channels, height, + argmax->data(), int(rois_num), spatial_scale, channels, height, width, pooled_height, pooled_width, x_grad->mutable_data(ctx.GetPlace())); } diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 67083455a7579..070338c26850c 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -39,7 +40,7 @@ __global__ void RowConvForwardSharedMemory(const T *in, const T *wt, int thy = threadIdx.y; int d = blockIdx.x * blx + thx; // index along input dim - extern __shared__ T mem[]; + HIP_DYNAMIC_SHARED( T, mem) T *sw = mem; if (thy < future_context) { @@ -106,7 +107,7 @@ __global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt, int thy = threadIdx.y; int d = blockIdx.x * blx + thx; // index along input dim - extern __shared__ T mem[]; + HIP_DYNAMIC_SHARED( T, mem) T *sw = mem; if (thy < future_context) { sw[thy * blx + thx] = @@ -171,7 +172,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, int gx = blockIdx.x * blx; int d = gx + thx; // index along input dim - extern __shared__ T mem[]; + HIP_DYNAMIC_SHARED( T, mem) int xdim_sh_in = block_y; int xdim_sh_dout = block_y; @@ -247,7 +248,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, int thy = threadIdx.y; int gx = blockIdx.x * blx; int d = gx + thx; // index along input dim - extern __shared__ T mem[]; + HIP_DYNAMIC_SHARED( T, mem) T *sh_in = mem; T *sh_dout = &mem[block_x * block_y]; @@ -314,13 +315,12 @@ class RowConvKernel dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); int mem_per_block = (future_context * block_dim.x) * sizeof(T); - RowConvForwardSharedMemory< - T><<>>( + hipLaunchKernelGGL((RowConvForwardSharedMemory), dim3(grid_dim), dim3(block_dim), mem_per_block, stream, in, weight, num_sequence, input_dim, future_context, idx, out); } else { dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); - RowConvForward<<>>( + hipLaunchKernelGGL((RowConvForward), dim3(grid_dim), 
dim3(block_dim), 0, stream, in, weight, num_sequence, input_dim, future_context, idx, out); } } @@ -363,8 +363,7 @@ class RowConvGradKernel (block_y * block_x + block_y * (block_x + future_context - 1) + future_context * block_y) * sizeof(T); - RowConvGradFilterImproved< - T><<>>( + hipLaunchKernelGGL((RowConvGradFilterImproved), dim3(grid_dim), dim3(block_dim), mem_per_block, device_ctx.stream(), in, dout, num_sequence, input_dim, future_context, block_x, block_y, idx, dfilter); } else { @@ -374,8 +373,7 @@ class RowConvGradKernel int block_y = block_dim.y; int mem_per_block = (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 - RowConvGradFilter< - T><<>>( + hipLaunchKernelGGL((RowConvGradFilter), dim3(grid_dim), dim3(block_dim), mem_per_block, device_ctx.stream(), in, dout, num_sequence, input_dim, future_context, block_x, block_y, idx, dfilter); } @@ -387,13 +385,12 @@ class RowConvGradKernel dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); int mem_per_block = (future_context * block_dim.x) * sizeof(T); - RowConvGradInputSharedMemory< - T><<>>( + hipLaunchKernelGGL((RowConvGradInputSharedMemory), dim3(grid_dim), dim3(block_dim), mem_per_block, device_ctx.stream(), dout, weights, num_sequence, input_dim, future_context, idx, din); } else { dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); - RowConvGradInput<<>>( + hipLaunchKernelGGL((RowConvGradInput), dim3(grid_dim), dim3(block_dim), 0, device_ctx.stream(), dout, weights, num_sequence, input_dim, future_context, idx, din); } } diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index ac7d69bfb549f..1e1639bf4347b 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "hip/hip_runtime.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" @@ -70,10 +71,8 @@ void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, int n = slice_size * index_size; int grid = (n + block - 1) / block; - ScatterCUDAKernel<<< - grid, block, 0, - reinterpret_cast(ctx).stream()>>>( - p_src, p_index, p_output, index_size, slice_size); + hipLaunchKernelGGL((ScatterCUDAKernel), dim3(grid), dim3(block), 0, reinterpret_cast(ctx).stream(), + p_src, p_index, p_output, size_t(index_size), size_t(slice_size)); } } // namespace operators diff --git a/paddle/fluid/operators/sequence_erase_op.cu b/paddle/fluid/operators/sequence_erase_op.cu index fc9b91c351def..1ddd5a238e2a2 100644 --- a/paddle/fluid/operators/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_erase_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include #include #include "paddle/fluid/operators/sequence_erase_op.h" @@ -78,8 +79,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { thrust::device_vector num_erased(in_len + 1, 0); size_t* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data()); auto stream = ctx.cuda_device_context().stream(); - LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + hipLaunchKernelGGL((LabelErasedIdx), dim3((in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1), dim3(PADDLE_CUDA_NUM_THREADS), 0, stream, in_dat, in_len, dev_tokens_ptr, tokens.size(), num_erased_ptr); thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(), num_erased.begin() + 1); @@ -92,8 +92,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { // Calc output LoD thrust::device_vector dev_out_lod(lod_len); size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); - GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + hipLaunchKernelGGL((GetOutLod), dim3((lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1), dim3(PADDLE_CUDA_NUM_THREADS), 0, stream, num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); // Set LoD for output std::vector out_lod0(dev_out_lod.begin(), dev_out_lod.end()); @@ -104,8 +103,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { // Set output out->Resize({static_cast(out_lod0.back()), 1}); auto out_dat = out->mutable_data(ctx.GetPlace()); - SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len, + hipLaunchKernelGGL((SetOutput), dim3((in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1), dim3(PADDLE_CUDA_NUM_THREADS), 0, stream, in_dat, in_len, num_erased_ptr, out_dat); } }; diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc index 5661f4b42f37f..a4a1d9c6c51cd 100644 --- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc @@ -98,8 +98,6 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace, - ops::SequenceSoftmaxCUDNNKernel, - ops::SequenceSoftmaxCUDNNKernel) + ops::SequenceSoftmaxCUDNNKernel); REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace, - ops::SequenceSoftmaxGradCUDNNKernel, - ops::SequenceSoftmaxGradCUDNNKernel) + ops::SequenceSoftmaxGradCUDNNKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc index e8b4df04286d3..ad6260a3e48f2 100644 --- a/paddle/fluid/operators/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_softmax_op.cc @@ -42,6 +42,13 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { ctx.template device_context(); runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; } +#endif +#ifdef PADDLE_WITH_HIP + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = + ctx.template device_context(); + runtime_cudnn_support = dev_ctx.miopen_handle() != nullptr ? true : false; + } #endif framework::LibraryType library_ = framework::LibraryType::kPlain; if (use_cudnn && runtime_cudnn_support) { @@ -138,6 +145,13 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { ctx.template device_context(); runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? 
true : false; } +#endif +#ifdef PADDLE_WITH_HIP + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = + ctx.template device_context(); + runtime_cudnn_support = dev_ctx.miopen_handle() != nullptr ? true : false; + } #endif framework::LibraryType library_ = framework::LibraryType::kPlain; if (use_cudnn && runtime_cudnn_support) { diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 9d211541c0bf7..050bda49d6f51 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #define EIGEN_USE_GPU #include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -73,7 +74,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (param->numel() + block - 1) / block; - SGDKernel<<>>( + hipLaunchKernelGGL((SGDKernel), + dim3(grid), dim3(block), 0, ctx.cuda_device_context().stream(), grad_data, param_data, learning_rate->data(), param->numel(), param_out_data); @@ -100,8 +102,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<>>( + hipLaunchKernelGGL((SparseSGDFunctorKernel), + dim3(grid), dim3(threads), 0, ctx.cuda_device_context().stream(), in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), out_data, in_row_numel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu index dfbb5c905884b..7e48bb182a3cf 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cu +++ b/paddle/fluid/operators/smooth_l1_loss_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #define EIGEN_USE_GPU #include "paddle/fluid/operators/smooth_l1_loss_op.h" diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 5596fa0648ccc..63f0eb9cc400a 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -58,7 +58,6 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel); + ops::SoftmaxCUDNNKernel); REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, ops::SoftmaxGradCUDNNKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index e2c0f915d96b7..be7ce3f05f812 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -17,6 +17,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cudnn_helper.h" #endif +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/miopen_helper.h" +#endif + #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -50,6 +54,11 @@ class SoftmaxOp : public framework::OperatorWithKernel { library_ = framework::LibraryType::kCUDNN; } #endif +#ifdef PADDLE_WITH_HIP + if (platform::CanMIOpenBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } +#endif #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { @@ -139,6 +148,11 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } +#endif +#ifdef PADDLE_WITH_HIP + if (platform::CanMIOpenBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } #endif std::string data_format = ctx.Attr("data_format"); return framework::OpKernelType( diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 8f7840cee1dd9..972a681f0e7c4 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #define EIGEN_USE_GPU #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" @@ -99,16 +100,20 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { if (context.Attr("soft_label")) { int grid = (batch_size * class_num + block - 1) / block; const T* label_data = labels->data(); - SoftCrossEntropyGradientKernel<<>>( + hipLaunchKernelGGL((SoftCrossEntropyGradientKernel), + dim3(grid), dim3(block), 0, stream, logit_grad_data, loss_grad_data, label_data, batch_size, class_num); } else { int grid = (batch_size + block - 1) / block; const int64_t* label_data = labels->data(); - CrossEntropyGrad<<>>( + hipLaunchKernelGGL((CrossEntropyGrad), + dim3(grid), dim3(block), 0, stream, logit_grad_data, label_data, batch_size, class_num); int num = batch_size * class_num; grid = (num + block - 1) / block; - Scale<<>>(logit_grad_data, loss_grad_data, num, + hipLaunchKernelGGL((Scale), + dim3(grid), dim3(block), 0, stream, + logit_grad_data, loss_grad_data, num, class_num); } } diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 3222cce239988..9b0ca58b3104d 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -54,7 +54,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h index 0e9ce165b9884..05603f0c658d5 100644 --- a/paddle/fluid/operators/split_selected_rows_op.h +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -86,7 +86,7 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), src + outs_dense_idx[i][j] * row_numel, 
sizeof(T) * row_numel); } else { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) auto stream = ctx.cuda_device_context().stream(); memory::Copy(platform::CUDAPlace(), dst + j * row_numel, platform::CUDAPlace(), diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 22c1db82e9f5a..1500f9e70f626 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -85,7 +85,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, memory::Copy(cpu_place, dst + i * dst_after, cpu_place, src + i * src_after, sizeof(T) * size); } else { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(ctx); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index e7e5346cdca5e..d71b2ba0fcbd5 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -73,7 +73,7 @@ class SumKernel : public framework::OpKernel { // If is in_place, we store the input[0] to in0 auto &in_sel0 = in_vars[0]->Get(); auto &rows = in_sel0.rows(); -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) std::vector rows_in_cpu; rows_in_cpu.reserve(rows.size()); for (auto item : rows) { diff --git a/paddle/fluid/operators/target_assign_op.cu b/paddle/fluid/operators/target_assign_op.cu index 24664f99b20f9..c8ba94f68cd9f 100644 --- a/paddle/fluid/operators/target_assign_op.cu +++ b/paddle/fluid/operators/target_assign_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "hip/hip_runtime.h" #include "paddle/fluid/operators/target_assign_op.h" namespace paddle { @@ -44,7 +45,7 @@ struct NegTargetAssignFunctor { WT* out_wt) { const int block_size = 256; const int grid_size = N; - NegTargetAssignKernel<<>>( + hipLaunchKernelGGL((NegTargetAssignKernel), dim3(grid_size), dim3(block_size), 0, ctx.stream(), neg_indices, lod, N, M, K, mismatch_value, out, out_wt); } }; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index bfd26c2f2294f..a006a245a0cfb 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "hip/hip_runtime.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/assert.h" @@ -148,7 +149,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, if (k < MaxLength - beam) { topk[k] = topk[k + beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-FP_INFINITE, -1); } } if (!is_empty) { @@ -179,7 +180,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, if (k < MaxLength - beam) { topk[k] = topk[k + beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-FP_INFINITE, -1); } } if (!is_empty) { @@ -265,7 +266,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, bool firststep = true; for (int k = 0; k < MaxLength; k++) { - topk[k].set(-INFINITY, -1); + topk[k].set(-FP_INFINITE, -1); } while (k) { ThreadGetTopK(topk, beam, k, @@ -305,12 +306,10 @@ class TopkOpCUDAKernel : public framework::OpKernel { dim3 threads(256, 1); dim3 grid(input_height, 1); - KeMatrixTopK<<< - grid, threads, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(output_data, output->dims()[1], + hipLaunchKernelGGL((KeMatrixTopK), + dim3(grid), dim3(threads), 0, reinterpret_cast(ctx.device_context()).stream(), output_data, output->dims()[1], indices_data, input_data, - input_width, input_width, int(k)); + int(input_width), int(input_width), int(k)); } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 6780b8cc6deca..19b99b3b9a024 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -22,6 +22,7 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) +hip_library(gpu_info SRCS gpu_info_hip.cc DEPS gflags glog enforce) cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) @@ -30,6 +31,8 @@ add_subdirectory(dynload) IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) +ELSEIF(WITH_AMD_GPU) + set(GPU_CTX_DEPS dynload_hip dynamic_loader) ELSE() set(GPU_CTX_DEPS) ENDIF() @@ -54,4 +57,4 @@ cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu) -cc_test(float16_test SRCS float16_test.cc) +hip_test(float16_test SRCS float16_test.cc) diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h index 123d3598f4f47..eac42879707d3 100644 --- a/paddle/fluid/platform/assert.h +++ b/paddle/fluid/platform/assert.h @@ -17,14 +17,13 @@ limitations under the License. */ #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) -#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG) +#if defined(__HIP_DEVICE_COMPILE__) && !defined(NDEBUG) #include #define PADDLE_ASSERT(e) \ do { \ if (!(e)) { \ printf("%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, \ TOSTRING(e)); \ - asm("trap;"); \ } \ } while (0) diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index a4ea4f21e3c16..c51869d3ca3e6 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include +#include "hip/hip_runtime.h" namespace paddle { namespace platform { @@ -42,7 +42,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_cast(val)); } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(__HIP_DEVICE_COMPILE__) && 0 //__CUDA_ARCH__ >= 600 USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h index ebd6aebd76885..142909cc79ecc 100644 --- a/paddle/fluid/platform/cuda_profiler.h +++ b/paddle/fluid/platform/cuda_profiler.h @@ -13,7 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_HIP +#include "hip/hip_runtime_api.h" +#else #include +#endif #include #include #include @@ -21,6 +25,17 @@ limitations under the License. */ namespace paddle { namespace platform { +#ifdef PADDLE_WITH_HIP +void CudaProfilerInit(std::string output_file, std::string output_mode, + std::string config_file) { +} + +void CudaProfilerStart() { PADDLE_ENFORCE(hipProfilerStart()); } + +void CudaProfilerStop() { PADDLE_ENFORCE(hipProfilerStop()); } + +#else + void CudaProfilerInit(std::string output_file, std::string output_mode, std::string config_file) { PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); @@ -33,5 +48,6 @@ void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); } void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); } +#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/cudnn_helper_test.cc index 517df6863499f..b8ef3cf0498d3 100644 --- a/paddle/fluid/platform/cudnn_helper_test.cc +++ b/paddle/fluid/platform/cudnn_helper_test.cc @@ -23,14 +23,13 @@ TEST(CudnnHelper, ScopedTensorDescriptor) { std::vector shape = {2, 4, 6, 6}; auto desc = tensor_desc.descriptor(DataLayout::kNCHW, shape); - cudnnDataType_t type; - int nd; + miopenDataType_t type; std::vector dims(4); std::vector strides(4); - paddle::platform::dynload::cudnnGetTensorNdDescriptor( - desc, 4, &type, &nd, dims.data(), strides.data()); + paddle::platform::dynload::miopenGet4dTensorDescriptor( + desc, &type, &dims[0], &dims[1], &dims[2], &dims[3], + &strides[0], &strides[1], &strides[2], &strides[3]); - EXPECT_EQ(nd, 4); for (size_t i = 0; i < dims.size(); ++i) { EXPECT_EQ(dims[i], shape[i]); } @@ -38,59 +37,27 @@ TEST(CudnnHelper, ScopedTensorDescriptor) { EXPECT_EQ(strides[2], 6); EXPECT_EQ(strides[1], 36); EXPECT_EQ(strides[0], 144); - - // test tensor5d: ScopedTensorDescriptor - ScopedTensorDescriptor tensor5d_desc; - std::vector shape_5d = {2, 4, 6, 6, 6}; - auto desc_5d = tensor5d_desc.descriptor(DataLayout::kNCDHW, shape_5d); - - std::vector dims_5d(5); - std::vector strides_5d(5); - paddle::platform::dynload::cudnnGetTensorNdDescriptor( - desc_5d, 5, &type, &nd, dims_5d.data(), strides_5d.data()); - - EXPECT_EQ(nd, 5); - for (size_t i = 0; i < dims_5d.size(); ++i) { - EXPECT_EQ(dims_5d[i], shape_5d[i]); - } - EXPECT_EQ(strides_5d[4], 1); - EXPECT_EQ(strides_5d[3], 6); - EXPECT_EQ(strides_5d[2], 36); - EXPECT_EQ(strides_5d[1], 216); - EXPECT_EQ(strides_5d[0], 864); } - TEST(CudnnHelper, ScopedFilterDescriptor) { using paddle::platform::ScopedFilterDescriptor; using paddle::platform::DataLayout; ScopedFilterDescriptor filter_desc; std::vector shape = {2, 3, 3}; - auto desc = filter_desc.descriptor(DataLayout::kNCHW, shape); - cudnnDataType_t type; - int nd; - cudnnTensorFormat_t format; + 
miopenDataType_t type; std::vector kernel(3); - paddle::platform::dynload::cudnnGetFilterNdDescriptor(desc, 3, &type, &format, - &nd, kernel.data()); - - EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format); - EXPECT_EQ(nd, 3); - for (size_t i = 0; i < shape.size(); ++i) { - EXPECT_EQ(kernel[i], shape[i]); - } ScopedFilterDescriptor filter_desc_4d; std::vector shape_4d = {2, 3, 3, 3}; auto desc_4d = filter_desc.descriptor(DataLayout::kNCDHW, shape_4d); std::vector kernel_4d(4); - paddle::platform::dynload::cudnnGetFilterNdDescriptor( - desc_4d, 4, &type, &format, &nd, kernel_4d.data()); + std::vector strides(4); + paddle::platform::dynload::miopenGet4dTensorDescriptor( + desc_4d, &type, &kernel_4d[0], &kernel_4d[1], &kernel_4d[2], &kernel_4d[3], + &strides[0], &strides[1], &strides[2], &strides[3]); - EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format); - EXPECT_EQ(nd, 4); for (size_t i = 0; i < shape_4d.size(); ++i) { EXPECT_EQ(kernel_4d[i], shape_4d[i]); } @@ -100,28 +67,25 @@ TEST(CudnnHelper, ScopedConvolutionDescriptor) { using paddle::platform::ScopedConvolutionDescriptor; ScopedConvolutionDescriptor conv_desc; - std::vector src_pads = {2, 2, 2}; - std::vector src_strides = {1, 1, 1}; - std::vector src_dilations = {1, 1, 1}; + std::vector src_pads = {2, 2}; + std::vector src_strides = {1, 1}; + std::vector src_dilations = {1, 1}; auto desc = conv_desc.descriptor(src_pads, src_strides, src_dilations); - cudnnDataType_t type; - cudnnConvolutionMode_t mode; - int nd; - std::vector pads(3); - std::vector strides(3); - std::vector dilations(3); - paddle::platform::dynload::cudnnGetConvolutionNdDescriptor( - desc, 3, &nd, pads.data(), strides.data(), dilations.data(), &mode, - &type); - - EXPECT_EQ(nd, 3); + miopenConvolutionMode_t mode; + std::vector pads(2); + std::vector strides(2); + std::vector dilations(2); + paddle::platform::dynload::miopenGetConvolutionDescriptor( + desc, &mode, &pads[0], &pads[1], &strides[0], &strides[1], + &dilations[0], &dilations[1]); + for (size_t i = 0; i < src_pads.size(); ++i) { EXPECT_EQ(pads[i], src_pads[i]); EXPECT_EQ(strides[i], src_strides[i]); EXPECT_EQ(dilations[i], src_dilations[i]); } - EXPECT_EQ(mode, CUDNN_CROSS_CORRELATION); + EXPECT_EQ(mode, miopenConvolution); } TEST(CudnnHelper, ScopedPoolingDescriptor) { @@ -129,26 +93,24 @@ TEST(CudnnHelper, ScopedPoolingDescriptor) { using paddle::platform::PoolingMode; ScopedPoolingDescriptor pool_desc; - std::vector src_kernel = {2, 2, 5}; - std::vector src_pads = {1, 1, 2}; - std::vector src_strides = {2, 2, 3}; + std::vector src_kernel = {2, 2}; + std::vector src_pads = {1, 1}; + std::vector src_strides = {2, 2}; auto desc = pool_desc.descriptor(PoolingMode::kMaximum, src_kernel, src_pads, src_strides); - cudnnPoolingMode_t mode; - cudnnNanPropagation_t nan_t = CUDNN_PROPAGATE_NAN; - int nd; - std::vector kernel(3); - std::vector pads(3); - std::vector strides(3); - paddle::platform::dynload::cudnnGetPoolingNdDescriptor( - desc, 3, &mode, &nan_t, &nd, kernel.data(), pads.data(), strides.data()); + miopenPoolingMode_t mode; + std::vector kernel(2); + std::vector pads(2); + std::vector strides(2); + paddle::platform::dynload::miopenGet2dPoolingDescriptor( + desc, &mode, &kernel[0], &kernel[1], &pads[0], &pads[1], + &strides[0], &strides[1]); - EXPECT_EQ(nd, 3); for (size_t i = 0; i < src_pads.size(); ++i) { EXPECT_EQ(kernel[i], src_kernel[i]); EXPECT_EQ(pads[i], src_pads[i]); EXPECT_EQ(strides[i], src_strides[i]); } - EXPECT_EQ(mode, CUDNN_POOLING_MAX); + EXPECT_EQ(mode, miopenPoolingMax); 
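// ---------------------------------------------------------------------------
// Illustrative sketch (not from the patch): where the stride expectations in
// the rewritten ScopedTensorDescriptor test come from. The 4d descriptors here
// are packed NCHW, so each stride is the product of the dimensions to its
// right: shape {2, 4, 6, 6} gives {4*6*6, 6*6, 6, 1} = {144, 36, 6, 1}.
#include <cstdio>
#include <vector>

std::vector<int> PackedNchwStrides(const std::vector<int>& dims) {
  std::vector<int> strides(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
}

int main() {
  for (int s : PackedNchwStrides({2, 4, 6, 6})) std::printf("%d ", s);
  // prints: 144 36 6 1
  return 0;
}
// ---------------------------------------------------------------------------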
} diff --git a/paddle/fluid/platform/details/device_ptr_cast.h b/paddle/fluid/platform/details/device_ptr_cast.h index 1c502a19c056c..1dbe1ed3e11a4 100644 --- a/paddle/fluid/platform/details/device_ptr_cast.h +++ b/paddle/fluid/platform/details/device_ptr_cast.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifndef __NVCC__ +#ifndef __HIPCC__ #error device_ptr_cast must be include by .cu file #endif diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index feb4f367008d7..50a327dc4cc77 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -46,7 +46,7 @@ DeviceContextPool::DeviceContextPool( p, PtrType(new CPUDeviceContext(boost::get(p)))); #endif } else if (platform::is_gpu_place(p)) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) device_contexts_.emplace( p, PtrType(new CUDADeviceContext(boost::get(p)))); #else @@ -55,7 +55,7 @@ DeviceContextPool::DeviceContextPool( "option"); #endif } else if (platform::is_cuda_pinned_place(p)) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) device_contexts_.emplace( p, PtrType(new CUDAPinnedDeviceContext(boost::get(p)))); @@ -212,6 +212,136 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const { Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } #endif +#ifdef PADDLE_WITH_HIP + +class EigenHipStreamDevice : public Eigen::StreamInterface { + public: + EigenHipStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenHipStreamDevice() override {} + + void Reinitialize(const hipStream_t* cuda_stream, CUDAPlace place) { + stream_ = cuda_stream; + place_ = place; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const hipStream_t& stream() const override { return *stream_; } + + const hipDeviceProp_t& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + return paddle::memory::Alloc(place_, num_bytes); + } + + void deallocate(void* buffer) const override { + paddle::memory::Free(place_, buffer); + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kHipScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = + static_cast(scratchpad()) + Eigen::kHipScratchSize; + semaphore_ = reinterpret_cast(scratch); + PADDLE_ENFORCE( + hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); + } + return semaphore_; + } + + private: + CUDAPlace place_; + const hipStream_t* stream_; // not owned; + const hipDeviceProp_t* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { + SetDeviceId(place_.device); + compute_capability = GetCUDAComputeCapability(place_.device); + multi_process = GetCUDAMultiProcessors(place_.device); + max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); + PADDLE_ENFORCE(hipStreamCreate(&stream_)); + eigen_stream_.reset(new EigenHipStreamDevice()); + eigen_stream_->Reinitialize(&stream_, place); + eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); + PADDLE_ENFORCE(dynload::hipblasCreate(&hipblas_handle_)); + PADDLE_ENFORCE(dynload::hipblasSetStream(hipblas_handle_, stream_)); + if (dynload::HasMIOpen()) { + 
PADDLE_ENFORCE(dynload::miopenCreate(&miopen_handle_)); + PADDLE_ENFORCE(dynload::miopenSetStream(miopen_handle_, stream_)); + } else { + miopen_handle_ = nullptr; + } +} + +CUDADeviceContext::~CUDADeviceContext() { + SetDeviceId(place_.device); + Wait(); + PADDLE_ENFORCE(dynload::hipblasDestroy(hipblas_handle_)); + if (miopen_handle_ != nullptr) { + PADDLE_ENFORCE(dynload::miopenDestroy(miopen_handle_)); + } + eigen_stream_.reset(); + eigen_device_.reset(); + PADDLE_ENFORCE(hipStreamDestroy(stream_)); +} + +Place CUDADeviceContext::GetPlace() const { return place_; } + +void CUDADeviceContext::Wait() const { + std::lock_guard guard(mutex_); + PADDLE_ENFORCE(hipStreamSynchronize(stream_)); + PADDLE_ENFORCE(hipGetLastError()); +} + +int CUDADeviceContext::GetComputeCapability() const { + return compute_capability; +} + +int CUDADeviceContext::GetMaxPhysicalThreadCount() const { + return multi_process * max_threads_per_mp; +} + +Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +hipblasHandle_t CUDADeviceContext::hipblas_handle() const { + return hipblas_handle_; +} + +miopenHandle_t CUDADeviceContext::miopen_handle() const { return miopen_handle_; } + +hipStream_t CUDADeviceContext::stream() const { return stream_; } + +CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +CUDAPinnedDeviceContext::CUDAPinnedDeviceContext(CUDAPinnedPlace place) + : place_(place) { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } +#endif + #ifdef PADDLE_WITH_MKLDNN MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 6b796d92d09cd..dfff9469e62f6 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -21,6 +21,13 @@ limitations under the License. */ #define EIGEN_USE_GPU #endif +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hipblas.h" +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/gpu_info.h" +#define EIGEN_USE_GPU +#endif + #ifdef PADDLE_WITH_MKLDNN #include #endif @@ -107,7 +114,80 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream_; cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; + int compute_capability; + int multi_process; + int max_threads_per_mp; +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = CUDADeviceContext; +}; + +// Currently, CUDAPinnedDeviceContext is only used to data copying. +class CUDAPinnedDeviceContext : public DeviceContext { + public: + CUDAPinnedDeviceContext(); + explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place); + + Place GetPlace() const override; + + Eigen::DefaultDevice* eigen_device() const; + + private: + CUDAPinnedPlace place_; + std::unique_ptr eigen_device_; +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = CUDAPinnedDeviceContext; +}; +#endif + +#ifdef PADDLE_WITH_HIP + +class EigenHipStreamDevice; + +class CUDADeviceContext : public DeviceContext { + public: + explicit CUDADeviceContext(CUDAPlace place); + virtual ~CUDADeviceContext(); + + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + /*! 
\brief Return place in the device context. */ + Place GetPlace() const override; + + /*! \brief Return compute capability in the device context. */ + int GetComputeCapability() const; + /*! \brief Return the max physical thread count in the device context */ + int GetMaxPhysicalThreadCount() const; + + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; + + /*! \brief Return hipblas handle in the device context. */ + hipblasHandle_t hipblas_handle() const; + + /*! \brief Return miopen handle in the device context. */ + miopenHandle_t miopen_handle() const; + + /*! \brief Return cuda stream in the device context. */ + hipStream_t stream() const; + + private: + CUDAPlace place_; + + std::unique_ptr eigen_device_; + std::unique_ptr eigen_stream_; + + mutable std::mutex mutex_; + hipStream_t stream_; + miopenHandle_t miopen_handle_; + hipblasHandle_t hipblas_handle_; int compute_capability; int multi_process; int max_threads_per_mp; diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 9d8d07362ce3a..d72cf284a78a6 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -40,9 +40,9 @@ TEST(Device, CUDADeviceContext) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); - cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); + miopenHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); - cublasHandle_t cublas_handle = device_context->cublas_handle(); + hipblasHandle_t cublas_handle = device_context->cublas_handle(); ASSERT_NE(nullptr, cublas_handle); ASSERT_NE(nullptr, device_context->stream()); delete device_context; diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 84dac2937de02..d6e865f6870e2 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -6,4 +6,8 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) + +list(APPEND HIP_SRCS hipblas.cc miopen.cc hiprand.cc rccl.cc) +hip_library(dynload_hip SRCS ${HIP_SRCS} DEPS dynamic_loader) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index fa9041134d863..1645361bcf7b1 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include +#include #include #include #include "paddle/fluid/platform/dynload/dynamic_loader.h" @@ -37,8 +37,8 @@ extern void *cublas_dso_handle; #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ struct DynLoad__##__name { \ template \ - inline cublasStatus_t operator()(Args... args) { \ - typedef cublasStatus_t (*cublasFunc)(Args...); \ + inline hipblasStatus_t operator()(Args... args) { \ + typedef hipblasStatus_t (*cublasFunc)(Args...); \ std::call_once(cublas_dso_flag, \ paddle::platform::dynload::GetCublasDsoHandle, \ &cublas_dso_handle); \ @@ -51,7 +51,7 @@ extern void *cublas_dso_handle; #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ struct DynLoad__##__name { \ template \ - inline cublasStatus_t operator()(Args... args) { \ + inline hipblasStatus_t operator()(Args... 
args) { \ return __name(args...); \ } \ }; \ @@ -62,34 +62,33 @@ extern void *cublas_dso_handle; DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasSaxpy_v2); \ - __macro(cublasDaxpy_v2); \ - __macro(cublasSgemv_v2); \ - __macro(cublasDgemv_v2); \ - __macro(cublasSgemm_v2); \ - __macro(cublasDgemm_v2); \ - __macro(cublasHgemm); \ - __macro(cublasSgemmEx); \ - __macro(cublasSgeam_v2); \ - __macro(cublasDgeam_v2); \ - __macro(cublasCreate_v2); \ - __macro(cublasDestroy_v2); \ - __macro(cublasSetStream_v2); \ - __macro(cublasSetPointerMode_v2); \ - __macro(cublasGetPointerMode_v2); \ - __macro(cublasSgemmBatched); \ - __macro(cublasDgemmBatched); \ - __macro(cublasCgemmBatched); \ - __macro(cublasZgemmBatched); \ - __macro(cublasSgemmStridedBatched); \ - __macro(cublasDgemmStridedBatched); \ - __macro(cublasCgemmStridedBatched); \ - __macro(cublasZgemmStridedBatched); \ - __macro(cublasHgemmStridedBatched); \ - __macro(cublasSgetrfBatched); \ - __macro(cublasSgetriBatched); \ - __macro(cublasDgetrfBatched); \ - __macro(cublasDgetriBatched) + __macro(hipblasSaxpy); \ + __macro(hipblasDaxpy); \ + __macro(hipblasSgemv); \ + __macro(hipblasDgemv); \ + __macro(hipblasSgemm); \ + __macro(hipblasDgemm); \ + __macro(hipblasSgeam); \ + __macro(hipblasDgeam); \ + __macro(hipblasCreate); \ + __macro(hipblasDestroy); \ + __macro(hipblasSetStream); \ + __macro(hipblasSetPointerMode); \ + __macro(hipblasGetPointerMode); \ + __macro(hipblasSgemmBatched); \ + __macro(hipblasDgemmBatched); \ + __macro(hipblasCgemmBatched); \ + __macro(hipblasZgemmBatched); \ + __macro(hipblasSgemmStridedBatched); \ + __macro(hipblasDgemmStridedBatched); \ + __macro(hipblasCgemmStridedBatched); \ + __macro(hipblasZgemmStridedBatched); \ + __macro(hipblasDgetrfBatched); \ + __macro(hipblasDgetriBatched) + + +//__macro(hipblasSgetrfBatched); +//_macro(hipblasSgetriBatched); CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 49a54d8478e9a..247cab85848e5 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -14,7 +14,7 @@ limitations under the License. 
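// ---------------------------------------------------------------------------
// Illustrative expansion (editorial, with the template parameter pack written
// out): roughly what DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(hipblasSgemm) produces
// in the PADDLE_USE_DSO branch above -- a functor that lazily loads the BLAS
// DSO once, resolves the symbol by name, and forwards the call.
struct DynLoad__hipblasSgemm {
  template <typename... Args>
  inline hipblasStatus_t operator()(Args... args) {
    typedef hipblasStatus_t (*cublasFunc)(Args...);
    std::call_once(cublas_dso_flag,
                   paddle::platform::dynload::GetCublasDsoHandle,
                   &cublas_dso_handle);
    void* p_hipblasSgemm = dlsym(cublas_dso_handle, "hipblasSgemm");
    return reinterpret_cast<cublasFunc>(p_hipblasSgemm)(args...);
  }
};
extern DynLoad__hipblasSgemm hipblasSgemm;
// Call sites then use paddle::platform::dynload::hipblasSgemm(...) exactly as
// if it were the library function itself.
// ---------------------------------------------------------------------------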
*/ #pragma once -#include +#include #include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" @@ -27,6 +27,28 @@ extern std::once_flag cudnn_dso_flag; extern void* cudnn_dso_handle; extern bool HasCUDNN(); +inline const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "MIOPEN_STATUS_SUCCESS"; + case miopenStatusNotInitialized: + return "MIOPEN_STATUS_NOT_INITIALIZED"; + case miopenStatusInvalidValue: + return "MIOPEN_STATUS_INVALID_VALUE"; + case miopenStatusBadParm: + return "MIOPEN_STATUS_BAD_PARAM"; + case miopenStatusAllocFailed: + return "MIOPEN_STATUS_ALLOC_FAILED"; + case miopenStatusInternalError: + return "MIOPEN_STATUS_INTERNAL_ERROR"; + case miopenStatusNotImplemented: + return "MIOPEN_STATUS_NOT_IMPLEMENTED"; + case miopenStatusUnknownError: + default: + return "MIOPEN_STATUS_UNKNOWN_ERROR"; + } +} + #ifdef PADDLE_USE_DSO extern void EnforceCUDNNLoaded(const char* fn_name); @@ -63,88 +85,63 @@ extern void EnforceCUDNNLoaded(const char* fn_name); * different cudnn version has different interfaces **/ #define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor); \ - __macro(cudnnSetTensor4dDescriptorEx); \ - __macro(cudnnSetTensorNdDescriptor); \ - __macro(cudnnGetTensorNdDescriptor); \ - __macro(cudnnGetConvolutionNdForwardOutputDim); \ - __macro(cudnnGetConvolutionForwardAlgorithm); \ - __macro(cudnnCreateTensorDescriptor); \ - __macro(cudnnDestroyTensorDescriptor); \ - __macro(cudnnCreateFilterDescriptor); \ - __macro(cudnnSetFilter4dDescriptor); \ - __macro(cudnnSetFilterNdDescriptor); \ - __macro(cudnnGetFilterNdDescriptor); \ - __macro(cudnnSetPooling2dDescriptor); \ - __macro(cudnnSetPoolingNdDescriptor); \ - __macro(cudnnGetPoolingNdDescriptor); \ - __macro(cudnnDestroyFilterDescriptor); \ - __macro(cudnnCreateConvolutionDescriptor); \ - __macro(cudnnCreatePoolingDescriptor); \ - __macro(cudnnDestroyPoolingDescriptor); \ - __macro(cudnnSetConvolution2dDescriptor); \ - __macro(cudnnDestroyConvolutionDescriptor); \ - __macro(cudnnSetConvolutionNdDescriptor); \ - __macro(cudnnGetConvolutionNdDescriptor); \ - __macro(cudnnDeriveBNTensorDescriptor); \ - __macro(cudnnCreate); \ - __macro(cudnnDestroy); \ - __macro(cudnnSetStream); \ - __macro(cudnnActivationForward); \ - __macro(cudnnConvolutionForward); \ - __macro(cudnnConvolutionBackwardBias); \ - __macro(cudnnGetConvolutionForwardWorkspaceSize); \ - __macro(cudnnTransformTensor); \ - __macro(cudnnPoolingForward); \ - __macro(cudnnPoolingBackward); \ - __macro(cudnnSoftmaxBackward); \ - __macro(cudnnSoftmaxForward); \ - __macro(cudnnGetVersion); \ - __macro(cudnnGetErrorString); + __macro(miopenSet4dTensorDescriptor); \ + __macro(miopenGet4dTensorDescriptor); \ + __macro(miopenFindConvolutionForwardAlgorithm); \ + __macro(miopenGetConvolutionDescriptor); \ + __macro(miopenCreateTensorDescriptor); \ + __macro(miopenDestroyTensorDescriptor); \ + __macro(miopenSet2dPoolingDescriptor); \ + __macro(miopenGet2dPoolingDescriptor); \ + __macro(miopenCreateConvolutionDescriptor); \ + __macro(miopenCreatePoolingDescriptor); \ + __macro(miopenDestroyPoolingDescriptor); \ + __macro(miopenInitConvolutionDescriptor); \ + __macro(miopenDestroyConvolutionDescriptor); \ + __macro(miopenDeriveBNTensorDescriptor); \ + __macro(miopenCreate); \ + __macro(miopenDestroy); \ + __macro(miopenSetStream); \ + __macro(miopenActivationForward); \ + __macro(miopenConvolutionForward); \ + __macro(miopenConvolutionBackwardBias); \ + 
__macro(miopenConvolutionForwardGetWorkSpaceSize); \ + __macro(miopenPoolingGetWorkSpaceSize); \ + __macro(miopenPoolingForward); \ + __macro(miopenPoolingBackward); \ + __macro(miopenSoftmaxBackward); \ + __macro(miopenSoftmaxForward); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ - __macro(cudnnAddTensor); \ - __macro(cudnnConvolutionBackwardData); \ - __macro(cudnnConvolutionBackwardFilter); + __macro(miopenAddTensor); \ + __macro(miopenConvolutionBackwardData); \ + __macro(miopenConvolutionBackwardWeights); CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) // APIs available after R3: -#if CUDNN_VERSION >= 3000 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ - __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ - __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); + __macro(miopenConvolutionBackwardWeightsGetWorkspaceSize); \ + __macro(miopenFindConvolutionBackwardDataAlgorithm); \ + __macro(miopenFindConvolutionBackwardWeightsAlgorithm); \ + __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize); \ + __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif // APIs available after R4: -#if CUDNN_VERSION >= 4007 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ - __macro(cudnnBatchNormalizationForwardTraining); \ - __macro(cudnnBatchNormalizationForwardInference); \ - __macro(cudnnBatchNormalizationBackward); + __macro(miopenBatchNormalizationForwardTraining); \ + __macro(miopenBatchNormalizationForwardInference); \ + __macro(miopenBatchNormalizationBackward); CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif // APIs in R5 -#if CUDNN_VERSION >= 5000 #define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ - __macro(cudnnCreateActivationDescriptor); \ - __macro(cudnnSetActivationDescriptor); \ - __macro(cudnnGetActivationDescriptor); \ - __macro(cudnnDestroyActivationDescriptor); + __macro(miopenCreateActivationDescriptor); \ + __macro(miopenSetActivationDescriptor); \ + __macro(miopenGetActivationDescriptor); \ + __macro(miopenDestroyActivationDescriptor); CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#if CUDNN_VERSION >= 7001 -#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ - __macro(cudnnSetConvolutionGroupCount); \ - __macro(cudnnSetConvolutionMathType); -CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 1b3ff962d6edc..44f424a2eab70 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include +#include #include #include #include "paddle/fluid/platform/dynload/dynamic_loader.h" @@ -28,8 +28,8 @@ extern void *curand_dso_handle; #define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ struct DynLoad__##__name { \ template \ - curandStatus_t operator()(Args... args) { \ - typedef curandStatus_t (*curandFunc)(Args...); \ + hiprandStatus_t operator()(Args... 
args) { \ + typedef hiprandStatus_t (*curandFunc)(Args...); \ std::call_once(curand_dso_flag, \ paddle::platform::dynload::GetCurandDsoHandle, \ &curand_dso_handle); \ @@ -42,7 +42,7 @@ extern void *curand_dso_handle; #define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ struct DynLoad__##__name { \ template \ - curandStatus_t operator()(Args... args) { \ + hiprandStatus_t operator()(Args... args) { \ return __name(args...); \ } \ }; \ @@ -50,13 +50,13 @@ extern void *curand_dso_handle; #endif #define CURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(curandCreateGenerator); \ - __macro(curandSetStream); \ - __macro(curandSetPseudoRandomGeneratorSeed); \ - __macro(curandGenerateUniform); \ - __macro(curandGenerateUniformDouble); \ - __macro(curandGenerateNormal); \ - __macro(curandDestroyGenerator); + __macro(hiprandCreateGenerator); \ + __macro(hiprandSetStream); \ + __macro(hiprandSetPseudoRandomGeneratorSeed); \ + __macro(hiprandGenerateUniform); \ + __macro(hiprandGenerateUniformDouble); \ + __macro(hiprandGenerateNormal); \ + __macro(hiprandDestroyGenerator); CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index e590e81bab51f..0f2befc149da7 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -40,6 +40,10 @@ DEFINE_string(nccl_dir, "", "Specify path for loading nccl library, such as libcublas, " "libcurand. For instance, /usr/local/cuda/lib64. If default, " "dlopen will search cuda from LD_LIBRARY_PATH"); +DEFINE_string(rccl_dir, "", + "Specify path for loading nccl library, such as libcublas, " + "libcurand. For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); @@ -132,18 +136,19 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, void GetCublasDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libhipblas.dylib", dso_handle); #else - GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libhipblas.so", dso_handle); #endif } void GetCUDNNDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libMIOpen.dylib", dso_handle, false); #else - GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, + "libMIOpen.so", dso_handle, false); #endif } @@ -161,9 +166,9 @@ void GetCUPTIDsoHandle(void** dso_handle) { void GetCurandDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libhiprand.dylib", dso_handle); #else - GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libhiprand.so", dso_handle); #endif } @@ -190,6 +195,9 @@ void GetNCCLDsoHandle(void** dso_handle) { GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle); #endif } +void GetRCCLDsoHandle(void** dso_handle) { + GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", dso_handle); 
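// ---------------------------------------------------------------------------
// Editorial summary (not part of the patch): the loaders keep their CUDA-era
// names but now resolve ROCm shared objects:
//   GetCublasDsoHandle -> libhipblas.so  (was libcublas.so)
//   GetCUDNNDsoHandle  -> libMIOpen.so   (was libcudnn.so)
//   GetCurandDsoHandle -> libhiprand.so  (was libcurand.so)
//   GetNCCLDsoHandle   -> libnccl.so     (unchanged)
//   GetRCCLDsoHandle   -> librccl.so     (new; searched under FLAGS_rccl_dir)
// Minimal usage sketch, assuming only the declared signature:
void* LoadRcclHandle() {
  void* handle = nullptr;
  // dlopen()s librccl.so from FLAGS_rccl_dir, falling back to LD_LIBRARY_PATH.
  paddle::platform::dynload::GetRCCLDsoHandle(&handle);
  return handle;
}
// ---------------------------------------------------------------------------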
+} } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index b5b9c4af91624..7e79a1ca618f6 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -67,6 +67,7 @@ void GetLapackDsoHandle(void** dso_handle); * */ void GetNCCLDsoHandle(void** dso_handle); +void GetRCCLDsoHandle(void** dso_handle); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/hipblas.cc b/paddle/fluid/platform/dynload/hipblas.cc new file mode 100644 index 0000000000000..bd17503b558c7 --- /dev/null +++ b/paddle/fluid/platform/dynload/hipblas.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/hipblas.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hipblas.h b/paddle/fluid/platform/dynload/hipblas.h new file mode 100644 index 0000000000000..1645361bcf7b1 --- /dev/null +++ b/paddle/fluid/platform/dynload/hipblas.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cublas_dso_flag; +extern void *cublas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline hipblasStatus_t operator()(Args... 
args) { \ + typedef hipblasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, \ + paddle::platform::dynload::GetCublasDsoHandle, \ + &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline hipblasStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ + DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(hipblasSaxpy); \ + __macro(hipblasDaxpy); \ + __macro(hipblasSgemv); \ + __macro(hipblasDgemv); \ + __macro(hipblasSgemm); \ + __macro(hipblasDgemm); \ + __macro(hipblasSgeam); \ + __macro(hipblasDgeam); \ + __macro(hipblasCreate); \ + __macro(hipblasDestroy); \ + __macro(hipblasSetStream); \ + __macro(hipblasSetPointerMode); \ + __macro(hipblasGetPointerMode); \ + __macro(hipblasSgemmBatched); \ + __macro(hipblasDgemmBatched); \ + __macro(hipblasCgemmBatched); \ + __macro(hipblasZgemmBatched); \ + __macro(hipblasSgemmStridedBatched); \ + __macro(hipblasDgemmStridedBatched); \ + __macro(hipblasCgemmStridedBatched); \ + __macro(hipblasZgemmStridedBatched); \ + __macro(hipblasDgetrfBatched); \ + __macro(hipblasDgetriBatched) + + +//__macro(hipblasSgetrfBatched); +//_macro(hipblasSgetriBatched); + +CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hiprand.cc b/paddle/fluid/platform/dynload/hiprand.cc new file mode 100644 index 0000000000000..3a006a8bd05a4 --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprand.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/hiprand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hiprand.h b/paddle/fluid/platform/dynload/hiprand.h new file mode 100644 index 0000000000000..44f424a2eab70 --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprand.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
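// ---------------------------------------------------------------------------
// Illustrative usage sketch (not from the patch, assuming the standard hipBLAS
// C API): a single-precision GEMM issued through the dynload wrappers declared
// in hipblas.h above; PADDLE_ENFORCE error checking is omitted for brevity.
#include <hipblas.h>

// C = A * B for column-major A (m x k), B (k x n), C (m x n) on the device.
void MatMul(hipblasHandle_t handle, const float* A, const float* B, float* C,
            int m, int n, int k) {
  const float alpha = 1.0f, beta = 0.0f;
  paddle::platform::dynload::hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N,
                                          m, n, k, &alpha, A, m, B, k, &beta,
                                          C, m);
}
// ---------------------------------------------------------------------------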
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag curand_dso_flag; +extern void *curand_dso_handle; +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + hiprandStatus_t operator()(Args... args) { \ + typedef hiprandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, \ + paddle::platform::dynload::GetCurandDsoHandle, \ + &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + hiprandStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(hiprandCreateGenerator); \ + __macro(hiprandSetStream); \ + __macro(hiprandSetPseudoRandomGeneratorSeed); \ + __macro(hiprandGenerateUniform); \ + __macro(hiprandGenerateUniformDouble); \ + __macro(hiprandGenerateNormal); \ + __macro(hiprandDestroyGenerator); + +CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc new file mode 100644 index 0000000000000..da5c9344f4f59 --- /dev/null +++ b/paddle/fluid/platform/dynload/miopen.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
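// ---------------------------------------------------------------------------
// Illustrative usage sketch (not from the patch, assuming the hipRAND host API
// mirrors cuRAND as the wrappers above imply): filling a device buffer with
// uniform floats through the dynload layer; return codes would normally be
// wrapped in PADDLE_ENFORCE.
#include <hiprand.h>
#include "hip/hip_runtime.h"

void FillUniform(float* gpu_data, size_t n, unsigned long long seed,
                 hipStream_t stream) {
  namespace dyn = paddle::platform::dynload;
  hiprandGenerator_t gen;
  dyn::hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT);
  dyn::hiprandSetStream(gen, stream);
  dyn::hiprandSetPseudoRandomGeneratorSeed(gen, seed);
  dyn::hiprandGenerateUniform(gen, gpu_data, n);
  dyn::hiprandDestroyGenerator(gen);
}
// ---------------------------------------------------------------------------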
*/ + +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); +CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R5 +CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R7 +CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + +#ifdef PADDLE_USE_DSO +bool HasMIOpen() { + std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle); + return cudnn_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE(cudnn_dso_handle != nullptr, + "Cannot load cudnn shared library. Cannot invoke method %s", + fn_name); +} +#else +bool HasMIOpen() { return true; } +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h new file mode 100644 index 0000000000000..a6cbf5dad63a2 --- /dev/null +++ b/paddle/fluid/platform/dynload/miopen.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cudnn_dso_flag; +extern void* cudnn_dso_handle; +extern bool HasMIOpen(); + +inline const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "MIOPEN_STATUS_SUCCESS"; + case miopenStatusNotInitialized: + return "MIOPEN_STATUS_NOT_INITIALIZED"; + case miopenStatusInvalidValue: + return "MIOPEN_STATUS_INVALID_VALUE"; + case miopenStatusBadParm: + return "MIOPEN_STATUS_BAD_PARAM"; + case miopenStatusAllocFailed: + return "MIOPEN_STATUS_ALLOC_FAILED"; + case miopenStatusInternalError: + return "MIOPEN_STATUS_INTERNAL_ERROR"; + case miopenStatusNotImplemented: + return "MIOPEN_STATUS_NOT_IMPLEMENTED"; + case miopenStatusUnknownError: + default: + return "MIOPEN_STATUS_UNKNOWN_ERROR"; + } +} + +#ifdef PADDLE_USE_DSO + +extern void EnforceCUDNNLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, \ + paddle::platform::dynload::GetCUDNNDsoHandle, \ + &cudnn_dso_handle); \ + EnforceCUDNNLoaded(#__name); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +#else + +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#endif + +/** + * include all needed cudnn functions in HPPL + * different cudnn version has different interfaces + **/ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(miopenSet4dTensorDescriptor); \ + __macro(miopenGet4dTensorDescriptor); \ + __macro(miopenFindConvolutionForwardAlgorithm); \ + __macro(miopenGetConvolutionDescriptor); \ + __macro(miopenCreateTensorDescriptor); \ + __macro(miopenDestroyTensorDescriptor); \ + __macro(miopenSet2dPoolingDescriptor); \ + __macro(miopenGet2dPoolingDescriptor); \ + __macro(miopenCreateConvolutionDescriptor); \ + __macro(miopenCreatePoolingDescriptor); \ + __macro(miopenDestroyPoolingDescriptor); \ + __macro(miopenInitConvolutionDescriptor); \ + __macro(miopenDestroyConvolutionDescriptor); \ + __macro(miopenDeriveBNTensorDescriptor); \ + __macro(miopenCreate); \ + __macro(miopenDestroy); \ + __macro(miopenSetStream); \ + __macro(miopenActivationForward); \ + __macro(miopenConvolutionForward); \ + __macro(miopenConvolutionBackwardBias); \ + __macro(miopenConvolutionForwardGetWorkSpaceSize); \ + __macro(miopenPoolingGetWorkSpaceSize); \ + __macro(miopenPoolingForward); \ + __macro(miopenPoolingBackward); \ + __macro(miopenSoftmaxBackward); \ + __macro(miopenSoftmaxForward); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(miopenAddTensor); \ + __macro(miopenConvolutionBackwardData); \ + __macro(miopenConvolutionBackwardWeights); +CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R3: +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(miopenConvolutionBackwardWeightsGetWorkspaceSize); \ + __macro(miopenFindConvolutionBackwardDataAlgorithm); \ + __macro(miopenFindConvolutionBackwardWeightsAlgorithm); \ + __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize); \ + __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \ + __macro(miopenConvolutionForwardGetWorkspaceSize); +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R4: +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(miopenBatchNormalizationForwardTraining); \ + __macro(miopenBatchNormalizationForwardInference); \ + __macro(miopenBatchNormalizationBackward); +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs in R5 +#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(miopenCreateActivationDescriptor); \ + __macro(miopenSetActivationDescriptor); \ + __macro(miopenGetActivationDescriptor); \ + __macro(miopenDestroyActivationDescriptor); +CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index dc78bcb44d331..925fcb9807583 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ 
b/paddle/fluid/platform/dynload/nccl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include -#include +#include #include #include "paddle/fluid/platform/call_once.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" @@ -45,7 +45,7 @@ extern void LoadNCCLDSO(); #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ struct DynLoad__##__name { \ template \ - ncclResult_t operator()(Args... args) { \ + rcclResult_t operator()(Args... args) { \ return __name(args...); \ } \ }; \ @@ -53,20 +53,20 @@ extern void LoadNCCLDSO(); #endif #define NCCL_RAND_ROUTINE_EACH(__macro) \ - __macro(ncclCommInitAll); \ - __macro(ncclGetUniqueId); \ - __macro(ncclCommInitRank); \ - __macro(ncclCommDestroy); \ - __macro(ncclCommCount); \ - __macro(ncclCommCuDevice); \ - __macro(ncclCommUserRank); \ - __macro(ncclAllReduce); \ - __macro(ncclBcast); \ - __macro(ncclAllGather); \ - __macro(ncclGroupStart); \ - __macro(ncclGroupEnd); \ - __macro(ncclReduce); \ - __macro(ncclGetErrorString); + __macro(rcclCommInitAll); \ + __macro(rcclGetUniqueId); \ + __macro(rcclCommInitRank); \ + __macro(rcclCommDestroy); \ + __macro(rcclCommCount); \ + __macro(rcclCommCuDevice); \ + __macro(rcclCommUserRank); \ + __macro(rcclAllReduce); \ + __macro(rcclBcast); \ + __macro(rcclAllGather); \ + __macro(rcclGroupStart); \ + __macro(rcclGroupEnd); \ + __macro(rcclReduce); \ + __macro(rcclGetErrorString); NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) diff --git a/paddle/fluid/platform/dynload/rccl.cc b/paddle/fluid/platform/dynload/rccl.cc new file mode 100644 index 0000000000000..2a35839b00157 --- /dev/null +++ b/paddle/fluid/platform/dynload/rccl.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/rccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag rccl_dso_flag; +void *rccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +void LoadRCCLDSO() { + platform::call_once(rccl_dso_flag, + [] { GetRCCLDsoHandle(&rccl_dso_handle); }); +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h new file mode 100644 index 0000000000000..eeccd6aae125e --- /dev/null +++ b/paddle/fluid/platform/dynload/rccl.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
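// ---------------------------------------------------------------------------
// Illustrative usage sketch (not from the patch; it assumes RCCL mirrors the
// NCCL 1.x API with an rccl prefix, as the renames above suggest -- the
// rcclFloat/rcclSum enum names in particular are assumptions): an all-reduce
// issued through the dynload wrappers, where the first call triggers
// LoadRCCLDSO() and resolves librccl.so.
#include <rccl.h>
#include "hip/hip_runtime.h"

void AllReduceSum(const float* send, float* recv, int count, rcclComm_t comm,
                  hipStream_t stream) {
  paddle::platform::dynload::rcclAllReduce(send, recv, count, rcclFloat,
                                           rcclSum, comm, stream);
}
// ---------------------------------------------------------------------------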
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/call_once.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag rccl_dso_flag; +extern void* rccl_dso_handle; + +#ifdef PADDLE_USE_DSO +extern void LoadRCCLDSO(); + +#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using rccl_func = decltype(__name(args...)) (*)(Args...); \ + paddle::platform::dynload::LoadRCCLDSO(); \ + void* p_##__name = dlsym(rccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + rcclResult_t operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define RCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(rcclCommInitAll); \ + __macro(rcclGetUniqueId); \ + __macro(rcclCommInitRank); \ + __macro(rcclCommDestroy); \ + __macro(rcclCommCount); \ + __macro(rcclCommCuDevice); \ + __macro(rcclCommUserRank); \ + __macro(rcclAllReduce); \ + __macro(rcclBcast); \ + __macro(rcclAllGather); \ + __macro(rcclReduce); \ + __macro(rcclGetErrorString); + +RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index d303fd6d63f84..e8599739dd6a1 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -47,6 +47,23 @@ limitations under the License. */ #endif +#ifdef PADDLE_WITH_HIP + +#include "paddle/fluid/platform/dynload/hipblas.h" +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/dynload/hiprand.h" +#include "paddle/fluid/platform/dynload/rccl.h" + +#include +#include +#include +#include +#include +#include +#include + +#endif + namespace paddle { namespace platform { @@ -185,7 +202,75 @@ inline typename std::enable_if::type throw_on_error( } } -#endif // PADDLE_ONLY_CPU +#endif // PADDLE_WITH_CUDA + + +#ifdef PADDLE_WITH_HIP + +template +inline typename std::enable_if::type throw_on_error( + hipError_t e, const Args&... args) { + if (UNLIKELY(e)) { + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + hiprandStatus_t stat, const Args&... args) { + if (stat != HIPRAND_STATUS_SUCCESS) { + throw thrust::system_error(hipErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + miopenStatus_t stat, const Args&... args) { + if (stat == miopenStatusSuccess) { + return; + } else { + throw std::runtime_error(platform::dynload::miopenGetErrorString(stat) + + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + hipblasStatus_t stat, const Args&... 
args) { + std::string err; + if (stat == HIPBLAS_STATUS_SUCCESS) { + return; + } else if (stat == HIPBLAS_STATUS_NOT_INITIALIZED) { + err = "CUBLAS: not initialized, "; + } else if (stat == HIPBLAS_STATUS_ALLOC_FAILED) { + err = "CUBLAS: alloc failed, "; + } else if (stat == HIPBLAS_STATUS_INVALID_VALUE) { + err = "CUBLAS: invalid value, "; + } else if (stat == HIPBLAS_STATUS_MAPPING_ERROR) { + err = "CUBLAS: mapping error, "; + } else if (stat == HIPBLAS_STATUS_EXECUTION_FAILED) { + err = "CUBLAS: execution failed, "; + } else if (stat == HIPBLAS_STATUS_INTERNAL_ERROR) { + err = "CUBLAS: internal error, "; + } else if (stat == HIPBLAS_STATUS_NOT_SUPPORTED) { + err = "CUBLAS: not supported, "; + } + throw std::runtime_error(err + string::Sprintf(args...)); +} + +template +inline typename std::enable_if::type throw_on_error( + rcclResult_t stat, const Args&... args) { + if (stat == rcclSuccess) { + return; + } else { + throw std::runtime_error(string::Sprintf(args...)); + } +} + +#endif // PADDLE_WITH_HIP template inline void throw_on_error(T e) { diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 2cf311c7e56a9..939f895a0f595 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -20,6 +20,10 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_HIP +#include +#endif // PADDLE_WITH_HIP + #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) #else @@ -87,7 +91,9 @@ struct PADDLE_ALIGN(2) float16 { float16& operator=(const float16& o) = default; float16(float16&& o) = default; float16& operator=(float16&& o) = default; +#ifndef PADDLE_WITH_HIP ~float16() = default; +#endif // Constructors #ifdef PADDLE_CUDA_FP16 diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index c153e80fe42ae..8c52fbef4ebd0 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "hip/hip_runtime.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -40,7 +41,7 @@ struct ForRange { size_t limit_; }; -#ifdef __NVCC__ +#ifdef __HIPCC__ template __global__ static void ForRangeElemwiseOpGridIsOne(Function func) { size_t idx = static_cast(threadIdx.x); @@ -67,10 +68,10 @@ struct ForRange { int grid_size = (limit_ + num_threads - 1) / num_threads; if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + hipLaunchKernelGGL((ForRangeElemwiseOpGridIsOne), dim3(1), dim3(block_size), 0, dev_ctx_.stream(), func); } else { - ForRangeElemwiseOp<<>>( + hipLaunchKernelGGL((ForRangeElemwiseOp), dim3(grid_size), dim3(block_size), 0, dev_ctx_.stream(), func, limit_); } } diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index fa469fa77f5ca..4dc0507fe0ef0 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -15,8 +15,13 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_CUDA - #include +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include +#endif // PADDLE_WITH_HIP + #include #include @@ -57,6 +62,7 @@ size_t GpuMinChunkSize(); //! Get the maximum chunk size for GPU buddy allocator. size_t GpuMaxChunkSize(); +#ifdef PADDLE_WITH_CUDA //! Copy memory from address src to dst asynchronously. 
void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); @@ -67,8 +73,20 @@ void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, //! Set memory dst with value count size asynchronously void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream); +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +//! Copy memory from address src to dst asynchronously. +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + enum hipMemcpyKind kind, hipStream_t stream); + +//! Copy memory from one device to another device. +void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, + size_t count, hipStream_t stream); + +//! Set memory dst with value count size asynchronously +void GpuMemsetAsync(void *dst, int value, size_t count, hipStream_t stream); +#endif // PADDLE_WITH_HIP } // namespace platform } // namespace paddle - -#endif diff --git a/paddle/fluid/platform/gpu_info_hip.cc b/paddle/fluid/platform/gpu_info_hip.cc new file mode 100644 index 0000000000000..6850a37e52845 --- /dev/null +++ b/paddle/fluid/platform/gpu_info_hip.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/gpu_info.h" + +#include "gflags/gflags.h" + +#include "paddle/fluid/platform/enforce.h" + +DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, + "Default use 92% of GPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +int GetCUDADeviceCount() { + int count; + PADDLE_ENFORCE( + hipGetDeviceCount(&count), + "hipGetDeviceCount failed in paddle::platform::GetCUDADeviceCount"); + return count; +} + +int GetCUDAComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + hipDeviceProp_t device_prop; + PADDLE_ENFORCE(hipGetDeviceProperties(&device_prop, id), + "hipGetDeviceProperties failed in " + "paddle::platform::GetCUDAComputeCapability"); + return device_prop.major * 10 + device_prop.minor; +} + +int GetCUDAMultiProcessors(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + int count; + PADDLE_ENFORCE( + hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id), + "hipDeviceGetAttribute failed in " + "paddle::platform::GetCUDAMultiProcessors"); + return count; +} + +int GetCUDAMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + int count; + PADDLE_ENFORCE(hipDeviceGetAttribute( + &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id), + "hipDeviceGetAttribute failed in " + "paddle::platform::GetCUDAMaxThreadsPerMultiProcessor"); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE( + hipGetDevice(&device_id), + "hipGetDevice failed in paddle::platform::GetCurrentDeviceId"); + return device_id; +} + +void SetDeviceId(int id) { + // TODO(qijun): find 
a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE(hipSetDevice(id), + "hipSetDevice failed in paddle::platform::SetDeviceId"); +} + +void GpuMemoryUsage(size_t &available, size_t &total) { + PADDLE_ENFORCE(hipMemGetInfo(&available, &total), + "hipMemGetInfo failed in paddle::platform::GetMemoryUsage"); +} + +size_t GpuMaxAllocSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + // Reserve the rest for page tables, etc. + return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); +} + +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" + << total / 1024 / 1024 << "M"; + size_t reserving = static_cast(0.05 * total); + // If available less than minimum chunk size, no usable memory exists. + available = + std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(), + total - reserving); + + // Reserving the rest memory for page tables, etc. + + size_t allocating = static_cast(FLAGS_fraction_of_gpu_memory_to_use * + (total - reserving)); + + PADDLE_ENFORCE_LE(allocating, available); + + return allocating; +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + enum hipMemcpyKind kind, hipStream_t stream) { + PADDLE_ENFORCE(hipMemcpyAsync(dst, src, count, kind, stream), + "hipMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); +} + +void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, + size_t count, hipStream_t stream) { + PADDLE_ENFORCE( + hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), + "hipMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer"); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, hipStream_t stream) { + PADDLE_ENFORCE(hipMemsetAsync(dst, value, count, stream), + "hipMemsetAsync failed in paddle::platform::GpuMemsetAsync"); +} +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h index c0dc92a521764..bb6795109281c 100644 --- a/paddle/fluid/platform/hostdevice.h +++ b/paddle/fluid/platform/hostdevice.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#ifdef __CUDACC__ +#if (defined(__CUDACC__) || defined(__HIPCC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/fluid/platform/miopen_helper.h b/paddle/fluid/platform/miopen_helper.h new file mode 100644 index 0000000000000..94006538b5fec --- /dev/null +++ b/paddle/fluid/platform/miopen_helper.h @@ -0,0 +1,252 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +B +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include
+
+#include "miopen/miopen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/dynload/miopen.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace platform {
+
+#define MIOPEN_ENFORCE(condition)          \
+  do {                                     \
+    miopenStatus_t status = condition;     \
+    if (status != miopenStatusSuccess) {   \
+      PADDLE_THROW("miopen call failed");  \
+    }                                      \
+  } while (false)
+
+enum class DataLayout {  // Not use
+  kNHWC,
+  kNCHW,
+  kNCDHW,
+  kNCHW_VECT_C,
+};
+
+enum class PoolingMode {
+  kMaximum,
+  kAverage,
+};
+
+template <typename T>
+class MIOpenDataType;
+
+template <>
+class MIOpenDataType<float16> {
+ public:
+  static const miopenDataType_t type = miopenHalf;
+  // The scaling param type is float for HALF and FLOAT tensors
+  using ScalingParamType = const float;
+  using BatchNormParamType = float;
+  static ScalingParamType* kOne() {
+    static ScalingParamType v = 1.0;
+    return &v;
+  }
+  static ScalingParamType* kZero() {
+    static ScalingParamType v = 0.0;
+    return &v;
+  }
+};
+
+template <>
+class MIOpenDataType<float> {
+ public:
+  static const miopenDataType_t type = miopenFloat;
+  using ScalingParamType = const float;
+  using BatchNormParamType = float;
+  static ScalingParamType* kOne() {
+    static ScalingParamType v = 1.0;
+    return &v;
+  }
+  static ScalingParamType* kZero() {
+    static ScalingParamType v = 0.0;
+    return &v;
+  }
+};
+
+class ScopedTensorDescriptor {
+ public:
+  ScopedTensorDescriptor() {
+    PADDLE_ENFORCE(dynload::miopenCreateTensorDescriptor(&desc_));
+  }
+  ~ScopedTensorDescriptor() {
+    PADDLE_ENFORCE(dynload::miopenDestroyTensorDescriptor(desc_));
+  }
+
+  inline miopenTensorDescriptor_t descriptor(const miopenDataType_t type,
+                                             const std::vector<int>& dims,
+                                             const int groups = 1) {
+    // the format is not used now, will add later
+    std::vector<int> strides(dims.size());
+    strides[dims.size() - 1] = 1;
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
+    }
+    // Update tensor descriptor dims setting if groups > 1
+    // NOTE: Assume using NCHW or NCDHW order
+    std::vector<int> dims_with_group(dims.begin(), dims.end());  // copy
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
+    if (dims_with_group.size() != 4) {
+      PADDLE_THROW("miopen only supports 4D tensors, dim=%d not allowed",
+                   dims_with_group.size());
+    }
+    PADDLE_ENFORCE(dynload::miopenSet4dTensorDescriptor(
+        desc_, type, dims_with_group[0], dims_with_group[1],
+        dims_with_group[2], dims_with_group[3]));
+    return desc_;
+  }
+
+  template <typename T>
+  inline miopenTensorDescriptor_t descriptor(const DataLayout& order,
+                                             const std::vector<int>& dims,
+                                             const int groups = 1) {
+    return descriptor(MIOpenDataType<T>::type, dims, groups);
+  }
+
+ private:
+  miopenTensorDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor);
+};
+
+class ScopedFilterDescriptor {
+ public:
+  ScopedFilterDescriptor() {
+    PADDLE_ENFORCE(dynload::miopenCreateTensorDescriptor(&desc_));
+  }
+  ~ScopedFilterDescriptor() {
+    PADDLE_ENFORCE(dynload::miopenDestroyTensorDescriptor(desc_));
+  }
+
+  inline miopenTensorDescriptor_t descriptor(const miopenDataType_t type,
+                                             const std::vector<int>& kernel,
+                                             const int groups = 1) {
+    // filter layout: MCHW(MCDHW), where M is the number of
+    // output image channels, C is the number of input image channels,
+    // D is the depth of the filter, H is the height of the filter, and W is the
+    // width of the filter.
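+    // (For grouped convolution, the leading output-channel dimension M is
+    // divided by groups below, so MIOpen is given the per-group filter shape.)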
+    std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
+    if (groups > 1) {
+      kernel_with_group[0] /= groups;
+      // NOTE: input filter(C) of the filter is already asserted to be C/groups.
+    }
+    if (kernel_with_group.size() != 4) {
+      PADDLE_THROW("miopen only supports 4D filters, dim=%d not allowed",
+                   kernel_with_group.size());
+    }
+    PADDLE_ENFORCE(dynload::miopenSet4dTensorDescriptor(
+        desc_, type, kernel_with_group[0], kernel_with_group[1],
+        kernel_with_group[2], kernel_with_group[3]));
+    return desc_;
+  }
+
+  template <typename T>
+  inline miopenTensorDescriptor_t descriptor(const DataLayout& order,
+                                             const std::vector<int>& kernel,
+                                             const int groups = 1) {
+    return descriptor(MIOpenDataType<T>::type, kernel, groups);
+  }
+
+ private:
+  miopenTensorDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor);
+};
+
+class ScopedConvolutionDescriptor {
+ public:
+  ScopedConvolutionDescriptor() {
+    PADDLE_ENFORCE(dynload::miopenCreateConvolutionDescriptor(&desc_));
+  }
+  ~ScopedConvolutionDescriptor() {
+    PADDLE_ENFORCE(dynload::miopenDestroyConvolutionDescriptor(desc_));
+  }
+
+  inline miopenConvolutionDescriptor_t descriptor(
+      miopenDataType_t type, const std::vector<int>& pads,
+      const std::vector<int>& strides, const std::vector<int>& dilations) {
+    PADDLE_ENFORCE_EQ(pads.size(), strides.size());
+    PADDLE_ENFORCE_EQ(pads.size(), dilations.size());
+    if (pads.size() != 2) {
+      PADDLE_THROW("miopen only supports 2D Convolution, dim=%d not allowed",
+                   pads.size());
+    }
+
+    PADDLE_ENFORCE(dynload::miopenInitConvolutionDescriptor(
+        desc_, miopenConvolution, pads[0], pads[1], strides[0], strides[1],
+        dilations[0], dilations[1]));
+    return desc_;
+  }
+
+  template <typename T>
+  inline miopenConvolutionDescriptor_t descriptor(
+      const std::vector<int>& pads, const std::vector<int>& strides,
+      const std::vector<int>& dilations) {
+    return descriptor(MIOpenDataType<T>::type, pads, strides, dilations);
+  }
+
+ private:
+  miopenConvolutionDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
+};
+
+class ScopedPoolingDescriptor {
+ public:
+  ScopedPoolingDescriptor() {
+    PADDLE_ENFORCE(dynload::miopenCreatePoolingDescriptor(&desc_));
+  }
+  ~ScopedPoolingDescriptor() {
+    PADDLE_ENFORCE(dynload::miopenDestroyPoolingDescriptor(desc_));
+  }
+
+  inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode,
+                                              const std::vector<int>& kernel,
+                                              const std::vector<int>& pads,
+                                              const std::vector<int>& strides) {
+    PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
+    PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
+    if (kernel.size() != 2) {
+      PADDLE_THROW("miopen only supports 2D Pooling, dim=%d not allowed",
+                   kernel.size());
+    }
+
+    PADDLE_ENFORCE(dynload::miopenSet2dPoolingDescriptor(
+        desc_, (mode == PoolingMode::kMaximum ?
miopenPoolingMax + : miopenPoolingAverage), + kernel[0], kernel[1], pads[0], pads[1], strides[0], strides[1])); + return desc_; + } + private: + miopenPoolingDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +inline bool CanMIOpenBeUsed(const framework::ExecutionContext& ctx) { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_HIP + if (use_cudnn) { + auto& dev_ctx = ctx.device_context(); + use_cudnn &= dev_ctx.miopen_handle() != nullptr; + } +#endif + return use_cudnn; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index d0bdcb0da5177..360fe9160039a 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -113,7 +113,7 @@ struct PlaceVisitorWrapper } typename Visitor::result_type operator()(const CUDAPlace &cuda) const { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) return visitor_(cuda); #else PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device"); @@ -123,7 +123,7 @@ struct PlaceVisitorWrapper typename Visitor::result_type operator()( const CUDAPinnedPlace &cuda_pinned) const { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) return visitor_(cuda_pinned); #else PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda_pinned"); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index b25206ff35cc8..8eaa86ace7470 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -20,6 +20,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_HIP +#include +#endif // PADDLE_WITH_HIP #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/platform/device_tracer.h" @@ -72,6 +75,16 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id, auto stream = cuda_dev_ctx->stream(); PADDLE_ENFORCE(cudaEventRecord(event_, stream)); } +#endif +#ifdef PADDLE_WITH_HIP + has_cuda_ = dev_ctx ? 
platform::is_gpu_place(dev_ctx->GetPlace()) : false; + if (has_cuda_) { + auto* cuda_dev_ctx = static_cast(dev_ctx); + PADDLE_ENFORCE(hipGetDevice(&device_)); + PADDLE_ENFORCE(hipEventCreate(&event_)); + auto stream = cuda_dev_ctx->stream(); + PADDLE_ENFORCE(hipEventRecord(event_, stream)); + } #endif cpu_ns_ = GetTimeInNsec(); } @@ -101,12 +114,20 @@ double Event::CudaElapsedMs(const Event& e) const { float ms; PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); return ms; +#elif defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE(e.has_cuda() && has_cuda()); + PADDLE_ENFORCE(e.device() == device()); + PADDLE_ENFORCE(hipEventSynchronize(event_)); + PADDLE_ENFORCE(hipEventSynchronize(e.event())); + float ms; + PADDLE_ENFORCE(hipEventElapsedTime(&ms, event_, e.event())); + return ms; #else PADDLE_THROW("CUDA is not enabled"); #endif } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) static void ForEachDevice(std::function func) { auto original_device = GetCurrentDeviceId(); int count = GetCUDADeviceCount(); @@ -205,7 +226,7 @@ void EnableProfiler(ProfilerState state) { g_profiler_place = "All"; GetDeviceTracer()->Enable(); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) if (g_state == ProfilerState::kCUDA) { // Generate some dummy events first to reduce the startup overhead. for (int i = 0; i < 5; i++) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index de9a5cc20d76b..1447ca1245e85 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -42,6 +42,11 @@ class Event { int device() const { return device_; } #endif +#ifdef PADDLE_WITH_HIP + hipEvent_t event() const { return event_; } + int device() const { return device_; } +#endif + double CpuElapsedMs(const Event& e) const; double CudaElapsedMs(const Event& e) const; @@ -55,6 +60,10 @@ class Event { cudaEvent_t event_ = nullptr; int device_ = -1; #endif +#ifdef PADDLE_WITH_HIP + hipEvent_t event_ = nullptr; + int device_ = -1; +#endif }; struct EventList { diff --git a/paddle/fluid/platform/rccl_helper.h b/paddle/fluid/platform/rccl_helper.h new file mode 100644 index 0000000000000..e44203f99b05a --- /dev/null +++ b/paddle/fluid/platform/rccl_helper.h @@ -0,0 +1,137 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
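+//
+// What follows mirrors paddle/fluid/platform/nccl_helper.h with RCCL types:
+// ToNCCLDataType maps float/double/int to rcclFloat/rcclDouble/rcclInt,
+// NCCLGroupGuard wraps collective calls in a global mutex (the
+// rcclGroupStart/End calls are stubbed out here), and NCCLContextMap owns one
+// CUDADeviceContext / rcclComm_t pair per device, initialized through
+// rcclCommInitAll. A rough usage sketch (illustrative only, not part of this
+// patch):
+//
+//   std::vector<platform::Place> places{CUDAPlace(0), CUDAPlace(1)};
+//   platform::NCCLContextMap ctxs(places);
+//   auto* dev_ctx = ctxs.DevCtx(0);  // per-device context and stream
+//   ctxs.WaitAll();                  // wait on every owned stream
+//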
+ +#pragma once + +#include +#include +#include "paddle/fluid/platform/dynload/rccl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +inline rcclDataType_t ToNCCLDataType(std::type_index type) { + if (type == typeid(float)) { // NOLINT + return rcclFloat; + } else if (type == typeid(double)) { // NOLINT + return rcclDouble; + } else if (type == typeid(int)) { // NOLINT + return rcclInt; + } else { + PADDLE_THROW("Not supported"); + } +} + +class NCCLGroupGuard { + public: + inline NCCLGroupGuard() { + mutex().lock(); + //PADDLE_ENFORCE(dynload::rcclGroupStart()); + } + + inline ~NCCLGroupGuard() { + //PADDLE_ENFORCE(dynload::rcclGroupEnd()); + mutex().unlock(); + } + + private: + static std::mutex &mutex() { + static std::mutex mtx; + return mtx; + } +}; + +struct NCCLContext { + std::unique_ptr ctx_; + rcclComm_t comm_; + + explicit NCCLContext(int dev_id) + : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {} + + hipStream_t stream() const { return ctx_->stream(); } + + int device_id() const { + return boost::get(ctx_->GetPlace()).device; + } + + static void InitNCCLContext(std::unordered_map &contexts, + const std::vector &places) { + std::vector comms; + std::vector devs; + comms.resize(contexts.size()); + devs.reserve(contexts.size()); + + for (auto &p : places) { + devs.push_back(boost::get(p).device); + } + + PADDLE_ENFORCE(platform::dynload::rcclCommInitAll( + &comms[0], static_cast(contexts.size()), &devs[0])); + + int i = 0; + for (auto &dev_id : devs) { + contexts.at(dev_id).comm_ = comms[i++]; + } + } +}; + +struct NCCLContextMap { + std::unordered_map contexts_; + std::vector order_; + + NCCLContextMap(const std::vector &places) { + order_.reserve(places.size()); + for (auto &p : places) { + int dev_id = boost::get(p).device; + order_.emplace_back(dev_id); + contexts_.emplace(dev_id, NCCLContext(dev_id)); + } + PADDLE_ENFORCE_EQ( + order_.size(), contexts_.size(), + "RCCL Context Map does not support contain two or more same device"); + + std::vector comms; + comms.resize(order_.size()); + + PADDLE_ENFORCE(platform::dynload::rcclCommInitAll( + &comms[0], static_cast(order_.size()), &order_[0])); + + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } + } + + CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + + CUDADeviceContext *DevCtx(platform::Place p) const { + return DevCtx(boost::get(p).device); + } + + const NCCLContext &at(platform::Place p) const { + return this->at(boost::get(p).device); + } + + const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } + + void WaitAll() { + for (auto &p : contexts_) { + p.second.ctx_->Wait(); + } + } +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 917c48b47f8d7..c530e2299ef63 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -21,7 +21,8 @@ limitations under the License. 
*/ #include #include -#ifdef __NVCC__ +#ifdef __HIPCC__ +#include #include #include #include "paddle/fluid/platform/details/device_ptr_cast.h" @@ -61,7 +62,7 @@ struct Transform { } }; -#ifdef __NVCC__ +#ifdef __HIPCC__ template <> struct Transform { template diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index ada69ea4a425f..2a29a7f18e457 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,9 +2,9 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + DEPS ARCHIVE_START pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method parallel_executor - ${GLOB_OP_LIB}) + ${GLOB_OP_LIB} ARCHIVE_END) else() cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b0a3f06a8871b..3ce96281724fe 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -43,7 +43,7 @@ limitations under the License. */ #include "paddle/fluid/string/to_string.h" -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/platform/cuda_profiler.h" #include "paddle/fluid/platform/gpu_info.h" @@ -55,7 +55,7 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); namespace paddle { namespace pybind { bool IsCompiledWithCUDA() { -#ifndef PADDLE_WITH_CUDA +#if !(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) return false; #else return true; @@ -106,7 +106,7 @@ PYBIND11_PLUGIN(core) { .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) @@ -163,7 +163,7 @@ PYBIND11_PLUGIN(core) { .def("height", &SelectedRows::height) .def("set_rows", [](SelectedRows &self, std::vector rows) { -#ifndef PADDLE_WITH_CUDA +#if !(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) self.set_rows(rows); #else Vector new_rows(rows); @@ -171,7 +171,7 @@ PYBIND11_PLUGIN(core) { #endif }) .def("rows", [](SelectedRows &self) { -#ifndef PADDLE_WITH_CUDA +#if !(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) return self.rows(); #else auto rows = self.rows(); @@ -213,7 +213,7 @@ All parameter, weight, gradient are variables in Paddle. .def("get_lod_tensor_array", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) .def("get_communicator", [](Variable &self) -> platform::Communicator * { return self.GetMutable(); @@ -312,14 +312,14 @@ All parameter, weight, gradient are variables in Paddle. 
.def_static("create", [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { -#ifndef PADDLE_WITH_CUDA +#if !(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) PADDLE_THROW("CUDAPlace is not supported in CPU device."); #else return new paddle::platform::CUDADeviceContext(place); #endif }); // clang-format on -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") @@ -431,6 +431,10 @@ All parameter, weight, gradient are variables in Paddle. // Only GPUs with Compute Capability >= 53 support float16 return platform::GetCUDAComputeCapability(place.device) >= 53; }); +#elif defined(PADDLE_WITH_HIP) + m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { + return false; + }); #endif m.def("set_feed_variable", framework::SetFeedVariable); @@ -469,7 +473,7 @@ All parameter, weight, gradient are variables in Paddle. }); m.def("op_support_gpu", OpSupportGPU); -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) m.def("get_cuda_device_count", platform::GetCUDADeviceCount); m.def("nvprof_init", platform::CudaProfilerInit); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 6f8c597f8e610..4ddac956cd98b 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -59,7 +59,7 @@ struct CastToPyBufferImpl { } framework::Tensor dst_tensor; if (paddle::platform::is_gpu_place(tensor.place())) { -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) auto *src_ptr = static_cast(tensor.data()); auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace())); @@ -71,7 +71,7 @@ struct CastToPyBufferImpl { paddle::platform::GpuMemcpyAsync( dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), - cudaMemcpyDeviceToHost, dev_ctx->stream()); + hipMemcpyDeviceToHost, dev_ctx->stream()); dev_ctx->Wait(); #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); @@ -165,7 +165,7 @@ void PyCPUTensorSetFromArray( std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size()); } -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) template void PyCUDATensorSetFromArray( framework::Tensor &self, @@ -184,7 +184,7 @@ void PyCUDATensorSetFromArray( auto dev_ctx = static_cast(pool.Get(place)); paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), - cudaMemcpyHostToDevice, dev_ctx->stream()); + hipMemcpyHostToDevice, dev_ctx->stream()); } template <> @@ -206,7 +206,7 @@ void PyCUDATensorSetFromArray( static_cast(pool.Get(place)); paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(uint16_t) * array.size(), - cudaMemcpyHostToDevice, dev_ctx->stream()); + hipMemcpyHostToDevice, dev_ctx->stream()); } #endif diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 4885b74e6c664..0612361b8b267 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -50,7 +50,7 @@ function cmake_gen() { -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} - -DWITH_FAST_BUNDLE_TEST=ON + -DWITH_FAST_BUNDLE_TEST=OFF -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} @@ -77,10 +77,11 @@ EOF -DCUDNN_ROOT=/usr/ \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \ - 
-DWITH_FAST_BUNDLE_TEST=ON \ + -DWITH_FAST_BUNDLE_TEST=OFF \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DCMAKE_CXX_FLAGS_RELEASE="-O0 -DNDEBUG" } function run_build() { @@ -231,7 +232,7 @@ function gen_fluid_inference_lib() { Deploying fluid inference library ... ======================================== EOF - make inference_lib_dist + #make inference_lib_dist fi } diff --git a/paddle/scripts/docker/dbuild.sh b/paddle/scripts/docker/dbuild.sh new file mode 100755 index 0000000000000..61a07a336183b --- /dev/null +++ b/paddle/scripts/docker/dbuild.sh @@ -0,0 +1,253 @@ +#!/bin/bash + +function cmake_gen() { + mkdir -p /paddle/dbuild + cd /paddle/dbuild + + # build script will not fail if *.deb does not exist + rm *.deb 2>/dev/null || true + # delete previous built whl packages + rm -rf /paddle/paddle/dist 2>/dev/null || true + + # Support build for all python versions, currently + # including cp27-cp27m and cp27-cp27mu. + PYTHON_FLAGS="" + if [ "$1" != "" ]; then + echo "using python abi: $1" + if [ "$1" == "cp27-cp27m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} + export PATH=/opt/python/cp27-cp27m/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python + -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" + elif [ "$1" == "cp27-cp27mu" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} + export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python + -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + fi + fi + + cat < /paddle/dbuild/Dockerfile < + ENV HOME /root +EOF + + if [[ ${WITH_GPU} == "ON" ]]; then + NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&" + else + NCCL_DEPS="" + fi + + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then + PADDLE_VERSION="paddle version" + CMD='"paddle", "version"' + else + PADDLE_VERSION="true" + CMD='"true"' + fi + + cat >> /paddle/dbuild/Dockerfile <
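
The CUDA-to-HIP kernel-launch rewrite used throughout this patch (for_range.h above, and the operator ports generally) is mechanical: the values that lived inside the triple-chevron launch configuration become the leading parameters of hipLaunchKernelGGL, and the kernel arguments follow. A minimal standalone sketch of that pattern, assuming only the HIP runtime (the scale kernel, sizes, and null stream below are illustrative, not part of this patch):

#include <hip/hip_runtime.h>

__global__ void scale(int n, float a, float* x) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

int main() {
  const int n = 1 << 10;
  float* d_x = nullptr;
  hipMalloc(&d_x, n * sizeof(float));
  hipMemset(d_x, 0, n * sizeof(float));

  int block = 256;
  int grid = (n + block - 1) / block;
  // CUDA form:  scale<<<grid, block, 0, stream>>>(n, 2.0f, d_x);
  // HIP form:   grid/block/shared-mem/stream come first, kernel args follow.
  hipLaunchKernelGGL(scale, dim3(grid), dim3(block), 0, 0, n, 2.0f, d_x);

  hipDeviceSynchronize();
  hipFree(d_x);
  return 0;
}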