Skip to content

Commit

Permalink
[ROCM] update fluid operators for rocm (part1), test=develop (#31077)
Browse files Browse the repository at this point in the history
  • Loading branch information
qili93 authored Feb 23, 2021
1 parent 99fd981 commit cced930
Show file tree
Hide file tree
Showing 19 changed files with 142 additions and 36 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/operators/controlflow/conditional_block_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class ConditionalOp : public framework::OperatorBase {
ips[0]->numel()));
bool res = false;
if (platform::is_gpu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
framework::LoDTensor cpu_tensor;
framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/operators/controlflow/get_places_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/gpu_info.h"
#endif

namespace paddle {
namespace operators {

static size_t CUDADevCount() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return platform::GetCUDADeviceCount();
#else
return 0UL;
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/controlflow/while_op_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ bool GetCondData(const framework::LoDTensor &cond) {
}
// when platform::is_gpu_place(cond.place()) is true
std::unique_ptr<framework::LoDTensor> cpu_cond{new framework::LoDTensor()};
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
Expand Down
8 changes: 5 additions & 3 deletions paddle/fluid/operators/detection/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,12 @@ detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc bo
detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu)
detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)

if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
set(TMPDEPS memory)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
set(TMPDEPS memory cub)
if(WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
set(TMPDEPS memory cub)
endif()
endif()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc generate_proposals_v2_op.cu DEPS ${TMPDEPS})
Expand Down
21 changes: 19 additions & 2 deletions paddle/fluid/operators/detection/bbox_util.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,16 @@ limitations under the License. */
#include <cfloat>
#include <string>
#include <vector>
#ifdef __NVCC__
#include "cub/cub.cuh"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
#include "paddle/fluid/platform/miopen_helper.h"
#endif
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/for_range.h"

namespace paddle {
Expand Down Expand Up @@ -58,16 +64,27 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,

// Determine temporary device storage requirements
size_t temp_storage_bytes = 0;
#ifdef PADDLE_WITH_HIP
hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
#else
cub::DeviceRadixSort::SortPairsDescending<T, int>(
nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
#endif
// Allocate temporary storage
auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);

// Run sorting operation
// Run sorting operation
#ifdef PADDLE_WITH_HIP
hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
idx_out, num);
#else
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
idx_out, num);
#endif
}

template <typename T>
Expand Down
40 changes: 35 additions & 5 deletions paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <paddle/fluid/memory/allocation/allocator.h>
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
#endif

#include <paddle/fluid/memory/allocation/allocator.h>
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
Expand Down Expand Up @@ -135,17 +141,29 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {

// Determine temporary device storage requirements
size_t temp_storage_bytes = 0;
#ifdef PADDLE_WITH_HIP
hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
idx_out, total_roi_num);
#else
cub::DeviceRadixSort::SortPairsDescending<T, int>(
nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
idx_out, total_roi_num);
#endif
// Allocate temporary storage
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);

// Run sorting operation
// sort score to get corresponding index
// Run sorting operation
// sort score to get corresponding index
#ifdef PADDLE_WITH_HIP
hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
keys_out, idx_in, idx_out, total_roi_num);
#else
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
keys_out, idx_in, idx_out, total_roi_num);
#endif
index_out_t.Resize({real_post_num});
Tensor sorted_rois;
sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
Expand All @@ -167,17 +185,29 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
out_id_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
// Determine temporary device storage requirements
temp_storage_bytes = 0;
#ifdef PADDLE_WITH_HIP
hipcub::DeviceRadixSort::SortPairs<int, int>(
nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
batch_idx_in, index_out_t.data<int>(), real_post_num);
#else
cub::DeviceRadixSort::SortPairs<int, int>(
nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
batch_idx_in, index_out_t.data<int>(), real_post_num);
#endif
// Allocate temporary storage
d_temp_storage = memory::Alloc(place, temp_storage_bytes);

// Run sorting operation
// sort batch_id to get corresponding index
// Run sorting operation
// sort batch_id to get corresponding index
#ifdef PADDLE_WITH_HIP
hipcub::DeviceRadixSort::SortPairs<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
#else
cub::DeviceRadixSort::SortPairs<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
#endif

GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);

Expand Down
32 changes: 28 additions & 4 deletions paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <paddle/fluid/memory/allocation/allocator.h>
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
#endif

#include <paddle/fluid/memory/allocation/allocator.h>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
Expand Down Expand Up @@ -143,24 +149,42 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {

// Determine temporary device storage requirements
size_t temp_storage_bytes = 0;
#ifdef PADDLE_WITH_HIP
hipcub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
target_lvls_data, keys_out,
idx_in, idx_out, roi_num);
#else
cub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
target_lvls_data, keys_out,
idx_in, idx_out, roi_num);
#endif
// Allocate temporary storage
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);

// Run sorting operation
// sort target level to get corresponding index
// Run sorting operation
// sort target level to get corresponding index
#ifdef PADDLE_WITH_HIP
hipcub::DeviceRadixSort::SortPairs<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
idx_in, idx_out, roi_num);
#else
cub::DeviceRadixSort::SortPairs<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
idx_in, idx_out, roi_num);
#endif

int* restore_idx_data =
restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
// sort current index to get restore index
// sort current index to get restore index
#ifdef PADDLE_WITH_HIP
hipcub::DeviceRadixSort::SortPairs<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
restore_idx_data, roi_num);
#else
cub::DeviceRadixSort::SortPairs<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
restore_idx_data, roi_num);
#endif

int start = 0;
auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
Expand Down
1 change: 0 additions & 1 deletion paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cub/cub.cuh"
#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/platform/cuda_primitives.h"
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/operators/detection/target_assign_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ class TargetAssignKernel : public framework::OpKernel<T> {
int64_t k = x->dims()[2];

auto x_lod = x->lod().back();
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
#else
size_t* x_lod_data = x_lod.data();
Expand All @@ -129,7 +129,7 @@ class TargetAssignKernel : public framework::OpKernel<T> {
"TargetAssignOp input(NegIndices) needs 1 level of LoD"));
const int* neg_idx_data = neg_indices->data<int>();
auto neg_lod = neg_indices->lod().back();
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
#else
size_t* neg_lod_data = neg_lod.data();
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/distributed/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory)
cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator)
cc_test(communicator_test SRCS communicator_test.cc DEPS communicator)
if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_test(collective_server_test SRCS collective_server_test.cc
DEPS sendrecvop_rpc executor ${RPC_DEPS}
selected_rows_functor scope math_function)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_NCCL
#include <nccl.h>
#endif
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>
#endif
#include <sys/time.h>
#include <limits>
#include <memory>
Expand Down Expand Up @@ -144,7 +147,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
} else if (var->IsType<framework::SelectedRows>()) {
request->set_type(::sendrecv::SELECTED_ROWS);
payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request)));
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
} else if (var->IsType<ncclUniqueId>()) {
request->set_type(::sendrecv::NCCL_ID);
const ncclUniqueId& uid = var->Get<ncclUniqueId>();
Expand Down Expand Up @@ -172,7 +175,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
static_cast<const char*>(payload->ptr()), payload->memory_size());
} else {
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
IOBufWriter::AppendZeroCopy(
name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
static_cast<const char*>(payload->ptr()), payload->memory_size(),
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ void RunTestLodTensor(platform::Place place) {
TEST(LodTensor, Run) {
platform::CPUPlace place;
RunTestLodTensor(place);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace gpu(0);
RunTestLodTensor(gpu);
#endif
Expand All @@ -168,7 +168,7 @@ TEST(LodTensor, Run) {
TEST(SelectedRows, Run) {
platform::CPUPlace place;
RunSerdeTestSelectedRows(place);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace gpu;
RunSerdeTestSelectedRows(gpu);
#endif
Expand Down
7 changes: 5 additions & 2 deletions paddle/fluid/operators/distributed/grpc/grpc_serde.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_NCCL
#include <nccl.h>
#endif
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>
#endif
#include <limits>
#include <memory>
#include "grpcpp/impl/codegen/byte_buffer.h"
Expand Down Expand Up @@ -75,7 +78,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
} else if (var->IsType<framework::SelectedRows>()) {
request.set_type(::sendrecv::SELECTED_ROWS);
payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request));
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
} else if (var->IsType<ncclUniqueId>()) {
request.set_type(::sendrecv::NCCL_ID);
#endif
Expand All @@ -91,7 +94,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
e.WriteRawBytes(std::string(header.data(), header.size()));
// NCCLID is copied directly to the message, return bytebuffer
// with only one slice if serializing NCCLID.
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (var->IsType<ncclUniqueId>()) {
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
NCCL_UNIQUE_ID_BYTES);
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ TEST(LodTensor, Run) {
platform::CPUPlace place;
RunTestLodTensor(place);
RunTestLodTensor(place, 1);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace gpu(0);
RunTestLodTensor(gpu);
RunTestLodTensor(gpu, 1);
Expand All @@ -217,7 +217,7 @@ TEST(SelectedRows, Run) {
platform::CPUPlace place;
RunSerdeTestSelectedRows(place);

#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace gpu;
RunSerdeTestSelectedRows(gpu);
#endif
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/distributed/parameter_prefetch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ void prefetchs(const std::vector<std::string> &id_var_names,
}
}
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<float> ids_value_vec(ids_size * vec_dim_1);
for (auto idx = 0; idx < static_cast<int>(ids_size); idx++) {
const auto &id = ids[idx];
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/distributed/sendrecvop_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ using VarMsg = sendrecv::VariableMessage;
static TensorPayload GetCommunicationAllocationFromTensor(
const platform::DeviceContext& ctx, const framework::Tensor& tensor) {
if (is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
is_gpu_place(tensor.place()), true,
platform::errors::PreconditionNotMet("Please run in gpu place."));
Expand Down
Loading

0 comments on commit cced930

Please sign in to comment.