Review conv2d 1 #18
Open. Wants to merge 34 commits into base: develop.

Commits (34)
6fb2069
Add HIP support for fluid/memory.
sabreshao Apr 9, 2018
7977142
Add HIP support to fluid/framework.
sabreshao Apr 9, 2018
fa63850
Add HIP support for fluid/platform and fluid/pybind.
sabreshao Apr 9, 2018
0906b86
[Don't merge!] Use existing .cu as hip code in operators.
sabreshao Apr 10, 2018
cec5084
[Don't merge!] Remove inference from cmake.
sabreshao Apr 10, 2018
d159c1f
[Don't merge!] Add dbuild.sh.
sabreshao Apr 9, 2018
1f15364
Add HIP support to fluid/operator.
sabreshao Apr 9, 2018
a451b97
[Don't merge!] Remove inference in build shell.
sabreshao Apr 10, 2018
ad437d6
Add HIP support to fluid/operator.
sabreshao Apr 12, 2018
6b26a99
Add HIP support to fluid/operator.
sabreshao Apr 18, 2018
6707f25
Add HIP support to fluid/operator.
sabreshao Apr 18, 2018
f20d157
Fix HIP test build.
sabreshao Apr 19, 2018
e6970d1
Fix HIP test build.
sabreshao Apr 18, 2018
dd442b9
[Don't merge!] Disable fast bundle test.
sabreshao Apr 18, 2018
2c63578
HIP cmake.
sabreshao Apr 15, 2018
597039b
[Don't merge!]
sabreshao Apr 23, 2018
44b2a9a
Fix operators build.
sabreshao Apr 23, 2018
3405871
Rollback to C++11.
sabreshao May 4, 2018
87b629e
[Don't merge!] Release build stick on O0.
sabreshao May 6, 2018
789a415
Enable NCCL op. rcclReduce isn't implemented, so use AllReduce instead.
sabreshao May 9, 2018
f695568
From Jimmy: fix ops with cast parameters.
sabreshao May 14, 2018
4b60e22
From Jimmy: fix prior_box_op.
sabreshao May 14, 2018
e1fe453
Draft of MIOpen optimization.
sabreshao May 25, 2018
1e81907
"push t file"
dzhwinter Jun 5, 2018
d77bba7
"repick"
dzhwinter Jun 5, 2018
60bcd53
"some fix"
dzhwinter Jun 5, 2018
bfc4ac1
"i"
dzhwinter Jun 5, 2018
04b90d1
"d"
dzhwinter Jun 5, 2018
396fed0
"add some details"
dzhwinter Jun 21, 2018
7513946
test
dzhwinter Jun 21, 2018
97cacad
"fix ci"
dzhwinter Jun 21, 2018
11f6155
"pick some commits"
dzhwinter Jun 21, 2018
05b51eb
"another patch"
dzhwinter Jun 21, 2018
e706408
"clean pr"
dzhwinter Jun 21, 2018
8 changes: 0 additions & 8 deletions .pre-commit-config.yaml
@@ -26,14 +26,6 @@ repos:
entry: bash ./.clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
-- repo: local
-hooks:
-- id: cpplint-cpp-source
-name: cpplint
-description: Check C++ code style using cpplint.py.
-entry: bash ./tools/codestyle/cpplint_pre_commit.hook
-language: system
-files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
2 changes: 1 addition & 1 deletion cmake/external/eigen.cmake
@@ -9,7 +9,7 @@ if(WITH_AMD_GPU)
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
-GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+GIT_TAG e1c9e50333361eb826a2b35bda5d08c55dfbf16e
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
10 changes: 7 additions & 3 deletions cmake/generic.cmake
@@ -338,9 +338,13 @@ function(hip_library TARGET_NAME)
target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
find_fluid_modules(${TARGET_NAME})
endif()
-if (hip_library_DEPS)
-add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
-target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
+# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
+else()
+target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${hip_library_SRCS})
2 changes: 1 addition & 1 deletion cmake/hip.cmake
@@ -11,7 +11,7 @@ include_directories("/opt/rocm/thrust")

list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")

-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )

if(WITH_DSO)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
2 changes: 1 addition & 1 deletion paddle/fluid/CMakeLists.txt
@@ -3,6 +3,6 @@ add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
-add_subdirectory(inference)
+#add_subdirectory(inference)
add_subdirectory(string)
add_subdirectory(recordio)
25 changes: 15 additions & 10 deletions paddle/fluid/framework/CMakeLists.txt
@@ -8,6 +8,8 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)

if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+elseif(WITH_AMD_GPU)
+hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
endif()
@@ -23,7 +25,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+hip_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)

cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
@@ -43,6 +45,9 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+elseif(WITH_AMD_GPU)
+hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
@@ -55,19 +60,19 @@ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)

cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
-cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
+hip_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
+hip_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)

cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
-nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)

py_proto_compile(framework_py_proto SRCS framework.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
@@ -80,7 +85,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
+hip_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)

cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
@@ -92,11 +97,11 @@ framework_proto backward glog lod_rank_table feed_fetch_method)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)

cc_library(prune SRCS prune.cc DEPS framework_proto)
-cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
-cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
+hip_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
+hip_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
-cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
+hip_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)

cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
cc_test(init_test SRCS init_test.cc DEPS init)
Expand All @@ -105,7 +110,7 @@ cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_contex
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)

cc_test(channel_test SRCS channel_test.cc)
-cc_test(tuple_test SRCS tuple_test.cc )
-cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
+hip_test(tuple_test SRCS tuple_test.cc )
+hip_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
conditional_block_op while_op assign_op print_op executor proto_desc)
2 changes: 1 addition & 1 deletion paddle/fluid/framework/data_type_transform.cc
@@ -47,7 +47,7 @@ struct CastDataType {
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
+#ifdef __HIPCC__
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
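The __NVCC__ to __HIPCC__ swap above works because each GPU compiler predefines its own identifying macro: nvcc defines __NVCC__ while hipcc defines __HIPCC__, so a .cu file reused as HIP source must guard its device branch with the latter. A minimal standalone sketch of that compile-time dispatch pattern (illustration only, not Paddle code):

#include <cstdio>

// The branch is chosen at compile time by whichever compiler builds this
// translation unit; a plain host compiler defines neither macro.
void ReportCompilePath() {
#if defined(__HIPCC__)
  std::printf("built by hipcc: take the HIP/ROCm device path\n");
#elif defined(__NVCC__)
  std::printf("built by nvcc: take the CUDA device path\n");
#else
  std::printf("host-only build: take the CPU path\n");
#endif
}

int main() {
  ReportCompilePath();
  return 0;
}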
4 changes: 4 additions & 0 deletions paddle/fluid/framework/ddim.h
@@ -17,7 +17,11 @@ limitations under the License. */
#include <initializer_list>
#include <stdexcept>
#include <vector>
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/framework/dim_hip.h"
+#else
#include "paddle/fluid/framework/dim.h"
+#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/variant.h"

6 changes: 6 additions & 0 deletions paddle/fluid/framework/details/CMakeLists.txt
@@ -4,6 +4,8 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda)
+hip_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+dynload_hip)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)

cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
@@ -12,7 +14,11 @@ cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
if(WITH_GPU)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
else()
+if(WITH_AMD_GPU)
+set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+else()
set(multi_devices_graph_builder_deps)
+endif()
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/fetch_op_handle.cc
@@ -59,7 +59,7 @@ void FetchOpHandle::RunImpl() {
auto &scope = scopes[i];
auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
if (platform::is_gpu_place(var->place_)) {
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
dev_ctxes_[t.place()]->Wait();
#endif
8 changes: 4 additions & 4 deletions paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/scope.h"

-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif

@@ -28,7 +28,7 @@ namespace paddle {
namespace framework {
namespace details {

-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
@@ -97,7 +97,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
// Insert ScaleCost OpHandle
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
#else
auto *communication_dev_ctx =
@@ -135,7 +135,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
og_has_been_broadcast.count(og) == 0) { // is param grad
// Insert NCCL AllReduce Op
og_has_been_broadcast.insert(og);
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result.ops_.back().get();
4 changes: 2 additions & 2 deletions paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -26,7 +26,7 @@ class Scope;
namespace details {
class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
public:
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &params,
@@ -47,7 +47,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::vector<Scope *> &local_scopes_;
std::unordered_set<std::string> grad_names_;

-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
platform::NCCLContextMap *nccl_ctxs_;
#endif
};
4 changes: 2 additions & 2 deletions paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -63,8 +63,8 @@ void NCCLAllReduceOpHandle::RunImpl() {
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
-PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
+PADDLE_ENFORCE(platform::dynload::rcclAllReduce(
+buffer, buffer, numel, static_cast<rcclDataType_t>(dtype), rcclSum,
comm, stream));
});
}
4 changes: 4 additions & 0 deletions paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -20,7 +20,11 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/rccl_helper.h"
+#else
#include "paddle/fluid/platform/nccl_helper.h"
+#endif

namespace paddle {
namespace framework {
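rccl_helper.h is added by this PR but its contents are not part of this diff. Judging from the renames visible in nccl_all_reduce_op_handle.cc above (ncclAllReduce to rcclAllReduce, ncclDataType_t to rcclDataType_t, ncclSum to rcclSum), a helper of this kind plausibly provides an alias layer along the lines of the hedged sketch below; the <rccl.h> header name and the rcclComm_t typedef are assumptions, not verified against the PR's tree.

#pragma once
#ifdef PADDLE_WITH_HIP
#include <rccl.h>  // assumed header name for AMD's RCCL port of NCCL

// Map the NCCL spellings used by shared code onto their RCCL counterparts
// so the same sources compile on the HIP backend. rcclAllReduce,
// rcclDataType_t, and rcclSum appear verbatim in the hunk above;
// rcclComm_t is an assumed analog of ncclComm_t.
#define ncclAllReduce rcclAllReduce
#define ncclSum rcclSum
typedef rcclDataType_t ncclDataType_t;
typedef rcclComm_t ncclComm_t;
#endif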
34 changes: 34 additions & 0 deletions paddle/fluid/framework/details/op_handle_base.cc
@@ -36,6 +36,10 @@ OpHandleBase::~OpHandleBase() {
for (auto &ev : events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second));
}
+#elif defined(PADDLE_WITH_HIP)
+for (auto &ev : events_) {
+PADDLE_ENFORCE(hipEventDestroy(ev.second));
+}
#endif
}

@@ -49,6 +53,15 @@ void OpHandleBase::Run(bool use_event) {
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
}
}
+#elif defined(PADDLE_WITH_HIP)
+if (events_.empty() && use_event) {
+for (auto &p : dev_ctxes_) {
+int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
+PADDLE_ENFORCE(hipSetDevice(dev_id));
+PADDLE_ENFORCE(
+hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming));
+}
+}
#else
PADDLE_ENFORCE(!use_event);
#endif
@@ -64,6 +77,15 @@ void OpHandleBase::Run(bool use_event) {
PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
}
}
+#elif defined(PADDLE_WITH_HIP)
+if (use_event) {
+for (auto &p : dev_ctxes_) {
+int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
+auto stream =
+static_cast<platform::CUDADeviceContext *>(p.second)->stream();
+PADDLE_ENFORCE(hipEventRecord(events_.at(dev_id), stream));
+}
+}
#endif
}

@@ -80,6 +102,18 @@ void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
}
}
+#elif defined(PADDLE_WITH_HIP)
+if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) {
+for (auto &dev_ctx : dev_ctxes_) {
+dev_ctx.second->Wait();
+}
+} else {
+auto stream =
+static_cast<platform::CUDADeviceContext *>(waited_dev)->stream();
+for (auto &ev : events_) {
+PADDLE_ENFORCE(hipStreamWaitEvent(stream, ev.second, 0));
+}
+}
#else
for (auto &dev_ctx : dev_ctxes_) {
dev_ctx.second->Wait();
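The HIP branches added to op_handle_base.cc mirror the CUDA event API one-for-one (hipEventCreateWithFlags, hipEventRecord, hipStreamWaitEvent, hipEventDestroy). A self-contained sketch of that event lifecycle, assuming a working ROCm install; error handling is simplified relative to PADDLE_ENFORCE:

#include <hip/hip_runtime.h>
#include <cstdio>

#define HIP_CHECK(expr)                                        \
  do {                                                         \
    hipError_t err_ = (expr);                                  \
    if (err_ != hipSuccess) {                                  \
      std::printf("HIP error: %s\n", hipGetErrorString(err_)); \
      return 1;                                                \
    }                                                          \
  } while (0)

int main() {
  HIP_CHECK(hipSetDevice(0));
  hipStream_t producer, consumer;
  HIP_CHECK(hipStreamCreate(&producer));
  HIP_CHECK(hipStreamCreate(&consumer));

  // hipEventDisableTiming makes the event cheaper to record, exactly as
  // OpHandleBase::Run does when it creates one event per device.
  hipEvent_t ev;
  HIP_CHECK(hipEventCreateWithFlags(&ev, hipEventDisableTiming));

  // Record on the producing stream, then make the consuming stream wait on
  // the event: the cross-stream ordering OpHandleBase::Wait establishes.
  HIP_CHECK(hipEventRecord(ev, producer));
  HIP_CHECK(hipStreamWaitEvent(consumer, ev, 0));

  HIP_CHECK(hipStreamSynchronize(consumer));
  HIP_CHECK(hipEventDestroy(ev));
  HIP_CHECK(hipStreamDestroy(producer));
  HIP_CHECK(hipStreamDestroy(consumer));
  std::printf("event round trip ok\n");
  return 0;
}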
2 changes: 2 additions & 0 deletions paddle/fluid/framework/details/op_handle_base.h
@@ -37,6 +37,8 @@ class OpHandleBase {

#ifdef PADDLE_WITH_CUDA
std::unordered_map<int, cudaEvent_t> events_;
+#elif defined(PADDLE_WITH_HIP)
+std::unordered_map<int, hipEvent_t> events_;
#endif

OpHandleBase() {}
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -36,7 +36,7 @@ void ScaleLossGradOpHandle::RunImpl() {
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;
} else {
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
auto stream =
static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
->stream();