Review conv2d 1 #18
Open. Wants to merge 34 commits into base: develop.

Commits (34)
6fb2069
Add HIP support for fluid/memory.
sabreshao Apr 9, 2018
7977142
Add HIP support to fluid/framework.
sabreshao Apr 9, 2018
fa63850
Add HIP support for fluid/platform and fluid/pybind.
sabreshao Apr 9, 2018
0906b86
[Don't merge!] Use existing .cu as hip code in operators.
sabreshao Apr 10, 2018
cec5084
[Don't merge!] Remove inference from cmake.
sabreshao Apr 10, 2018
d159c1f
[Don't merge!] Add dbuild.sh.
sabreshao Apr 9, 2018
1f15364
Add HIP support to fluid/operator.
sabreshao Apr 9, 2018
a451b97
[Don't merge!] Remove inference in build shell.
sabreshao Apr 10, 2018
ad437d6
Add HIP support to fluid/operator.
sabreshao Apr 12, 2018
6b26a99
Add HIP support to fluid/operator.
sabreshao Apr 18, 2018
6707f25
Add HIP support to fluid/operator.
sabreshao Apr 18, 2018
f20d157
Fix HIP test build.
sabreshao Apr 19, 2018
e6970d1
Fix HIP test build.
sabreshao Apr 18, 2018
dd442b9
[Don't merge!] Disable fast bundle test.
sabreshao Apr 18, 2018
2c63578
HIP cmake.
sabreshao Apr 15, 2018
597039b
[Don't merge!]
sabreshao Apr 23, 2018
44b2a9a
Fix operators build.
sabreshao Apr 23, 2018
3405871
Rollback to C++11.
sabreshao May 4, 2018
87b629e
[Don't merge!] Release build stick on O0.
sabreshao May 6, 2018
789a415
Enable NCCL op. rcclReduce isn't implemented, so use AllReduce instead.
sabreshao May 9, 2018
f695568
From Jimmy: fix ops with cast parameters.
sabreshao May 14, 2018
4b60e22
From Jimmy: fix prior_box_op.
sabreshao May 14, 2018
e1fe453
Draft of MIOpen optimization.
sabreshao May 25, 2018
1e81907
"push t file"
dzhwinter Jun 5, 2018
d77bba7
"repick"
dzhwinter Jun 5, 2018
60bcd53
"some fix"
dzhwinter Jun 5, 2018
bfc4ac1
"i"
dzhwinter Jun 5, 2018
04b90d1
"d"
dzhwinter Jun 5, 2018
396fed0
"add some details"
dzhwinter Jun 21, 2018
7513946
test
dzhwinter Jun 21, 2018
97cacad
"fix ci"
dzhwinter Jun 21, 2018
11f6155
"pick some commits"
dzhwinter Jun 21, 2018
05b51eb
"another patch"
dzhwinter Jun 21, 2018
e706408
"clean pr"
dzhwinter Jun 21, 2018
8 changes: 0 additions & 8 deletions .pre-commit-config.yaml
@@ -26,14 +26,6 @@ repos:
entry: bash ./.clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
-- repo: local
-hooks:
-- id: cpplint-cpp-source
-name: cpplint
-description: Check C++ code style using cpplint.py.
-entry: bash ./tools/codestyle/cpplint_pre_commit.hook
-language: system
-files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
2 changes: 1 addition & 1 deletion cmake/external/eigen.cmake
@@ -9,7 +9,7 @@ if(WITH_AMD_GPU)
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
-GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+GIT_TAG e1c9e50333361eb826a2b35bda5d08c55dfbf16e
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
10 changes: 7 additions & 3 deletions cmake/generic.cmake
@@ -338,9 +338,13 @@ function(hip_library TARGET_NAME)
target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
find_fluid_modules(${TARGET_NAME})
endif()
-if (hip_library_DEPS)
-add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
-target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
+# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
+else()
+target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${hip_library_SRCS})
2 changes: 1 addition & 1 deletion cmake/hip.cmake
@@ -11,7 +11,7 @@ include_directories("/opt/rocm/thrust")

list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")

-set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )

if(WITH_DSO)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
2 changes: 1 addition & 1 deletion paddle/fluid/CMakeLists.txt
@@ -3,6 +3,6 @@ add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
-add_subdirectory(inference)
+#add_subdirectory(inference)
add_subdirectory(string)
add_subdirectory(recordio)
25 changes: 15 additions & 10 deletions paddle/fluid/framework/CMakeLists.txt
@@ -8,6 +8,8 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)

if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+elseif(WITH_AMD_GPU)
+hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
endif()
@@ -23,7 +25,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+hip_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)

cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
@@ -43,6 +45,9 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+elseif(WITH_AMD_GPU)
+hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
@@ -55,19 +60,19 @@ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)

cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
-cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
+hip_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
+hip_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)

cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
-nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)

py_proto_compile(framework_py_proto SRCS framework.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
@@ -80,7 +85,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
+hip_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)

cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
@@ -92,11 +97,11 @@ framework_proto backward glog lod_rank_table feed_fetch_method)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)

cc_library(prune SRCS prune.cc DEPS framework_proto)
-cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
-cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
+hip_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
+hip_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
-cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
+hip_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)

cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
cc_test(init_test SRCS init_test.cc DEPS init)
Expand All @@ -105,7 +110,7 @@ cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_contex
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)

cc_test(channel_test SRCS channel_test.cc)
-cc_test(tuple_test SRCS tuple_test.cc )
-cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
+hip_test(tuple_test SRCS tuple_test.cc )
+hip_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
conditional_block_op while_op assign_op print_op executor proto_desc)
2 changes: 1 addition & 1 deletion paddle/fluid/framework/data_type_transform.cc
@@ -47,7 +47,7 @@ struct CastDataType {
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
+#ifdef __HIPCC__
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
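The __NVCC__ to __HIPCC__ swap above works because each GPU compiler predefines its own identifying macro: nvcc defines __NVCC__ while hipcc defines __HIPCC__, so a .cu file reused as HIP source must guard its device branch with the latter. A minimal standalone sketch of that compile-time dispatch pattern (illustration only, not Paddle code):

#include <cstdio>

// The branch is chosen at compile time by whichever compiler builds this
// translation unit; a plain host compiler defines neither macro.
void ReportCompilePath() {
#if defined(__HIPCC__)
  std::printf("built by hipcc: take the HIP/ROCm device path\n");
#elif defined(__NVCC__)
  std::printf("built by nvcc: take the CUDA device path\n");
#else
  std::printf("host-only build: take the CPU path\n");
#endif
}

int main() {
  ReportCompilePath();
  return 0;
}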
4 changes: 4 additions & 0 deletions paddle/fluid/framework/ddim.h
@@ -17,7 +17,11 @@ limitations under the License. */
#include <initializer_list>
#include <stdexcept>
#include <vector>
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/framework/dim_hip.h"
+#else
#include "paddle/fluid/framework/dim.h"
+#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/variant.h"

6 changes: 6 additions & 0 deletions paddle/fluid/framework/details/CMakeLists.txt
@@ -4,6 +4,8 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda)
+hip_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+dynload_hip)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)

cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
@@ -12,7 +14,11 @@ cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
if(WITH_GPU)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
else()
+if(WITH_AMD_GPU)
+set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+else()
set(multi_devices_graph_builder_deps)
+endif()
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/fetch_op_handle.cc
@@ -59,7 +59,7 @@ void FetchOpHandle::RunImpl() {
auto &scope = scopes[i];
auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
if (platform::is_gpu_place(var->place_)) {
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
dev_ctxes_[t.place()]->Wait();
#endif
8 changes: 4 additions & 4 deletions paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/scope.h"

-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif

@@ -28,7 +28,7 @@ namespace paddle {
namespace framework {
namespace details {

-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
@@ -97,7 +97,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
// Insert ScaleCost OpHandle
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
#else
auto *communication_dev_ctx =
@@ -135,7 +135,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
og_has_been_broadcast.count(og) == 0) { // is param grad
// Insert NCCL AllReduce Op
og_has_been_broadcast.insert(og);
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result.ops_.back().get();
4 changes: 2 additions & 2 deletions paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -26,7 +26,7 @@ class Scope;
namespace details {
class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
public:
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &params,
@@ -47,7 +47,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::vector<Scope *> &local_scopes_;
std::unordered_set<std::string> grad_names_;

-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
platform::NCCLContextMap *nccl_ctxs_;
#endif
};
4 changes: 2 additions & 2 deletions paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -63,8 +63,8 @@ void NCCLAllReduceOpHandle::RunImpl() {
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
-PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
+PADDLE_ENFORCE(platform::dynload::rcclAllReduce(
+buffer, buffer, numel, static_cast<rcclDataType_t>(dtype), rcclSum,
comm, stream));
});
}
4 changes: 4 additions & 0 deletions paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -20,7 +20,11 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/rccl_helper.h"
+#else
#include "paddle/fluid/platform/nccl_helper.h"
+#endif

namespace paddle {
namespace framework {
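rccl_helper.h is added by this PR but its contents are not part of this diff. Judging from the renames visible in nccl_all_reduce_op_handle.cc above (ncclAllReduce to rcclAllReduce, ncclDataType_t to rcclDataType_t, ncclSum to rcclSum), a helper of this kind plausibly provides an alias layer along the lines of the hedged sketch below; the <rccl.h> header name and the rcclComm_t typedef are assumptions, not verified against the PR's tree.

#pragma once
#ifdef PADDLE_WITH_HIP
#include <rccl.h>  // assumed header name for AMD's RCCL port of NCCL

// Map the NCCL spellings used by shared code onto their RCCL counterparts
// so the same sources compile on the HIP backend. rcclAllReduce,
// rcclDataType_t, and rcclSum appear verbatim in the hunk above;
// rcclComm_t is an assumed analog of ncclComm_t.
#define ncclAllReduce rcclAllReduce
#define ncclSum rcclSum
typedef rcclDataType_t ncclDataType_t;
typedef rcclComm_t ncclComm_t;
#endif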
34 changes: 34 additions & 0 deletions paddle/fluid/framework/details/op_handle_base.cc
@@ -36,6 +36,10 @@ OpHandleBase::~OpHandleBase() {
for (auto &ev : events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second));
}
+#elif defined(PADDLE_WITH_HIP)
+for (auto &ev : events_) {
+PADDLE_ENFORCE(hipEventDestroy(ev.second));
+}
#endif
}

@@ -49,6 +53,15 @@ void OpHandleBase::Run(bool use_event) {
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
}
}
+#elif defined(PADDLE_WITH_HIP)
+if (events_.empty() && use_event) {
+for (auto &p : dev_ctxes_) {
+int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
+PADDLE_ENFORCE(hipSetDevice(dev_id));
+PADDLE_ENFORCE(
+hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming));
+}
+}
#else
PADDLE_ENFORCE(!use_event);
#endif
@@ -64,6 +77,15 @@ void OpHandleBase::Run(bool use_event) {
PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
}
}
+#elif defined(PADDLE_WITH_HIP)
+if (use_event) {
+for (auto &p : dev_ctxes_) {
+int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
+auto stream =
+static_cast<platform::CUDADeviceContext *>(p.second)->stream();
+PADDLE_ENFORCE(hipEventRecord(events_.at(dev_id), stream));
+}
+}
#endif
}

@@ -80,6 +102,18 @@ void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
}
}
+#elif defined(PADDLE_WITH_HIP)
+if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) {
+for (auto &dev_ctx : dev_ctxes_) {
+dev_ctx.second->Wait();
+}
+} else {
+auto stream =
+static_cast<platform::CUDADeviceContext *>(waited_dev)->stream();
+for (auto &ev : events_) {
+PADDLE_ENFORCE(hipStreamWaitEvent(stream, ev.second, 0));
+}
+}
#else
for (auto &dev_ctx : dev_ctxes_) {
dev_ctx.second->Wait();
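The HIP branches added to op_handle_base.cc mirror the CUDA event API one-for-one (hipEventCreateWithFlags, hipEventRecord, hipStreamWaitEvent, hipEventDestroy). A self-contained sketch of that event lifecycle, assuming a working ROCm install; error handling is simplified relative to PADDLE_ENFORCE:

#include <hip/hip_runtime.h>
#include <cstdio>

#define HIP_CHECK(expr)                                        \
  do {                                                         \
    hipError_t err_ = (expr);                                  \
    if (err_ != hipSuccess) {                                  \
      std::printf("HIP error: %s\n", hipGetErrorString(err_)); \
      return 1;                                                \
    }                                                          \
  } while (0)

int main() {
  HIP_CHECK(hipSetDevice(0));
  hipStream_t producer, consumer;
  HIP_CHECK(hipStreamCreate(&producer));
  HIP_CHECK(hipStreamCreate(&consumer));

  // hipEventDisableTiming makes the event cheaper to record, exactly as
  // OpHandleBase::Run does when it creates one event per device.
  hipEvent_t ev;
  HIP_CHECK(hipEventCreateWithFlags(&ev, hipEventDisableTiming));

  // Record on the producing stream, then make the consuming stream wait on
  // the event: the cross-stream ordering OpHandleBase::Wait establishes.
  HIP_CHECK(hipEventRecord(ev, producer));
  HIP_CHECK(hipStreamWaitEvent(consumer, ev, 0));

  HIP_CHECK(hipStreamSynchronize(consumer));
  HIP_CHECK(hipEventDestroy(ev));
  HIP_CHECK(hipStreamDestroy(producer));
  HIP_CHECK(hipStreamDestroy(consumer));
  std::printf("event round trip ok\n");
  return 0;
}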
2 changes: 2 additions & 0 deletions paddle/fluid/framework/details/op_handle_base.h
@@ -37,6 +37,8 @@ class OpHandleBase {

#ifdef PADDLE_WITH_CUDA
std::unordered_map<int, cudaEvent_t> events_;
+#elif defined(PADDLE_WITH_HIP)
+std::unordered_map<int, hipEvent_t> events_;
#endif

OpHandleBase() {}
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -36,7 +36,7 @@ void ScaleLossGradOpHandle::RunImpl() {
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;
} else {
-#ifdef PADDLE_WITH_CUDA
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
auto stream =
static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
->stream();