unify fluid::CUDADeviceContext and phi::GpuContext (#44723)

* remove cudaDeviceContext * remove more template * fix rocm compile
PaddlePaddle · Jul 29, 2022 · 8849056 · 8849056
1 parent 0a2db7c
commit 8849056
Show file tree

Hide file tree

Showing 26 changed files with 122 additions and 2,801 deletions.
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -23,12 +23,6 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace framework {
 class GarbageCollector;

diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -25,11 +25,6 @@
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
-
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-
 namespace memory {
 namespace allocation {
 

diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -23,13 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/miopen_lstm_cache.h"
 #endif
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
 

diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
 #endif
 
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+  platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   // first sum pool
   FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
                              config.thread_per_block.x,
@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
     // not need show click input
     N = static_cast<size_t>(batch_size * slot_num *
                             (embedding_size - cvm_offset));
-    platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(dev_ctx, N);
     FusedCVMKernelNoCVM<<<config.block_per_grid.x,
                           config.thread_per_block.x,
                           0,
@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
 #endif
 
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  auto config = GetGpuLaunchConfig1D(dev_ctx, N);
+  auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   if (use_cvm) {
     // join grad
     FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,

diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc
@@ -14,13 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/gru_op.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
 

diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
   }
 }
 
-template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext,
-                                   platform::float16>;
-
 template class CrossEntropyFunctor<phi::GPUContext, float>;
 template class CrossEntropyFunctor<phi::GPUContext, double>;
 template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;

diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu
@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
   }
 };
 
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              float>;
@@ -576,25 +564,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
   }
 };
 
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              double>;
 
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              float>;

diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu
@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
                                                               axis);
 }
 
-template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
-
-template class MaxOutFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutFunctor<platform::CUDADeviceContext, double>;
-
 template class MaxOutGradFunctor<phi::GPUContext, float>;
 template class MaxOutGradFunctor<phi::GPUContext, double>;
 

diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h
@@ -22,12 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/sampler.h"
 #include "paddle/phi/core/ddim.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
 namespace math {

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
 }
 }  // namespace
 
-template <typename T>
-struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  const framework::Tensor& input2,
-                  framework::Tensor* output) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument(
-            "The two inputs height must be equal."
-            "But received first input height = [%d], first input height = [%d]",
-            in1_height,
-            in2_dims[0]));
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        out_dims[0],
-        platform::errors::InvalidArgument(
-            "The input and output height must be equal."
-            "But received input height = [%d], output height = [%d]",
-            in1_height,
-            out_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2.numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2.numel() / in1_height));
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        output->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The input and output width must be equal."
-            "But received input width = [%d], output width = [%d]",
-            in1_row_numel,
-            output->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2.data<T>();
-    auto* out_data = output->data<T>();
-
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, static_cast<T>(0));
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            out_data,
-            in1_row_numel);
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
-    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
   }
 };
 
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
-                                      platform::float16>;
-
 template struct SelectedRowsAddTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddTensor<phi::GPUContext, double>;
 template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
 }
 }  // namespace
 
-template <typename T>
-struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument("The two inputs height must be equal."
-                                          "But received first input height = "
-                                          "[%d], second input height = [%d]",
-                                          in1_height,
-                                          in2_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2->data<T>();
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddToTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            in2_data,
-            in1_row_numel);
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   }
 };
 
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
-                                        platform::float16>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
@@ -625,34 +498,6 @@ struct MergeAddImpl {
   }
 };
 
-template <typename T>
-struct MergeAdd<platform::CUDADeviceContext, T> {
-  // unary functor, merge by adding duplicated rows in
-  // the input SelectedRows object.
-  phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
-                               const phi::SelectedRows& input,
-                               const bool sorted_result) {
-    return MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, output, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::vector<const phi::SelectedRows*>& inputs,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, inputs, output, sorted_result);
-  }
-};
-
 template <typename T>
 struct MergeAdd<phi::GPUContext, T> {
   // unary functor, merge by adding duplicated rows in
@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
   }
 };
 
-#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)                    \
-  template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
-  template struct MergeAddImpl<phi::GPUContext, dtype>;             \
-  template struct MergeAdd<platform::CUDADeviceContext, dtype>;     \
+#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)        \
+  template struct MergeAddImpl<phi::GPUContext, dtype>; \
   template struct MergeAdd<phi::GPUContext, dtype>;
 
 TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)