Skip to content

Commit

Permalink
unify fluid::CUDADeviceContext and phi::GpuContext (#44723)
Browse files Browse the repository at this point in the history
* remove cudaDeviceContext

* remove more template

* fix rocm compile
  • Loading branch information
zhiqiu authored Jul 29, 2022
1 parent 0a2db7c commit 8849056
Show file tree
Hide file tree
Showing 26 changed files with 122 additions and 2,801 deletions.
6 changes: 0 additions & 6 deletions paddle/fluid/framework/details/eager_deletion_op_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"

namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle

namespace paddle {
namespace framework {
class GarbageCollector;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,6 @@
#include "paddle/fluid/platform/place.h"

namespace paddle {

namespace platform {
class CUDADeviceContext;
} // namespace platform

namespace memory {
namespace allocation {

Expand Down
7 changes: 0 additions & 7 deletions paddle/fluid/operators/cudnn_lstm_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,6 @@ limitations under the License. */
#include "paddle/fluid/operators/miopen_lstm_cache.h"
#endif

namespace paddle {
namespace platform {
class CUDADeviceContext;

} // namespace platform
} // namespace paddle

namespace paddle {
namespace operators {

Expand Down
7 changes: 4 additions & 3 deletions paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
#endif

size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
// first sum pool
FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
config.thread_per_block.x,
Expand All @@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
// not need show click input
N = static_cast<size_t>(batch_size * slot_num *
(embedding_size - cvm_offset));
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, N);
FusedCVMKernelNoCVM<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
Expand Down Expand Up @@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
#endif

size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
auto config = GetGpuLaunchConfig1D(dev_ctx, N);
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
if (use_cvm) {
// join grad
FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
Expand Down
7 changes: 0 additions & 7 deletions paddle/fluid/operators/gru_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,6 @@ limitations under the License. */

#include "paddle/fluid/operators/gru_op.h"

namespace paddle {
namespace platform {
class CUDADeviceContext;

} // namespace platform
} // namespace paddle

namespace paddle {
namespace operators {

Expand Down
5 changes: 0 additions & 5 deletions paddle/fluid/operators/math/cross_entropy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
}
}

template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;

template class CrossEntropyFunctor<phi::GPUContext, float>;
template class CrossEntropyFunctor<phi::GPUContext, double>;
template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
Expand Down
24 changes: 0 additions & 24 deletions paddle/fluid/operators/math/im2col.cu
Original file line number Diff line number Diff line change
Expand Up @@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
}
};

template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
Expand Down Expand Up @@ -576,25 +564,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
}
};

template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
double>;

template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
Expand Down
6 changes: 0 additions & 6 deletions paddle/fluid/operators/math/maxouting.cu
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
axis);
}

template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;

template class MaxOutFunctor<platform::CUDADeviceContext, float>;
template class MaxOutFunctor<platform::CUDADeviceContext, double>;

template class MaxOutGradFunctor<phi::GPUContext, float>;
template class MaxOutGradFunctor<phi::GPUContext, double>;

Expand Down
6 changes: 0 additions & 6 deletions paddle/fluid/operators/math/sample_prob.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h"
#include "paddle/phi/core/ddim.h"

namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle

namespace paddle {
namespace operators {
namespace math {
Expand Down
161 changes: 2 additions & 159 deletions paddle/fluid/operators/math/selected_rows_functor.cu
Original file line number Diff line number Diff line change
Expand Up @@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
}
} // namespace

template <typename T>
struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
const framework::Tensor& input2,
framework::Tensor* output) {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But received first input height = [%d], first input height = [%d]",
in1_height,
in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height,
out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But received input height = [%d], output height = [%d]",
in1_height,
out_dims[0]));

auto& in1_value = input1.value();
auto& in1_rows = input1.rows();

int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel,
output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But received input width = [%d], output width = [%d]",
in1_row_numel,
output->numel() / in1_height));

auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>();
auto* out_data = output->data<T>();

phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
functor(context, output, static_cast<T>(0));

const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
out_data,
in1_row_numel);

auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
}
};

template <typename T>
struct SelectedRowsAddTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
Expand Down Expand Up @@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
}
};

template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;

template struct SelectedRowsAddTensor<phi::GPUContext, float>;
template struct SelectedRowsAddTensor<phi::GPUContext, double>;
template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
Expand Down Expand Up @@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
}
} // namespace

template <typename T>
struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But received first input height = "
"[%d], second input height = [%d]",
in1_height,
in2_dims[0]));

auto& in1_value = input1.value();
auto& in1_rows = input1.rows();

int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2->numel() / in1_height));

auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddToTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
in2_data,
in1_row_numel);
}
};

template <typename T>
struct SelectedRowsAddToTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
Expand Down Expand Up @@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
}
};

template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
Expand Down Expand Up @@ -625,34 +498,6 @@ struct MergeAddImpl {
}
};

template <typename T>
struct MergeAdd<platform::CUDADeviceContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, sorted_result);
}

void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, output, sorted_result);
}

void operator()(const platform::CUDADeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, inputs, output, sorted_result);
}
};

template <typename T>
struct MergeAdd<phi::GPUContext, T> {
// unary functor, merge by adding duplicated rows in
Expand All @@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
}
};

#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<platform::CUDADeviceContext, dtype>; \
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<phi::GPUContext, dtype>;

TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
Expand Down
Loading

0 comments on commit 8849056

Please sign in to comment.