Skip to content

Commit

Permalink
Add CUDNNv8 max pooling (#59413)
Browse files Browse the repository at this point in the history
* Add CUDNNv8 version of pool2d

* Minor fix

* Fix build failure

* Remove dygraph API

* Fix CI failure

* Fix CI failure

* Fix timeout

* Fix timeout

* Add comments

* Minor fix
  • Loading branch information
Tom-Zheng authored Jan 8, 2024
1 parent e2b4247 commit 41679e4
Show file tree
Hide file tree
Showing 17 changed files with 831 additions and 2 deletions.
1 change: 1 addition & 0 deletions paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@
'fused_dot_product_attention',
'nce',
'lars_momentum',
'max_pool2d_v2',
'recv_v2',
'rnn_',
'row_conv',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@
'fused_rotary_position_embedding',
'fused_bias_dropout_residual_layer_norm',
'fused_dot_product_attention',
'max_pool2d_v2',
]
9 changes: 9 additions & 0 deletions paddle/fluid/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,14 @@ bool IsCompiledWithCUDA() {
#endif
}

// Reports whether this Paddle build was compiled with the cuDNN
// Frontend API enabled (PADDLE_WITH_CUDNN_FRONTEND). Exposed to Python
// as paddle's `is_compiled_with_cudnn_frontend` so tests can skip
// cuDNN-v8-only ops (e.g. max_pool2d_v2) on builds without it.
bool IsCompiledWithCudnnFrontend() {
#ifdef PADDLE_WITH_CUDNN_FRONTEND
  return true;
#else
  return false;
#endif
}

bool IsCompiledWithDISTRIBUTE() {
#if !defined(PADDLE_WITH_DISTRIBUTE)
return false;
Expand Down Expand Up @@ -2124,6 +2132,7 @@ All parameter, weight, gradient are variables in Paddle.
});
m.def("is_compiled_with_avx", IsCompiledWithAVX);
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_cudnn_frontend", IsCompiledWithCudnnFrontend);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice);
m.def("is_compiled_with_ipu", IsCompiledWithIPU);
Expand Down
11 changes: 11 additions & 0 deletions paddle/phi/api/yaml/fused_backward.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,14 @@
func : fused_rotary_position_embedding_grad
data_type : out_q_grad
support_dygraph_mode : true

- backward_op : max_pool2d_v2_grad
forward : max_pool2d_v2(Tensor x, int[] kernel_size, int[] strides= {1, 1}, int[] paddings = {0, 0}, str data_format = "NCHW", bool global_pooling = false, bool adaptive = false) -> Tensor(out), Tensor(saved_idx)
args : (Tensor x, Tensor out, Tensor saved_idx, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, str data_format, bool global_pooling, bool adaptive)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
param: [x]
kernel :
func : max_pool2d_v2_grad
param: [x, out, saved_idx, out_grad, kernel_size, strides, paddings, data_format, global_pooling, adaptive]
15 changes: 15 additions & 0 deletions paddle/phi/api/yaml/fused_ops.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,21 @@
func : layer_norm_act_xpu
data_type : x

# This op is implemented using CUDNN Frontend API, which serves as a supplement to
# legacy max pooling implementation. It shows better performance with NHWC layout and
# half precision.
- op : max_pool2d_v2
args : (Tensor x, int[] kernel_size, int[] strides= {1, 1}, int[] paddings = {0, 0}, str data_format = "NCHW", bool global_pooling = false, bool adaptive = false)
output : Tensor(out), Tensor(saved_idx)
infer_meta :
func : MaxPoolV2InferMeta
param : [x, kernel_size, strides, paddings, data_format, global_pooling, adaptive]
kernel :
func : max_pool2d_v2
param : [x, kernel_size, strides, paddings, data_format, global_pooling, adaptive]
intermediate: saved_idx
backward : max_pool2d_v2_grad

- op : multi_encoder_xpu
args : (Tensor x, Tensor[] fc_weight, Tensor[] fc_weight_max, Tensor[] fc_bias, Tensor[] ln_scale, Tensor[] ln_bias, Tensor mask, Tensor seq_lod, Tensor max_seq_len, int layer_num, bool norm_before, int hidden_dim, int head_num, int size_per_head, int ffn_hidden_dim_scale, int act_type, int relative_type, int slice_idx)
output : Tensor(out), Tensor(x_fp16), Tensor(out_fp16)
Expand Down
31 changes: 31 additions & 0 deletions paddle/phi/infermeta/unary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2349,6 +2349,37 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
mask->set_dtype(phi::CppTypeToDataType<int>::Type());
}

// Shape/dtype inference for the max_pool2d_v2 op (the cuDNN-Frontend-backed
// max pooling path). Delegates the pooled-output shape computation to
// Pool2DInferMeta with pooling_type "max" and "EXPLICIT" padding, then sets
// up the auxiliary `saved_idx` output to mirror `out`'s dims with int dtype.
// NOTE(review): `saved_idx` presumably holds the argmax indices consumed by
// max_pool2d_v2_grad — confirm against the CUDA kernel.
void MaxPoolV2InferMeta(const MetaTensor& x,
const std::vector<int>& kernel_size,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& data_format,
bool global_pooling,
bool adaptive,
MetaTensor* out,
MetaTensor* saved_idx,
MetaConfig config) {
// Adaptive pooling is not supported by this op; reject it up front.
PADDLE_ENFORCE_EQ(adaptive,
false,
phi::errors::InvalidArgument(
"max_pool2d_v2 op does not support adaptive."));
// Reuse the generic pool2d shape inference for `out`.
// NOTE(review): the two positional `false` literals presumably map to
// ceil_mode/exclusive in Pool2DInferMeta — confirm against its signature.
Pool2DInferMeta(x,
kernel_size,
strides,
paddings,
false,
false,
data_format,
"max",
global_pooling,
adaptive,
"EXPLICIT",
out,
config);
// saved_idx is shaped like the pooled output and stores int values.
saved_idx->set_dims(out->dims());
saved_idx->set_dtype(phi::CppTypeToDataType<int>::Type());
}

void MeanAllInferMeta(const MetaTensor& x, MetaTensor* out) {
out->set_dims(common::make_ddim({}));
out->set_dtype(x.dtype());
Expand Down
11 changes: 11 additions & 0 deletions paddle/phi/infermeta/unary.h
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,17 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
MetaTensor* mask,
MetaConfig config = MetaConfig());

// Infer meta (shape/dtype) for max_pool2d_v2: computes the pooled output
// `out` and an int `saved_idx` tensor with the same dims as `out`.
// Rejects adaptive pooling (adaptive must be false).
void MaxPoolV2InferMeta(const MetaTensor& x,
const std::vector<int>& kernel_size,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& data_format,
bool global_pooling,
bool adaptive,
MetaTensor* out,
MetaTensor* saved_idx,
MetaConfig config = MetaConfig());

void MeanAllInferMeta(const MetaTensor& x, MetaTensor* out);

void ModeInferMeta(const MetaTensor& x,
Expand Down
4 changes: 3 additions & 1 deletion paddle/phi/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,9 @@ if(NOT WITH_CUDNN_FRONTEND)
"fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu"
"fusion/gpu/fused_scale_bias_add_relu_kernel.cu"
"fusion/gpu/fused_dconv_drelu_dbn_kernel.cu"
"fusion/gpu/fused_dot_product_attention_op.cu")
"fusion/gpu/fused_dot_product_attention_op.cu"
"fusion/gpu/max_pool2d_v2_grad_kernel.cu"
"fusion/gpu/max_pool2d_v2_kernel.cu")
endif()

set(cc_search_pattern
Expand Down
4 changes: 3 additions & 1 deletion paddle/phi/kernels/autotune/cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ enum class AlgorithmType {
kDgradDreluBnBwdWeight = 16,
kDbnApply = 17,
kBnActWgrad = 18,
kAlgorithmCount = 19
kPoolingForwardV8 = 19,
kPoolingBackwardV8 = 20,
kAlgorithmCount = 21
#endif
};

Expand Down
Loading

0 comments on commit 41679e4

Please sign in to comment.