
Cudnn conv dgrad algo filtering #14310

Merged: 12 commits, Mar 13, 2019
src/operator/nn/cudnn/cudnn_convolution-inl.h (25 changes: 21 additions & 4 deletions)
@@ -521,7 +521,19 @@ class CuDNNConvolutionOp {
                                             wshape[1],
                                             wshape[2],
                                             wshape[3]));

+      #if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500
+      auto kernel_h = wshape[2];
+      auto kernel_w = wshape[3];
+      auto stride_h = stride[0];
+      auto stride_w = stride[1];
+      auto pad_h = pad[0];
+      auto pad_w = pad[1];
+      if (param_.layout.value() == kNCHW &&
+          (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) ||
+           ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) {
+        exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING;
+      }
+      #endif
     } else if (param_.kernel.ndim() == 3) {
       // 3d conv
     #if CUDNN_MAJOR >= 5
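In words: on cuDNN builds in [7.3.1, 7.5), an NCHW convolution whose height or width dimension combines stride 2 with an even kernel and an even pad gets CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING blacklisted for dgrad. A minimal Python sketch of the same predicate (the function name and tuple arguments are illustrative, not part of the PR):

    def fft_tiling_dgrad_suspect(kernel, stride, pad):
        """True when the guard above would exclude FFT_TILING for dgrad."""
        kernel_h, kernel_w = kernel
        stride_h, stride_w = stride
        pad_h, pad_w = pad
        return ((stride_h == 2 and kernel_h % 2 == 0 and pad_h % 2 == 0) or
                (stride_w == 2 and kernel_w % 2 == 0 and pad_w % 2 == 0))

    # Example: a 6x6 kernel with stride (2,2) and pad (0,0) trips the guard,
    # while an odd 7x7 kernel never does.
    assert fft_tiling_dgrad_suspect((6, 6), (2, 2), (0, 0))
    assert not fft_tiling_dgrad_suspect((7, 7), (2, 2), (1, 1))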
@@ -714,7 +726,7 @@ class CuDNNConvolutionOp {
       bwd_data_results.resize(actual_bwd_data_algos);
       AlgoFinalSelect<cudnnConvolutionBwdDataAlgoPerf_t,
                       cudnnConvolutionBwdDataAlgo_t>(bwd_data_results, "backprop-to-data",
-                                                     workspace_byte, bwd);
+                                                     workspace_byte, bwd, exclude_dgrad_algo_);
     #else
     // CUDNN_MAJOR < 7
       const int kMaxAlgos = 10;
@@ -910,12 +922,14 @@ class CuDNNConvolutionOp {
   // workspace constraints.
   template <typename PerfType, typename AlgoType>
   void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name,
-                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) {
+                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo,
+                       int32_t algo_exclude = -1) {
     // Determine the fastest acceptable algo that matches the algo_preference (-1 = any),
     // regardless of mathType.
     bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false);
     for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
       const auto &result = perf_results[i];
+      bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude;
       bool algo_is_tensor_core = false;
       #if CUDNN_MAJOR >= 7
         algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
@@ -924,7 +938,8 @@ class CuDNNConvolutionOp {
       #if CUDNN_MAJOR >= 7
         (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
       #endif
-          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte)) {
+          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte) &&
+          !algo_exclusion) {
         algo->Set(result.algo, algo_is_tensor_core);
         return;
       }
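With the new parameter, AlgoFinalSelect takes the first (i.e. fastest) perf result that succeeded, is deterministic when MXNET_ENFORCE_DETERMINISM is set, fits the workspace unless the tuning mode lifts that cap, and is not the excluded algo. A rough Python rendering of that policy; the dict keys and the tune_limited flag are illustrative stand-ins for the cuDNN perf structs and conv::kLimited:

    import os

    def algo_final_select(perf_results, workspace_byte, tune_limited, algo_exclude=-1):
        # perf_results: assumed sorted fastest-first, as cuDNN's Find* calls return them.
        # Crude stand-in for dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false):
        enforce_determinism = os.environ.get('MXNET_ENFORCE_DETERMINISM', '0') not in ('', '0', 'false')
        for r in perf_results:
            if r['status'] != 'CUDNN_STATUS_SUCCESS':
                continue
            if enforce_determinism and not r['deterministic']:
                continue
            if tune_limited and r['memory'] > workspace_byte:
                continue  # workspace cap only binds in the kLimited tuning mode
            if r['algo'] == algo_exclude:
                continue  # e.g. CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
            return r['algo']
        raise RuntimeError('no suitable cuDNN algo found')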
@@ -1104,6 +1119,8 @@ class CuDNNConvolutionOp {
   bool cudnn_tensor_core_;
   // Is req[kWeight] == conv::kAddTo ?
   bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+  int32_t exclude_dgrad_algo_ = -1;
   ConvolutionParam param_;
 };
 #endif  // __CUDACC__ && CUDNN
src/operator/nn/cudnn/cudnn_deconvolution-inl.h (24 changes: 21 additions & 3 deletions)
@@ -446,6 +446,19 @@ class CuDNNDeconvolutionOp {
                                             wshape[1],
                                             wshape[2],
                                             wshape[3]));
+      #if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500
+      auto kernel_h = wshape[2];
+      auto kernel_w = wshape[3];
+      auto stride_h = stride[0];
+      auto stride_w = stride[1];
+      auto pad_h = o_pad[0];
+      auto pad_w = o_pad[1];
+      if (param_.layout.value() == kNCHW &&
+          (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) ||
+           ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) {
+        exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING;
+      }
+      #endif
     } else if (param_.kernel.ndim() == 3) {
       // 3d conv
       index_t o_pad[3];
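Deconvolution needs the same guard because its forward pass is computed with cuDNN's backward-data (dgrad) kernel; the block above mirrors the convolution one but reads its pads from o_pad. A usage-level sketch of a configuration the guard now covers (shapes borrowed from the new test below; assumes an MXNet build with this patch and a GPU context):

    import mxnet as mx

    data = mx.sym.Variable('deconv_data')
    # stride 2 + even kernel + even pad: the combination the guard excludes
    sym = mx.sym.Deconvolution(data, num_filter=32, kernel=(6, 6),
                               stride=(2, 2), pad=(0, 0), name='deconv')
    # cuDNN-free reference path, as used by check_consistency in the test
    sym_ref = mx.sym.Deconvolution(data, num_filter=32, kernel=(6, 6),
                                   stride=(2, 2), pad=(0, 0), name='deconv',
                                   cudnn_off=True)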
@@ -618,7 +631,7 @@ class CuDNNDeconvolutionOp {
       bwd_data_results.resize(actual_bwd_data_algos);
       AlgoFinalSelect<cudnnConvolutionBwdDataAlgoPerf_t,
                       cudnnConvolutionBwdDataAlgo_t>(bwd_data_results, "backprop-to-data",
-                                                     workspace_byte, bwd);
+                                                     workspace_byte, bwd, exclude_dgrad_algo_);
     #else
     // CUDNN_MAJOR < 7
       const int kMaxAlgos = 10;
@@ -829,11 +842,13 @@ class CuDNNDeconvolutionOp {
   // workspace constraints and a possible user algo preference.
   template <typename PerfType, typename AlgoType>
   void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name,
-                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) {
+                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo,
+                       int32_t algo_exclude = -1) {
     // Determine the fastest acceptable algo regardless of mathType.
     bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false);
     for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
       const auto &result = perf_results[i];
+      bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude;
       bool algo_is_tensor_core = false;
       #if CUDNN_MAJOR >= 7
         algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
@@ -842,7 +857,8 @@ class CuDNNDeconvolutionOp {
       #if CUDNN_MAJOR >= 7
         (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
       #endif
-          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte)) {
+          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte) &&
+          !algo_exclusion) {
         algo->Set(result.algo, algo_is_tensor_core);
         return;
       }
@@ -1025,6 +1041,8 @@ class CuDNNDeconvolutionOp {
   bool cudnn_tensor_core_;
   // Is req[kWeight] == deconv::kAddTo ?
   bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+  int32_t exclude_dgrad_algo_ = -1;
   DeconvolutionParam param_;
 };
 #endif  // CUDNN
tests/python/gpu/test_operator_gpu.py (26 changes: 26 additions & 0 deletions)
@@ -522,6 +522,32 @@ def test_convolution_options():
     check_consistency_NxM([sym, sym_no_cudnn], ctx_list)


+@with_seed()
+def test_conv_deconv_guards():
+    # Test cases for convolution and deconvolution via strided fft. Ensure that the framework
+    # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
+    # see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_750.html#rel_750
+    tol = 1e-1
+    for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
+        dataname = opname + '_data'
+        ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}
+        test_cases = [
+            {'num_filter':32, 'kernel':(6,6), 'pad':(0,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(6,6), 'pad':(1,1), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(6,7), 'pad':(0,1), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,6), 'pad':(1,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,7), 'pad':(0,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,7), 'pad':(1,1), 'stride':(2,2), 'name': opname}]
+        for test_case_args in test_cases:
+            try:
+                sym = op(**test_case_args)
+                sym_no_cudnn = op(cudnn_off=True, **test_case_args)
+                check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=tol)
+            except:
+                print('Test failure of mx.sym.{} with args: {}'.format(op.__name__, test_case_args))
+                raise
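Applying the guard's predicate (stride fixed at (2,2)) to these six cases shows the list deliberately mixes both kinds: three configurations trip the exclusion and three do not, so both the filtered and the unfiltered cuDNN paths get consistency-checked. A self-contained check, restating the predicate sketched earlier:

    def suspect(kernel, stride, pad):
        return ((stride[0] == 2 and kernel[0] % 2 == 0 and pad[0] % 2 == 0) or
                (stride[1] == 2 and kernel[1] % 2 == 0 and pad[1] % 2 == 0))

    cases = [((6, 6), (0, 0)), ((6, 6), (1, 1)), ((6, 7), (0, 1)),
             ((7, 6), (1, 0)), ((7, 7), (0, 0)), ((7, 7), (1, 1))]
    flags = [suspect(kernel, (2, 2), pad) for kernel, pad in cases]
    print(flags)  # [True, False, True, True, False, False]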


def _conv_with_num_streams(seed):
with random_seed(seed):
# Try to expose timing-dependent improper workspace sharing by parallel dgrad and wgrad