Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

optimize conv algo cache #41891

Merged
merged 48 commits into from
Aug 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
5547efd
optimizer conv alog speed
phlrain Apr 18, 2022
490abce
code polish
phlrain Apr 18, 2022
f06dcad
remove useless code
phlrain Apr 18, 2022
12f8364
fix compile error
phlrain Apr 18, 2022
d9dfe6c
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 18, 2022
1729ba8
fix cpu compile error
phlrain Apr 18, 2022
b8c05fd
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 18, 2022
0f64787
not use cudnn alog t
phlrain Apr 18, 2022
5314766
add search cache max number
phlrain Apr 18, 2022
56eb2c6
polish code
phlrain Apr 18, 2022
e5bfa67
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 18, 2022
aabc60f
fix cache test bug
phlrain Apr 18, 2022
09a04fb
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 18, 2022
c455f11
add groups data format to conv args
phlrain Apr 18, 2022
445fe4d
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 18, 2022
442a9e1
fix cache test bug
phlrain Apr 18, 2022
19c59f7
fix cudnn_deterministic bug
phlrain Apr 18, 2022
4b0a58d
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 18, 2022
40c7d23
fix test switch auto tune bug
phlrain Apr 18, 2022
184cab6
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 18, 2022
df57ee6
fix test swith autotune bug;
phlrain Apr 19, 2022
82c2419
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 19, 2022
6dbeaa5
fix conv cache bug
phlrain Apr 20, 2022
2cd1c00
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 20, 2022
c8fe9c6
fix cache test error
phlrain Apr 20, 2022
2be9374
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 20, 2022
7b277f4
fix cache test bug
phlrain Apr 20, 2022
83e1c8c
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 20, 2022
93885d4
fix windows mac compile error
phlrain Apr 20, 2022
3404862
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 20, 2022
fb53df7
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 28, 2022
4098916
fix workspace search error
phlrain Apr 28, 2022
b15a4be
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Apr 28, 2022
11b8315
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Jun 23, 2022
ba41e29
update cudnn cache
phlrain Jul 1, 2022
662dca2
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Jul 4, 2022
99a33bf
fix cache test bug; test=develop
phlrain Jul 5, 2022
ecfa2e4
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Jul 5, 2022
af7fa80
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Aug 5, 2022
f7afc76
fix autotune swith test error
phlrain Aug 8, 2022
0101cc4
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Aug 8, 2022
4ad71a8
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Aug 17, 2022
a806f93
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Aug 24, 2022
10de962
polish code
phlrain Aug 24, 2022
7290528
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Aug 24, 2022
a549c20
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Aug 24, 2022
65c5ecc
oplish code
phlrain Aug 25, 2022
f1a0da6
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
phlrain Aug 25, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions paddle/fluid/operators/conv_base_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,13 @@ struct SearchAlgorithm {};
template <typename AlgoT>
struct SearchResult {
SearchResult() {}
explicit SearchResult(const phi::autotune::DnnNode& node)
: algo(static_cast<AlgoT>(node.algo)),
workspace_size(node.workspace_size) {}

explicit SearchResult(AlgoT a) : algo(a) {}
explicit SearchResult(AlgoT a, float t, size_t size)
: algo(a), time(t), workspace_size(size) {}

AlgoT algo = static_cast<AlgoT>(0);
float time = -1.f;
Expand Down Expand Up @@ -76,28 +82,50 @@ struct ConvArgsBase {
// dilations
std::vector<int> d;

// groups
int group;

// data format
DataLayout data_layout;

ConvArgsBase(const framework::Tensor* x,
const framework::Tensor* w,
const framework::Tensor* o,
const std::vector<int> s,
const std::vector<int> p,
const std::vector<int> d,
DataT dtype)
: x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {}
DataT dtype,
int g,
DataLayout layout)
: x(x),
w(w),
o(o),
s(s),
p(p),
d(d),
cudnn_dtype(dtype),
group(g),
data_layout(layout) {}

template <typename T>
size_t GetCacheKey() const {
phi::autotune::ConvCacheKey Convert2ConvCacheKey() const {
auto x_shape = phi::vectorize(x->dims());
auto w_shape = phi::vectorize(w->dims());
VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
<< ", strides=" << s << ", paddings=" << p << ", dilations=" << d;
return phi::autotune::ConvKey(
<< ", strides=" << s << ", paddings=" << p << ", dilations=" << d
<< ",data= " << paddle::experimental::CppTypeToDataType<T>::Type()
<< ", group=" << group
<< ", data layout=" << static_cast<int64_t>(data_layout);

return phi::autotune::ConvCacheKey(
x_shape,
w_shape,
p,
s,
d,
paddle::experimental::CppTypeToDataType<T>::Type());
paddle::experimental::CppTypeToDataType<T>::Type(),
group,
static_cast<int64_t>(data_layout));
}
};

Expand Down
74 changes: 49 additions & 25 deletions paddle/fluid/operators/conv_cudnn_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,32 +191,36 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
SetConvMathType(ctx, dtype, args.cdesc);

if (deterministic) {
result = FindAlgoDeterministic();
result = FindAlgoDeterministic(args);
} else {
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, run heuristic search(default) before
// auto-tune process, run exhaustive_search during mentioned process.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
size_t key = args.GetCacheKey<T>();
auto key = args.Convert2ConvCacheKey<T>();
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
if (cache.Find(key)) {
result.algo = static_cast<AlgoT>(cache.Get(key));
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
cache.Set(key, static_cast<int64_t>(result.algo));
} else {
result = FindAlgoHeuristic(args, ctx);
}
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
result.workspace_size);
cache.Set(key, node);
}
}
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo << ", workspace="
<< ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}

Expand All @@ -236,8 +240,9 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
}

private:
static SearchResult<AlgoT> FindAlgoDeterministic() {
return SearchResult<AlgoT>(static_cast<AlgoT>(1));
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
}

// Heuristic search mode, calling the cudnnGetXxxAlgorithm.
Expand Down Expand Up @@ -298,6 +303,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
workspace_size_limit,
&(result.algo)));
#endif
result.workspace_size = GetWorkspaceSize(args, result.algo);
return result;
}

Expand Down Expand Up @@ -343,6 +349,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
ChooseAlgoByWorkspace<PerfT, AlgoT>(
perf_results, workspace_size_limit, &result);

result.workspace_size = GetWorkspaceSize(args, result.algo);
return result;
}

Expand Down Expand Up @@ -394,33 +401,37 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
SetConvMathType(ctx, dtype, args.cdesc);

if (deterministic) {
result = FindAlgoDeterministic();
result = FindAlgoDeterministic(args);
} else {
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, run heuristic search(default) before
// auto-tune process, run exhaustive_search during mentioned process.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
size_t key = args.GetCacheKey<T>();
auto key = args.Convert2ConvCacheKey<T>();
auto& cache =
phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
if (cache.Find(key)) {
result.algo = static_cast<AlgoT>(cache.Get(key));
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
cache.Set(key, static_cast<int64_t>(result.algo));
} else {
result = FindAlgoHeuristic(args, ctx);
}
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
result.workspace_size);
cache.Set(key, node);
}
}
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo << ", workspace="
<< ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}

Expand All @@ -440,8 +451,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
}

private:
static SearchResult<AlgoT> FindAlgoDeterministic() {
return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
auto workspace_size =
GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
return SearchResult<AlgoT>(
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size);
}

static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
Expand Down Expand Up @@ -513,7 +527,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
workspace_size_limit,
&(result.algo)));
#endif

result.workspace_size = GetWorkspaceSize(args, result.algo);
return result;
}

Expand Down Expand Up @@ -559,6 +573,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
ChooseAlgoByWorkspace<PerfT, AlgoT>(
perf_results, workspace_size_limit, &result);

result.workspace_size = GetWorkspaceSize(args, result.algo);
return result;
}

Expand Down Expand Up @@ -609,33 +624,37 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
SetConvMathType(ctx, dtype, args.cdesc);

if (deterministic) {
result = FindAlgoDeterministic();
result = FindAlgoDeterministic(args);
} else {
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, run heuristic search(default) before
// auto-tune process, run exhaustive_search during mentioned process.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
size_t key = args.GetCacheKey<T>();
auto key = args.Convert2ConvCacheKey<T>();
auto& cache =
phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
if (cache.Find(key)) {
result.algo = static_cast<AlgoT>(cache.Get(key));
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
cache.Set(key, static_cast<int64_t>(result.algo));
} else {
result = FindAlgoHeuristic(args, ctx);
}
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
result.workspace_size);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DnnNode 的功能和SearchResult的重复性比较高,如果能够用SearchResult替代更好。不过后续我们这边应该会在DnnNode的基础上扩展出来AutoTuneResult类型。

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

我有一个版本是使用的SearchResult,但是search Result 里面模板T是 cudnnConvolutionFwdAlgoPerf_t, 这样cache.h会依赖,gpu_info.h, cache.h 在cpu场景下也会使用,编译会有问题

cache.Set(key, node);
}
}
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo << ", workspace="
<< ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}

Expand All @@ -656,8 +675,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
}

private:
static SearchResult<AlgoT> FindAlgoDeterministic() {
return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
auto workspace_size =
GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
return SearchResult<AlgoT>(
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size);
}

static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
Expand Down Expand Up @@ -718,6 +740,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
&(result.algo)));
#endif

result.workspace_size = GetWorkspaceSize(args, result.algo);
return result;
}

Expand Down Expand Up @@ -786,6 +809,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
ChooseAlgo(perf_results, workspace_size_limit, &result);
}

result.workspace_size = GetWorkspaceSize(args, result.algo);
return result;
}

Expand Down
11 changes: 11 additions & 0 deletions paddle/fluid/platform/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,17 @@ PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
*/
PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune.");

/**
* Conv Search cache max number related FLAG
* Name: FLAGS_search_cache_max_number
* Since Version: 2.3.0
* Value Range: int32, default=1000000
* Example:
*/
PADDLE_DEFINE_EXPORTED_int32(search_cache_max_number,
1000000,
"search_cache_max_number.");

/**
* Preformance related FLAG
* Name: einsum_opt
Expand Down
28 changes: 13 additions & 15 deletions paddle/phi/kernels/autotune/cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,6 @@
namespace phi {
namespace autotune {

// Define the cache key of operator
size_t ConvKey(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& w_dims,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
phi::DataType dtype) {
return GetKey(x_dims,
w_dims,
strides,
paddings,
dilations,
static_cast<int64_t>(dtype));
}

size_t TransposeKey(const std::vector<int64_t>& x_dims,
const std::vector<int32_t>& perm,
phi::DataType dtype) {
Expand Down Expand Up @@ -73,6 +58,19 @@ void AutoTuneCache::UpdateStatus() {
cache_hits += v.second.CacheHits();
cache_misses += v.second.CacheMisses();
}

for (auto& v : cudnn_auto_tune_map_) {
VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width)
<< AlgorithmTypeString(v.first)
<< " Cache Size: " << v.second.Size()
<< " Hits: " << v.second.CacheHits()
<< " Misses: " << v.second.CacheMisses()
<< " Hit Rate: " << v.second.CacheHitRate();
size += v.second.Size();
cache_hits += v.second.CacheHits();
cache_misses += v.second.CacheMisses();
}

total_size_ = size;
total_cache_hits_ = cache_hits;
total_cache_misses_ = cache_misses;
Expand Down
Loading