1010#pragma once
1111
1212#include < ATen/cuda/tunable/Tunable.h>
13+ #include < ATen/cuda/tunable/StreamTimer.h>
1314#include < ATen/cuda/Sleep.h>
1415#include < c10/cuda/CUDACachingAllocator.h>
1516
@@ -38,7 +39,57 @@ class Callable {
3839 }
3940};
4041
41- template <typename ParamsT, typename TimerT>
namespace {

/** Running statistics (count, sum, min, max, mean, variance) computed
 *  online with Welford's algorithm, which is numerically stable compared
 *  to the naive sum-of-squares formulation.
 *  See http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance */
class Stats {
 public:
  Stats() {
    _n = 0UL;
    _mean = 0.0;
    _M2 = 0.0;
    _sum = 0.0;
    _min = 0.0;
    _max = 0.0;
  }

  // Fold a single observation x into the running statistics.
  void sample_value(const double x) {
    double delta = 0;
    _sum = _sum + x;
    if (0UL == _n) {
      // First sample seeds min/max.
      _min = x;
      _max = x;
    }
    else {
      _min = _min < x ? _min : x;
      _max = _max > x ? _max : x;
    }
    _n = _n + 1UL;
    // Welford update: mean first, then M2 uses both old and new mean.
    delta = x - _mean;
    _mean = _mean + delta / _n;
    _M2 = _M2 + delta * (x - _mean);
  }

  // Sample variance with Bessel's correction (divide by n-1).
  // Guard n < 2: with unsigned _n, n == 0 would underflow (n-1 wraps to
  // ULONG_MAX) and n == 1 would divide by zero, yielding NaN/inf that
  // would silently poison downstream comparisons; return 0.0 instead.
  double variance() const {
    if (_n < 2UL) {
      return 0.0;
    }
    return _M2 / (_n - 1);
  }

  // Sample standard deviation; 0.0 for fewer than 2 samples (see variance()).
  double stddev() const {
    return std::sqrt(variance());
  }

  unsigned long _n;   // number of samples folded in so far
  double _mean;       // running mean
  double _M2;         // running sum of squared deviations from the mean
  double _sum;        // running sum of samples
  double _min;        // smallest sample seen (0.0 before any sample)
  double _max;        // largest sample seen (0.0 before any sample)
};

} // anonymous namespace
91+
92+ template <typename ParamsT>
4293class TunableOp {
4394 public:
4495 TunableOp () = default ;
@@ -99,10 +150,17 @@ class TunableOp {
99150 }
100151 }
101152
102- static double Profile (Callable<ParamsT> *op, const std::vector<ParamsT*> ¶m, size_t num_iter, size_t &offset) {
153+ static double ProfileSimple (Callable<ParamsT> *op, const std::vector<ParamsT*> ¶m, size_t num_iter, size_t &offset) {
103154 TuningContext* ctx = getTuningContext ();
104155 bool do_flush = ctx->IsICacheFlushEnabled ();
105- TimerT timer{};
156+ StreamTimerNoSync timer{};
157+
158+ // Small Mandatory Warmup
159+ // Reduces outliers
160+ for (size_t i = 0 ; i < 2 ; i++) {
161+ TORCH_CHECK (op->Call (param[(i+offset++)%param.size ()]) == OK);
162+ }
163+
106164 timer.Start ();
107165 for (size_t i = 0 ; i < num_iter; i++) {
108166 if (do_flush) {
@@ -114,6 +172,32 @@ class TunableOp {
114172 return timer.Duration () / num_iter;
115173 }
116174
175+ static Stats ProfileStats (Callable<ParamsT> *op, const std::vector<ParamsT*> ¶m, size_t num_iter, size_t &offset) {
176+ TuningContext* ctx = getTuningContext ();
177+ bool do_flush = ctx->IsICacheFlushEnabled ();
178+ std::vector<StreamTimerNoSync> timer (num_iter);
179+
180+ // Small Mandatory Warmup
181+ // Reduces outliers
182+ for (size_t i = 0 ; i < 2 ; i++) {
183+ TORCH_CHECK (op->Call (param[(i+offset++)%param.size ()]) == OK);
184+ }
185+
186+ for (size_t i = 0 ; i < num_iter; i++) {
187+ timer[i].Start ();
188+ TORCH_CHECK (op->Call (param[(i+offset++)%param.size ()]) == OK);
189+ timer[i].End ();
190+ if (do_flush) {
191+ at::cuda::flush_icache ();
192+ }
193+ }
194+ Stats s;
195+ for (size_t i = 0 ; i < num_iter; i++) {
196+ s.sample_value (timer[i].Duration ());
197+ }
198+ return s;
199+ }
200+
117201 protected:
118202 virtual ResultEntry FindFastest (const ParamsT* params) {
119203 TuningContext* ctx = getTuningContext ();
@@ -183,14 +267,25 @@ class TunableOp {
183267 }
184268
185269 // collect a small profile
186- constexpr const int approx_num_iter = 3 ;
187- auto approx_duration = Profile (candidate, reusable_params, approx_num_iter, offset);
270+ int approx_num_iter = 3 ;
271+ auto s = ProfileStats (candidate, reusable_params, approx_num_iter, offset);
272+ double approx_duration = s._mean ;
188273 // bail if too slow
189- if (approx_duration > 2 * min_duration_ms) {
274+ if (approx_duration > 1.5 * min_duration_ms) {
190275 TUNABLE_LOG3 (" ├──skip slow instance id=" , i, " , " , op_sig, ' (' , params_sig, " ) " , op_names_[i]);
191276 continue ;
192277 }
193278
279+ // 2nd phase skip, more aggressive
280+ approx_num_iter = 10 ;
281+ s = ProfileStats (candidate, reusable_params, approx_num_iter, offset);
282+ approx_duration = s._mean ;
283+ // bail if too slow
284+ if (approx_duration > 1.15 * min_duration_ms) {
285+ TUNABLE_LOG3 (" ├──2nd skip slow instance id=" , i, " , " , op_sig, ' (' , params_sig, " ) " , op_names_[i]);
286+ continue ;
287+ }
288+
194289 // for warmup does user set max duration, max iters, or both?
195290 // warmup is allowed to be skipped by setting either iterations or duration to 0
196291 double max_warmup_duration = ctx->GetMaxWarmupDurationMs ();
@@ -237,12 +332,27 @@ class TunableOp {
237332 " instance id=" , i, " , " , op_sig, " (" , params_sig, " ) " , op_names_[i]);
238333 TUNABLE_LOG3 (" ├──offset at " , offset);
239334 WarmUp (candidate, reusable_params, warmup_iter, offset);
240- auto duration_ms = Profile (candidate, reusable_params, tuning_iter, offset);
241- if (duration_ms < min_duration_ms) {
242- TUNABLE_LOG3 (" ├──found better instance id=" , i, " . " , duration_ms, " ms. " , op_names_[i]);
243- min_duration_ms = duration_ms;
335+ s = ProfileStats (candidate, reusable_params, tuning_iter, offset);
336+ auto s_stddev = s.stddev ();
337+ // Assume normal distribution.
338+ // Solution with smallest mean + 2*sigma will be a better solution?
339+ // if ((s._mean + 2*s_stddev) < (min_duration_ms + 2*min_stddev_ms)) {
340+ if (s._mean < min_duration_ms) {
341+ TUNABLE_LOG3 (" ├──found better instance id=" , i, " . " , s._mean , " ms. " , op_names_[i],
342+ " min " , s._min ,
343+ " max " , s._max ,
344+ " mean " , s._mean ,
345+ " std " , s_stddev);
346+ min_duration_ms = s._mean ;
244347 id_name = op_names_[i];
245348 }
349+ else {
350+ TUNABLE_LOG3 (" ├──found slower instance id=" , i, " . " , s._mean , " ms. " , op_names_[i],
351+ " min " , s._min ,
352+ " max " , s._max ,
353+ " mean " , s._mean ,
354+ " std " , s_stddev);
355+ }
246356 }
247357
248358 for (size_t i = 0 ; i < reusable_params.size (); i++) {
0 commit comments