From b4b6f85e7390299201c75b6bb1e595a2fd7bce8e Mon Sep 17 00:00:00 2001 From: spectrometerHBH Date: Wed, 12 Jul 2023 19:17:30 -0400 Subject: [PATCH] [Runtime] Flush L2 cache in time eval This PR introduces an optional cache flush functionality to `time_evaluator`. It is implemented by allocating two large empty NDArrays on the device so that the L2 cache is flushed. This gives us a more accurate evaluation of the performance of a runtime function. --- include/tvm/runtime/profiling.h | 4 ++- python/tvm/runtime/module.py | 5 ++++ src/runtime/crt/common/crt_runtime_api.c | 5 ++-- .../debug/graph_executor_debug.cc | 2 +- src/runtime/profiling.cc | 16 +++++++++-- src/runtime/rpc/rpc_module.cc | 28 +++++++++---------- web/emcc/tvmjs_support.cc | 2 +- 7 files changed, 40 insertions(+), 22 deletions(-) diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 3922ef76dfb7..2a41aeff6bff 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -579,13 +579,15 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i * defined by `repeats_to_cooldown`. * \param repeats_to_cooldown The number of repeats before the * cooldown is activated. + * \param cache_flush_bytes The number of bytes to flush from cache before each repeat. * \param f_preproc The function to be executed before we execute time * evaluator. * \return f_timer A timer function. 
*/ PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms, int limit_zero_time_iterations, int cooldown_interval_ms, - int repeats_to_cooldown, PackedFunc f_preproc = nullptr); + int repeats_to_cooldown, int cache_flush_bytes = 0, + PackedFunc f_preproc = nullptr); } // namespace profiling } // namespace runtime diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 82d82a0d463c..c9e3eb6add75 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -316,6 +316,7 @@ def time_evaluator( limit_zero_time_iterations=100, cooldown_interval_ms=0, repeats_to_cooldown=1, + cache_flush_bytes=0, f_preproc="", ): """Get an evaluator that measures time cost of running function. @@ -358,6 +359,9 @@ def time_evaluator( repeats_to_cooldown: int, optional The number of repeats before the cooldown is activated. + cache_flush_bytes: int, optional + The number of bytes to flush from the cache before each repeat. + f_preproc: str, optional The preprocess function name we want to execute before executing the time evaluator. 
@@ -384,6 +388,7 @@ def time_evaluator( limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, + cache_flush_bytes, f_preproc, ) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index b79e2247379a..a9c40c458322 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -489,14 +489,15 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re int* ret_type_code) { ret_val[0].v_handle = NULL; ret_type_code[0] = kTVMNullptr; - if (num_args < 11) { + if (num_args < 12) { TVMAPIErrorf("not enough args"); return kTvmErrorFunctionCallNumArguments; } if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr || type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt || type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt || - type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) { + type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMArgInt || + type_codes[11] != kTVMStr) { TVMAPIErrorf("one or more invalid arg types"); return kTvmErrorFunctionCallWrongArgType; } diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 94e27703b68c..0dbcbff46ff2 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -143,7 +143,7 @@ std::vector GraphExecutorDebug::RunOpRPC(int index, int number, int repe -> operator()(module_, name, static_cast(dev.device_type), dev.device_id, number, repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, - repeats_to_cooldown, ""); + repeats_to_cooldown, /*cache_flush_bytes=*/0, ""); int num_flat_args = num_inputs + num_outputs; auto values = std::make_unique(num_flat_args); diff --git 
a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 8b6600fbdfa9..2300c1a4e7c8 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -861,7 +861,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction") PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms, int limit_zero_time_iterations, int cooldown_interval_ms, - int repeats_to_cooldown, PackedFunc f_preproc) { + int repeats_to_cooldown, int cache_flush_bytes, PackedFunc f_preproc) { ICHECK(pf != nullptr); if (static_cast(dev.device_type) == static_cast(kDLMicroDev)) { @@ -871,13 +871,20 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, } auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations, - cooldown_interval_ms, repeats_to_cooldown, + cooldown_interval_ms, repeats_to_cooldown, cache_flush_bytes, f_preproc](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. 
pf.CallPacked(args, &temp); + // allocate two large arrays to flush L2 cache + NDArray arr1, arr2; + if (cache_flush_bytes > 0) { + arr1 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev); + arr2 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev); + } + DeviceAPI::Get(dev)->StreamSync(dev, nullptr); for (int i = 0; i < repeat; ++i) { @@ -892,7 +899,10 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, number = static_cast( std::max((min_repeat_ms / (duration_ms / number) + 1), number * golden_ratio)); } - + if (cache_flush_bytes > 0) { + arr1.CopyFrom(arr2); + } + DeviceAPI::Get(dev)->StreamSync(dev, nullptr); // start timing Timer t = Timer::Start(dev); for (int j = 0; j < number; ++j) { diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index d82a0cc4719a..d8ee2d4c769b 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -198,7 +198,7 @@ class RPCModuleNode final : public ModuleNode { PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat, int min_repeat_ms, int limit_zero_time_iterations, int cooldown_interval_ms, int repeats_to_cooldown, - const std::string& f_preproc_name) { + int cache_flush_bytes, const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass dev by parts. 
ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index()) @@ -206,15 +206,15 @@ class RPCModuleNode final : public ModuleNode { dev = RemoveRPCSessionMask(dev); if (module_handle_ != nullptr) { - return remote_get_time_evaluator_(GetRef(this), name, - static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, limit_zero_time_iterations, - cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); + return remote_get_time_evaluator_( + GetRef(this), name, static_cast(dev.device_type), dev.device_id, number, + repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown, cache_flush_bytes, f_preproc_name); } else { - return remote_get_time_evaluator_(Optional(nullptr), name, - static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, limit_zero_time_iterations, - cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); + return remote_get_time_evaluator_( + Optional(nullptr), name, static_cast(dev.device_type), dev.device_id, number, + repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown, cache_flush_bytes, f_preproc_name); } } @@ -253,7 +253,7 @@ class RPCModuleNode final : public ModuleNode { std::shared_ptr sess_; // remote function to get time evaluator TypedPackedFunc, std::string, int, int, int, int, int, int, int, int, - std::string)> + int, std::string)> remote_get_time_evaluator_; // remote function getter for modules. 
TypedPackedFunc remote_mod_get_function_; @@ -372,7 +372,7 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) { TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") .set_body_typed([](Optional opt_mod, std::string name, int device_type, int device_id, int number, int repeat, int min_repeat_ms, int limit_zero_time_iterations, - int cooldown_interval_ms, int repeats_to_cooldown, + int cooldown_interval_ms, int repeats_to_cooldown, int cache_flush_bytes, std::string f_preproc_name) { Device dev; dev.device_type = static_cast(device_type); @@ -384,7 +384,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") return static_cast(m.operator->()) ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, - repeats_to_cooldown, f_preproc_name); + repeats_to_cooldown, cache_flush_bytes, f_preproc_name); } else { PackedFunc f_preproc; if (!f_preproc_name.empty()) { @@ -397,7 +397,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry"; return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, - repeats_to_cooldown, f_preproc); + repeats_to_cooldown, cache_flush_bytes, f_preproc); } } else { auto* pf = runtime::Registry::Get(name); @@ -411,7 +411,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") } return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, - repeats_to_cooldown, f_preproc); + repeats_to_cooldown, cache_flush_bytes, f_preproc); } }); diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index 5bc1c32382ce..a314f08fb472 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -297,7 +297,7 @@ class AsyncLocalSession : public LocalSession { CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function"; (*time_exec)(TypedPackedFunc(finvoke), dev, 
number, repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, - on_complete); + /*cache_flush_bytes=*/0, on_complete); }; return PackedFunc(ftimer); }