Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Runtime] Flush L2 cache in time eval #15305

Merged
merged 1 commit into from
Jul 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/tvm/runtime/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -579,13 +579,15 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* defined by `repeats_to_cooldown`.
* \param repeats_to_cooldown The number of repeats before the
* cooldown is activated.
* \param cache_flush_bytes The number of bytes to flush from cache before each repeat.
* \param f_preproc The function to be executed before we execute the time
* evaluator.
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
int limit_zero_time_iterations, int cooldown_interval_ms,
int repeats_to_cooldown, PackedFunc f_preproc = nullptr);
int repeats_to_cooldown, int cache_flush_bytes = 0,
PackedFunc f_preproc = nullptr);

} // namespace profiling
} // namespace runtime
Expand Down
5 changes: 5 additions & 0 deletions python/tvm/runtime/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def time_evaluator(
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
cache_flush_bytes=0,
f_preproc="",
):
"""Get an evaluator that measures time cost of running function.
Expand Down Expand Up @@ -358,6 +359,9 @@ def time_evaluator(
repeats_to_cooldown: int, optional
The number of repeats before the cooldown is activated.

cache_flush_bytes: int, optional
The number of bytes to flush from the cache before each repeat.

f_preproc: str, optional
The preprocess function name we want to execute before executing the time evaluator.

Expand All @@ -384,6 +388,7 @@ def time_evaluator(
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
cache_flush_bytes,
f_preproc,
)

Expand Down
5 changes: 3 additions & 2 deletions src/runtime/crt/common/crt_runtime_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -489,14 +489,15 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
int* ret_type_code) {
ret_val[0].v_handle = NULL;
ret_type_code[0] = kTVMNullptr;
if (num_args < 11) {
if (num_args < 12) {
TVMAPIErrorf("not enough args");
return kTvmErrorFunctionCallNumArguments;
}
if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr ||
type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt ||
type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt ||
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) {
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMArgInt ||
type_codes[11] != kTVMStr) {
TVMAPIErrorf("one or more invalid arg types");
return kTvmErrorFunctionCallWrongArgType;
}
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/graph_executor/debug/graph_executor_debug.cc
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ std::vector<double> GraphExecutorDebug::RunOpRPC(int index, int number, int repe
->
operator()(module_, name, static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, "");
repeats_to_cooldown, /*cache_flush_bytes=*/0, "");

int num_flat_args = num_inputs + num_outputs;
auto values = std::make_unique<TVMValue[]>(num_flat_args);
Expand Down
16 changes: 13 additions & 3 deletions src/runtime/profiling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction")

PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms,
int limit_zero_time_iterations, int cooldown_interval_ms,
int repeats_to_cooldown, PackedFunc f_preproc) {
int repeats_to_cooldown, int cache_flush_bytes, PackedFunc f_preproc) {
ICHECK(pf != nullptr);

if (static_cast<int>(dev.device_type) == static_cast<int>(kDLMicroDev)) {
Expand All @@ -871,13 +871,20 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
}

auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations,
cooldown_interval_ms, repeats_to_cooldown,
cooldown_interval_ms, repeats_to_cooldown, cache_flush_bytes,
f_preproc](TVMArgs args, TVMRetValue* rv) mutable {
TVMRetValue temp;
std::ostringstream os;
// skip first time call, to activate lazy compilation components.
pf.CallPacked(args, &temp);

// allocate two large arrays to flush L2 cache
NDArray arr1, arr2;
if (cache_flush_bytes > 0) {
arr1 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
arr2 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
}

DeviceAPI::Get(dev)->StreamSync(dev, nullptr);

for (int i = 0; i < repeat; ++i) {
Expand All @@ -892,7 +899,10 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
number = static_cast<int>(
std::max((min_repeat_ms / (duration_ms / number) + 1), number * golden_ratio));
}

if (cache_flush_bytes > 0) {
arr1.CopyFrom(arr2);
}
DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
// start timing
Timer t = Timer::Start(dev);
for (int j = 0; j < number; ++j) {
Expand Down
28 changes: 14 additions & 14 deletions src/runtime/rpc/rpc_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,23 +198,23 @@ class RPCModuleNode final : public ModuleNode {
PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat,
int min_repeat_ms, int limit_zero_time_iterations,
int cooldown_interval_ms, int repeats_to_cooldown,
const std::string& f_preproc_name) {
int cache_flush_bytes, const std::string& f_preproc_name) {
InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator");
// Remove session mask because we pass dev by parts.
ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index())
<< "ValueError: Need to pass the matched remote device to RPCModule.GetTimeEvaluator";
dev = RemoveRPCSessionMask(dev);

if (module_handle_ != nullptr) {
return remote_get_time_evaluator_(GetRef<Module>(this), name,
static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations,
cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
return remote_get_time_evaluator_(
GetRef<Module>(this), name, static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, cache_flush_bytes, f_preproc_name);
} else {
return remote_get_time_evaluator_(Optional<Module>(nullptr), name,
static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations,
cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
return remote_get_time_evaluator_(
Optional<Module>(nullptr), name, static_cast<int>(dev.device_type), dev.device_id, number,
repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, cache_flush_bytes, f_preproc_name);
}
}

Expand Down Expand Up @@ -253,7 +253,7 @@ class RPCModuleNode final : public ModuleNode {
std::shared_ptr<RPCSession> sess_;
// remote function to get time evaluator
TypedPackedFunc<PackedFunc(Optional<Module>, std::string, int, int, int, int, int, int, int, int,
std::string)>
int, std::string)>
remote_get_time_evaluator_;
// remote function getter for modules.
TypedPackedFunc<PackedFunc(Module, std::string, bool)> remote_mod_get_function_;
Expand Down Expand Up @@ -372,7 +372,7 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) {
TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
.set_body_typed([](Optional<Module> opt_mod, std::string name, int device_type, int device_id,
int number, int repeat, int min_repeat_ms, int limit_zero_time_iterations,
int cooldown_interval_ms, int repeats_to_cooldown,
int cooldown_interval_ms, int repeats_to_cooldown, int cache_flush_bytes,
std::string f_preproc_name) {
Device dev;
dev.device_type = static_cast<DLDeviceType>(device_type);
Expand All @@ -384,7 +384,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
return static_cast<RPCModuleNode*>(m.operator->())
->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, f_preproc_name);
repeats_to_cooldown, cache_flush_bytes, f_preproc_name);
} else {
PackedFunc f_preproc;
if (!f_preproc_name.empty()) {
Expand All @@ -397,7 +397,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry";
return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, f_preproc);
repeats_to_cooldown, cache_flush_bytes, f_preproc);
}
} else {
auto* pf = runtime::Registry::Get(name);
Expand All @@ -411,7 +411,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
}
return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations, cooldown_interval_ms,
repeats_to_cooldown, f_preproc);
repeats_to_cooldown, cache_flush_bytes, f_preproc);
}
});

Expand Down
2 changes: 1 addition & 1 deletion web/emcc/tvmjs_support.cc
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ class AsyncLocalSession : public LocalSession {
CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function";
(*time_exec)(TypedPackedFunc<void(int)>(finvoke), dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown,
on_complete);
/*cache_flush_bytes=*/0, on_complete);
};
return PackedFunc(ftimer);
}
Expand Down