Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CLI TOOLS][RTVM] Improve rtvm tool with new options to measure native performance #15818

Merged
merged 3 commits into from
Sep 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions apps/cpp_rtvm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ Command line usage
--input - Numpy file for the model input (optional; random input is used if not given)
--output - Numpy file name to dump the model output as numpy
--dump-meta - Dump model meta information
--pre-compiled - The file name of a file where pre-compiled programs should be stored
--profile - Profile over all execution
--dry-run - Profile after given dry runs, default 10
--run-count - Profile for given runs, default 50
--zero-copy - Profile with zero copy api

Example
./rtvm --model=keras-resnet50 --device="opencl" --dump-meta
Expand Down Expand Up @@ -366,3 +371,20 @@ stored. If the pre-compiled file name was passed to `rtvm`, then after method
`Load`, method `UsePreCompiledProgram` is called. This method loads pre-compiled
programs if the file exists. Otherwise the file will be created and
pre-compiled programs will be saved to this file.

# Performance Profiling Options
The tool adds a few options to measure the wall-clock performance of the given model natively on the target.
--profile : Turns on profiling
--dry-run : The number of times to dry-run the model before measuring the performance. Default value is 10.
--run-count : The number of times to run the model and take an average. Default value is 50.
--zero-copy: Enables the graph runtime zero-copy API for inputs and outputs instead of a byte copy to DLTensor.

The performance profile options dump an information summary as given below.
Module Load :27 ms
Graph Runtime Create :11 ms
Params Read :15 ms
Params Set :41 ms
Pre Compiled Progs Load :24 ms
Total Load Time :118 ms
Average ExecTime :27 ms
Unload Time :35.9236 ms
199 changes: 174 additions & 25 deletions apps/cpp_rtvm/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#endif
#include <dmlc/logging.h>

#include <chrono>
#include <cstring>
#include <iostream>
#include <sstream>
Expand All @@ -54,7 +55,11 @@ static const string kUsage =
"--input - Numpy file for the model input (optional and we use random of not given)\n"
"--output - Numpy file name to dump the model output as numpy\n"
"--dump-meta - Dump model meta information\n"
"--pre-compiled - The file name of a file where pre-compiled programs should be stored"
"--pre-compiled - The file name of a file where pre-compiled programs should be stored\n"
"--profile - Profile over all execution\n"
"--dry-run - Profile after given dry runs, default 10\n"
"--run-count - Profile for given runs, default 50\n"
"--zero-copy - Profile with zero copy api\n"
"\n"
" Example\n"
" ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
Expand All @@ -68,14 +73,19 @@ static const string kUsage =
* \arg input Numpy file for the model input
* \arg output Numpy file name to dump the model output as numpy
* \arg pre_compiled File name where pre-compiled programs should be stored
* \arg profile Do we profile overall execution
*/
struct ToolArgs {
string model;
string device;
string input;
string output;
string pre_compiled;
bool dump_meta = false;
bool dump_meta{false};
bool profile{false};
int dry_run{10};
int run_count{50};
bool zero_copy{false};
};

/*!
Expand All @@ -89,6 +99,10 @@ void PrintArgs(const ToolArgs& args) {
LOG(INFO) << "Output = " << args.output;
LOG(INFO) << "Pre-compiled = " << args.pre_compiled;
LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
LOG(INFO) << "Profile = " << ((args.profile) ? ("True") : ("False"));
LOG(INFO) << "Dry Run = " << args.dry_run;
LOG(INFO) << "Run Count = " << args.run_count;
LOG(INFO) << "Zero Copy = " << ((args.zero_copy) ? ("True") : ("False"));
}

#if defined(__linux__) || defined(__ANDROID__)
Expand Down Expand Up @@ -178,6 +192,26 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) {
}

args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled=");

const string pprofile = GetCmdOption(argc, argv, "--profile", true);
if (!pprofile.empty()) {
args.profile = true;
}

const string pdry_run = GetCmdOption(argc, argv, "--dry-run=");
if (!pdry_run.empty()) {
args.dry_run = stoi(pdry_run);
}

const string prun = GetCmdOption(argc, argv, "--run-count=");
if (!prun.empty()) {
args.run_count = stoi(prun);
}

const string pzcopy = GetCmdOption(argc, argv, "--zero-copy", true);
if (!pzcopy.empty()) {
args.zero_copy = true;
}
}

/*!
Expand All @@ -192,59 +226,174 @@ int ExecuteModel(ToolArgs& args) {
#endif

// Initialize TVM Runner
TVMRunner runner = TVMRunner(args.model, args.device);
auto runner = new TVMRunner(args.model, args.device);
srkreddy1238 marked this conversation as resolved.
Show resolved Hide resolved

// Load the model
runner.Load();
runner->Load();
if (!args.pre_compiled.empty()) {
runner.UsePreCompiledPrograms(args.pre_compiled);
runner->UsePreCompiledPrograms(args.pre_compiled);
}

// Query Model meta Information
TVMMetaInfo mInfo = runner.GetMetaInfo();
TVMMetaInfo mInfo = runner->GetMetaInfo();

// Print Meta Information
if (args.dump_meta) runner.PrintMetaInfo();
if (args.dump_meta) runner->PrintMetaInfo();

int total_exec_time = 0;

if (args.profile) {
if (args.dry_run) {
for (int ii = 0; ii < args.dry_run; ++ii) {
runner->Run();
}
TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
}
int total_time = 0;
std::map<std::string, NDArray> input_data_even, input_data_odd;
std::map<std::string, NDArray> output_data_even, output_data_odd;

std::map<std::string, char*> input_data;
std::map<std::string, char*> output_data;

// Alloc / populate and keep input data ready
for (auto& elem : mInfo.input_info) {
if (args.zero_copy) {
auto ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
input_data_even.insert({elem.first, ndarr});

ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
input_data_odd.insert({elem.first, ndarr});
} else {
char* data = (char*)malloc(runner->GetInputMemSize(elem.first));
input_data.insert({elem.first, data});
}
}

// Alloc and keep output bufers ready
for (auto& elem : mInfo.output_info) {
if (args.zero_copy) {
auto ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
output_data_even.insert({elem.first, ndarr});

ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
output_data_odd.insert({elem.first, ndarr});
} else {
char* data = (char*)malloc(runner->GetOutputMemSize(elem.first));
output_data.insert({elem.first, data});
}
}

for (int ii = 0; ii < args.run_count; ++ii) {
// Timer start
auto tstart = std::chrono::high_resolution_clock::now();
// Set random input for all input
for (auto& elem : mInfo.input_info) {
if (args.zero_copy) {
if (ii % 2) {
runner->SetInput(elem.first, input_data_even[elem.first]);
} else {
runner->SetInput(elem.first, input_data_odd[elem.first]);
}
} else {
runner->SetInput(elem.first, input_data[elem.first]);
}
}

if (args.zero_copy) {
// With zero copy set the result NDArray up front
for (auto& elem : mInfo.output_info) {
if (ii % 2) {
runner->SetOutput(elem.first, output_data_even[elem.first]);
} else {
runner->SetOutput(elem.first, output_data_odd[elem.first]);
}
}
}

if (args.input.empty() || args.output.empty()) {
// Run the model
runner->Run();

if (!args.zero_copy) {
// W/o zero copy we need to invoke explicite data copy
for (auto& elem : mInfo.output_info) {
runner->GetOutput(elem.first, output_data[elem.first]);
}
} else {
// Just wait for the run to complete.
TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
}

// Timer end
auto tend = std::chrono::high_resolution_clock::now();
LOG(INFO) << "Exec Time:" << static_cast<double>((tend - tstart).count()) / 1e6;
total_exec_time += static_cast<double>((tend - tstart).count()) / 1e6;
}

// Free input bufers
for (auto& elem : mInfo.input_info) {
free(input_data[elem.first]);
}

// Free output bufers
for (auto& elem : mInfo.output_info) {
free(output_data[elem.first]);
}
} else if (!args.input.empty() && !args.output.empty()) {
LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output;
// Set Input from Numpy Input
runner->SetInput(args.input);
// Run the model
runner->Run();
// Get Output as Numpy dump
runner->GetOutput(args.output);
} else {
LOG(INFO) << "Executing dry run ... ";
// Set random input for all inputs
for (auto& elem : mInfo.input_info) {
LOG(INFO) << "Set Random Input for :" << elem.first;
auto shape = elem.second.first;
size_t ssize = runner.GetInputMemSize(elem.first);
size_t ssize = runner->GetInputMemSize(elem.first);
char* data = (char*)malloc(ssize);
LOG(INFO) << "Random Input Size:" << ssize << " bytes";
runner.SetInput(elem.first, data);
runner->SetInput(elem.first, data);
free(data);
}

// Run the model
runner.Run();

runner->Run();
// Get Output and dump few values
for (auto& elem : mInfo.output_info) {
LOG(INFO) << "Get Output for :" << elem.first;
auto shape = elem.second.first;
size_t ssize = runner.GetOutputMemSize(elem.first);
size_t ssize = runner->GetOutputMemSize(elem.first);
char* data = (char*)malloc(ssize);
runner.GetOutput(elem.first, data);
runner->GetOutput(elem.first, data);
LOG(INFO) << "Output Size:" << ssize << " bytes";
free(data);
}
} else {
LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output;

// Set Input from Numpy Input
runner.SetInput(args.input);

// Run the model
runner.Run();
}

// Get Output as Numpy dump
runner.GetOutput(args.output);
if (args.profile) {
// Print Stats
runner->PrintStats();
}
auto tstart = std::chrono::high_resolution_clock::now();
delete runner;
auto tend = std::chrono::high_resolution_clock::now();

if (args.profile) {
LOG(INFO) << "Average ExecTime :" << total_exec_time / args.run_count << " ms";
LOG(INFO) << "Unload Time :" << static_cast<double>((tend - tstart).count()) / 1e6
<< " ms";
}
return 0;
}

Expand Down
Loading