Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

opencl: improve profiling #12442

Merged
merged 4 commits into from
Mar 18, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 95 additions & 22 deletions ggml/src/ggml-opencl/ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0;
struct ProfilingInfo {
std::string op_name;
std::string kernel_name;
// Kernel execution time in nanoseconds.
cl_ulong duration_ns;

cl_kernel kernel;
cl_event evt;

cl_ulong cmd_queued;
cl_ulong cmd_submit;
cl_ulong cmd_start;
cl_ulong cmd_end;
cl_ulong overhead_start;
cl_ulong overhead_end;
// For the times below, see spec for clGetEventProfilingInfo
// The time kernel spent in cmd queue - SUBMIT - QUEUED
cl_ulong cmd_queued_duration_ns;
// The time kernel spent for submission - START - SUBMIT
cl_ulong cmd_submit_duration_ns;
// Kernel execution time in nanoseconds - END - START
cl_ulong cmd_duration_ns;
// The time for the kernel to complete - COMPLETE - END
cl_ulong cmd_complete_duration_ns;
// Total time to finish the kernel - COMPELTE - QUEUED
cl_ulong cmd_total_duration_ns;
// Global and local work sizes.
size_t global_size[3];
size_t local_size[3];
Expand Down Expand Up @@ -903,19 +922,84 @@ static void ggml_cl2_free(void) {
return;
}

// Populate profiling info
for (ProfilingInfo & info : g_profiling_info) {
cl_ulong cmd_queued;
cl_ulong cmd_submit;
cl_ulong cmd_start;
cl_ulong cmd_end;
cl_ulong cmd_complete;

CL_CHECK(clWaitForEvents(1, &info.evt));
CL_CHECK(clGetEventProfilingInfo(
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
CL_CHECK(clGetEventProfilingInfo(
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
CL_CHECK(clGetEventProfilingInfo(
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
CL_CHECK(clGetEventProfilingInfo(
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
CL_CHECK(clGetEventProfilingInfo(
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
CL_CHECK(clReleaseEvent(info.evt));

char kernel_name[512];
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
sizeof(kernel_name), kernel_name, NULL));
info.kernel_name = kernel_name;

info.cmd_queued = cmd_queued;
info.cmd_submit = cmd_submit;
info.cmd_start = cmd_start;
info.cmd_end = cmd_end;

info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
info.cmd_duration_ns = cmd_end - cmd_start;
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
}

// Dump a csv
float total_kernel_time = 0;
fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
for (const ProfilingInfo & info : g_profiling_info) {
total_kernel_time += info.duration_ns/1.e6f;
fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f,
total_kernel_time += info.cmd_duration_ns/1.e6f;
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
info.op_name.c_str(), info.kernel_name.c_str(),
info.cmd_queued_duration_ns/1.e6f,
info.cmd_submit_duration_ns/1.e6f,
info.cmd_duration_ns/1.e6f,
info.cmd_complete_duration_ns/1.e6f,
info.cmd_total_duration_ns/1.e6f,
info.global_size[0], info.global_size[1], info.global_size[2],
info.local_size[0], info.local_size[2], info.local_size[2],
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
}
fclose(fperf);

GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);

// Dump a simple chrome trace
FILE* ftrace = fopen("cl_trace.json", "w");
if (!ftrace) {
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
return;
}

fprintf(ftrace, "[\n");
for (const ProfilingInfo & info : g_profiling_info) {
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
info.kernel_name.c_str(), info.cmd_queued/1000);
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
info.kernel_name.c_str(), info.cmd_submit/1000);

fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
info.kernel_name.c_str(), info.cmd_start/1000);
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
info.kernel_name.c_str(), info.cmd_end/1000);
}
fclose(ftrace);
#endif
}

Expand Down Expand Up @@ -2062,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
// Profiling utility
//------------------------------------------------------------------------------
#ifdef GGML_OPENCL_PROFILING
void populateProfilingInfo(
static void populateProfilingInfo(
ProfilingInfo& info, cl_event evt, cl_kernel kernel,
size_t global_size[3], size_t local_size[3],
const ggml_tensor * tensor) {
cl_ulong start;
cl_ulong end;
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clGetEventProfilingInfo(
evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL));
CL_CHECK(clGetEventProfilingInfo(
evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));

char kernel_name[512];
CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
sizeof(kernel_name), kernel_name, NULL));

info.duration_ns = end - start;
info.op_name = tensor->name;
info.kernel_name = kernel_name;
info.op_name = tensor->name;
info.kernel = kernel;
info.evt = evt;

info.local_size[0] = local_size[0];
info.local_size[1] = local_size[1];
info.local_size[2] = local_size[2];
Expand Down
Loading