diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 14d9934fb1b73..efaf7f4790797 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -297,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0; struct ProfilingInfo { std::string op_name; std::string kernel_name; - // Kernel execution time in nanoseconds. - cl_ulong duration_ns; + + cl_kernel kernel; + cl_event evt; + + cl_ulong cmd_queued; + cl_ulong cmd_submit; + cl_ulong cmd_start; + cl_ulong cmd_end; + cl_ulong overhead_start; + cl_ulong overhead_end; + // For the times below, see spec for clGetEventProfilingInfo + // The time kernel spent in cmd queue - SUBMIT - QUEUED + cl_ulong cmd_queued_duration_ns; + // The time kernel spent for submission - START - SUBMIT + cl_ulong cmd_submit_duration_ns; + // Kernel execution time in nanoseconds - END - START + cl_ulong cmd_duration_ns; + // The time for the kernel to complete - COMPLETE - END + cl_ulong cmd_complete_duration_ns; + // Total time to finish the kernel - COMPELTE - QUEUED + cl_ulong cmd_total_duration_ns; // Global and local work sizes. size_t global_size[3]; size_t local_size[3]; @@ -903,12 +922,56 @@ static void ggml_cl2_free(void) { return; } + // Populate profiling info + for (ProfilingInfo & info : g_profiling_info) { + cl_ulong cmd_queued; + cl_ulong cmd_submit; + cl_ulong cmd_start; + cl_ulong cmd_end; + cl_ulong cmd_complete; + + CL_CHECK(clWaitForEvents(1, &info.evt)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); + CL_CHECK(clReleaseEvent(info.evt)); + + char kernel_name[512]; + CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, + sizeof(kernel_name), kernel_name, NULL)); + info.kernel_name = kernel_name; + + info.cmd_queued = cmd_queued; + info.cmd_submit = cmd_submit; + info.cmd_start = cmd_start; + info.cmd_end = cmd_end; + + info.cmd_queued_duration_ns = cmd_submit - cmd_queued; + info.cmd_submit_duration_ns = cmd_start - cmd_submit; + info.cmd_duration_ns = cmd_end - cmd_start; + info.cmd_complete_duration_ns = cmd_complete - cmd_end; + info.cmd_total_duration_ns = cmd_complete - cmd_queued; + } + + // Dump a csv float total_kernel_time = 0; - fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n"); + fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); for (const ProfilingInfo & info : g_profiling_info) { - total_kernel_time += info.duration_ns/1.e6f; - fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", - info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f, + total_kernel_time += info.cmd_duration_ns/1.e6f; + fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", + info.op_name.c_str(), info.kernel_name.c_str(), + info.cmd_queued_duration_ns/1.e6f, + info.cmd_submit_duration_ns/1.e6f, + info.cmd_duration_ns/1.e6f, + info.cmd_complete_duration_ns/1.e6f, + info.cmd_total_duration_ns/1.e6f, info.global_size[0], info.global_size[1], info.global_size[2], info.local_size[0], info.local_size[2], info.local_size[2], info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); @@ -916,6 +979,27 @@ static void ggml_cl2_free(void) { fclose(fperf); GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); + + // Dump a simple chrome trace + FILE* ftrace = fopen("cl_trace.json", "w"); + if (!ftrace) { + GGML_LOG_ERROR("Failed to open cl_trace.json\n"); + return; + } + + fprintf(ftrace, "[\n"); + for (const ProfilingInfo & info : g_profiling_info) { + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", + info.kernel_name.c_str(), info.cmd_queued/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", + info.kernel_name.c_str(), info.cmd_submit/1000); + + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_start/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_end/1000); + } + fclose(ftrace); #endif } @@ -2062,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso // Profiling utility //------------------------------------------------------------------------------ #ifdef GGML_OPENCL_PROFILING -void populateProfilingInfo( +static void populateProfilingInfo( ProfilingInfo& info, cl_event evt, cl_kernel kernel, size_t global_size[3], size_t local_size[3], const ggml_tensor * tensor) { - cl_ulong start; - cl_ulong end; - CL_CHECK(clWaitForEvents(1, &evt)); - CL_CHECK(clGetEventProfilingInfo( - evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL)); - CL_CHECK(clGetEventProfilingInfo( - evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); - - char kernel_name[512]; - CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, - sizeof(kernel_name), kernel_name, NULL)); - - info.duration_ns = end - start; - info.op_name = tensor->name; - info.kernel_name = kernel_name; + info.op_name = tensor->name; + info.kernel = kernel; + info.evt = evt; + info.local_size[0] = local_size[0]; info.local_size[1] = local_size[1]; info.local_size[2] = local_size[2];