@@ -297,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0;
297297struct ProfilingInfo {
298298 std::string op_name;
299299 std::string kernel_name;
300- // Kernel execution time in nanoseconds.
301- cl_ulong duration_ns;
300+
301+ cl_kernel kernel;
302+ cl_event evt;
303+
304+ cl_ulong cmd_queued;
305+ cl_ulong cmd_submit;
306+ cl_ulong cmd_start;
307+ cl_ulong cmd_end;
308+ cl_ulong overhead_start;
309+ cl_ulong overhead_end;
310+ // For the times below, see spec for clGetEventProfilingInfo
311+ // The time kernel spent in cmd queue - SUBMIT - QUEUED
312+ cl_ulong cmd_queued_duration_ns;
313+ // The time kernel spent for submission - START - SUBMIT
314+ cl_ulong cmd_submit_duration_ns;
315+ // Kernel execution time in nanoseconds - END - START
316+ cl_ulong cmd_duration_ns;
317+ // The time for the kernel to complete - COMPLETE - END
318+ cl_ulong cmd_complete_duration_ns;
319+ // Total time to finish the kernel - COMPELTE - QUEUED
320+ cl_ulong cmd_total_duration_ns;
302321 // Global and local work sizes.
303322 size_t global_size[3 ];
304323 size_t local_size[3 ];
@@ -903,19 +922,84 @@ static void ggml_cl2_free(void) {
903922 return ;
904923 }
905924
925+ // Populate profiling info
926+ for (ProfilingInfo & info : g_profiling_info) {
927+ cl_ulong cmd_queued;
928+ cl_ulong cmd_submit;
929+ cl_ulong cmd_start;
930+ cl_ulong cmd_end;
931+ cl_ulong cmd_complete;
932+
933+ CL_CHECK (clWaitForEvents (1 , &info.evt ));
934+ CL_CHECK (clGetEventProfilingInfo (
935+ info.evt , CL_PROFILING_COMMAND_QUEUED, sizeof (cl_ulong), &cmd_queued, NULL ));
936+ CL_CHECK (clGetEventProfilingInfo (
937+ info.evt , CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &cmd_submit, NULL ));
938+ CL_CHECK (clGetEventProfilingInfo (
939+ info.evt , CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &cmd_start, NULL ));
940+ CL_CHECK (clGetEventProfilingInfo (
941+ info.evt , CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &cmd_end, NULL ));
942+ CL_CHECK (clGetEventProfilingInfo (
943+ info.evt , CL_PROFILING_COMMAND_COMPLETE, sizeof (cl_ulong), &cmd_complete, NULL ));
944+ CL_CHECK (clReleaseEvent (info.evt ));
945+
946+ char kernel_name[512 ];
947+ CL_CHECK (clGetKernelInfo (info.kernel , CL_KERNEL_FUNCTION_NAME,
948+ sizeof (kernel_name), kernel_name, NULL ));
949+ info.kernel_name = kernel_name;
950+
951+ info.cmd_queued = cmd_queued;
952+ info.cmd_submit = cmd_submit;
953+ info.cmd_start = cmd_start;
954+ info.cmd_end = cmd_end;
955+
956+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
957+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
958+ info.cmd_duration_ns = cmd_end - cmd_start;
959+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
960+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
961+ }
962+
963+ // Dump a csv
906964 float total_kernel_time = 0 ;
907- fprintf (fperf, " op name, kernel name, duration (ms), global size, local size, output size\n " );
965+ fprintf (fperf, " op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n " );
908966 for (const ProfilingInfo & info : g_profiling_info) {
909- total_kernel_time += info.duration_ns /1 .e6f ;
910- fprintf (fperf, " %s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " ,
911- info.op_name .c_str (), info.kernel_name .c_str (), info.duration_ns /1 .e6f ,
967+ total_kernel_time += info.cmd_duration_ns /1 .e6f ;
968+ fprintf (fperf, " %s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " ,
969+ info.op_name .c_str (), info.kernel_name .c_str (),
970+ info.cmd_queued_duration_ns /1 .e6f ,
971+ info.cmd_submit_duration_ns /1 .e6f ,
972+ info.cmd_duration_ns /1 .e6f ,
973+ info.cmd_complete_duration_ns /1 .e6f ,
974+ info.cmd_total_duration_ns /1 .e6f ,
912975 info.global_size [0 ], info.global_size [1 ], info.global_size [2 ],
913976 info.local_size [0 ], info.local_size [2 ], info.local_size [2 ],
914977 info.output_size [0 ], info.output_size [1 ], info.output_size [2 ], info.output_size [3 ]);
915978 }
916979 fclose (fperf);
917980
918981 GGML_LOG_INFO (" ggml_opencl: total kernel time: %f\n " , total_kernel_time);
982+
983+ // Dump a simple chrome trace
984+ FILE* ftrace = fopen (" cl_trace.json" , " w" );
985+ if (!ftrace) {
986+ GGML_LOG_ERROR (" Failed to open cl_trace.json\n " );
987+ return ;
988+ }
989+
990+ fprintf (ftrace, " [\n " );
991+ for (const ProfilingInfo & info : g_profiling_info) {
992+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " ,
993+ info.kernel_name .c_str (), info.cmd_queued /1000 );
994+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " ,
995+ info.kernel_name .c_str (), info.cmd_submit /1000 );
996+
997+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " ,
998+ info.kernel_name .c_str (), info.cmd_start /1000 );
999+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " ,
1000+ info.kernel_name .c_str (), info.cmd_end /1000 );
1001+ }
1002+ fclose (ftrace);
9191003#endif
9201004}
9211005
@@ -2062,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
20622146// Profiling utility
20632147// ------------------------------------------------------------------------------
20642148#ifdef GGML_OPENCL_PROFILING
2065- void populateProfilingInfo (
2149+ static void populateProfilingInfo (
20662150 ProfilingInfo& info, cl_event evt, cl_kernel kernel,
20672151 size_t global_size[3 ], size_t local_size[3 ],
20682152 const ggml_tensor * tensor) {
2069- cl_ulong start;
2070- cl_ulong end;
2071- CL_CHECK (clWaitForEvents (1 , &evt));
2072- CL_CHECK (clGetEventProfilingInfo (
2073- evt, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL ));
2074- CL_CHECK (clGetEventProfilingInfo (
2075- evt, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL ));
2076-
2077- char kernel_name[512 ];
2078- CL_CHECK (clGetKernelInfo (kernel, CL_KERNEL_FUNCTION_NAME,
2079- sizeof (kernel_name), kernel_name, NULL ));
2080-
2081- info.duration_ns = end - start;
2082- info.op_name = tensor->name ;
2083- info.kernel_name = kernel_name;
2153+ info.op_name = tensor->name ;
2154+ info.kernel = kernel;
2155+ info.evt = evt;
2156+
20842157 info.local_size [0 ] = local_size[0 ];
20852158 info.local_size [1 ] = local_size[1 ];
20862159 info.local_size [2 ] = local_size[2 ];
0 commit comments