From f3d14ecbda5572e1959be1ac5891ee2b17cb7a8e Mon Sep 17 00:00:00 2001 From: Roy Spliet Date: Fri, 26 Jul 2019 16:49:27 +0100 Subject: [PATCH 1/5] gpgpu-sim/shader: Calculate max pipeline depth when int is executed by SF. Fixes a problem exposed (but not introduced) by: d212d7e: take account of shfl latency Under the SM2_GTX480 configuration, a shfl instruction now results in an out-of- bounds write into a non-allocated pipeline slot, causing "random" crashes. While at it, tidy up a bit. There doesn't seem to be a strongly enforced coding convention, but three tabs for one level of identation seems a tad excessive for any style. Signed-off-by: Roy Spliet --- src/gpgpu-sim/shader.cc | 85 +++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 96ba385d9..4d22974e8 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -2998,47 +2998,50 @@ unsigned int shader_core_config::max_cta( const kernel_info_t &k ) const return result; } -void shader_core_config::set_pipeline_latency() { - - //calculate the max latency based on the input - - unsigned int_latency[6]; - unsigned fp_latency[5]; - unsigned dp_latency[5]; - unsigned sfu_latency; - unsigned tensor_latency; - - /* - * [0] ADD,SUB - * [1] MAX,Min - * [2] MUL - * [3] MAD - * [4] DIV - * [5] SHFL - */ - sscanf(opcode_latency_int, "%u,%u,%u,%u,%u,%u", - &int_latency[0],&int_latency[1],&int_latency[2], - &int_latency[3],&int_latency[4],&int_latency[5]); - sscanf(opcode_latency_fp, "%u,%u,%u,%u,%u", - &fp_latency[0],&fp_latency[1],&fp_latency[2], - &fp_latency[3],&fp_latency[4]); - sscanf(opcode_latency_dp, "%u,%u,%u,%u,%u", - &dp_latency[0],&dp_latency[1],&dp_latency[2], - &dp_latency[3],&dp_latency[4]); - sscanf(opcode_latency_sfu, "%u", - &sfu_latency); - sscanf(opcode_latency_tensor, "%u", - &tensor_latency); - - //all div operation are executed on sfu - //assume that the max latency are dp div or normal sfu_latency - max_sfu_latency = std::max(dp_latency[4],sfu_latency); - //assume that the max operation has the max latency - max_sp_latency = fp_latency[1]; - max_int_latency = std::max(int_latency[1],int_latency[5]); - max_dp_latency = dp_latency[1]; - max_tensor_core_latency = tensor_latency; - +void shader_core_config::set_pipeline_latency() +{ + // calculate the max latency based on the input + + unsigned int_latency[6]; + unsigned fp_latency[5]; + unsigned dp_latency[5]; + unsigned sfu_latency; + unsigned tensor_latency; + + /* + * [0] ADD,SUB + * [1] MAX,Min + * [2] MUL + * [3] MAD + * [4] DIV + * [5] SHFL + */ + sscanf(opcode_latency_int, "%u,%u,%u,%u,%u,%u", + &int_latency[0],&int_latency[1],&int_latency[2], + &int_latency[3],&int_latency[4],&int_latency[5]); + sscanf(opcode_latency_fp, "%u,%u,%u,%u,%u", + &fp_latency[0],&fp_latency[1],&fp_latency[2], + &fp_latency[3],&fp_latency[4]); + sscanf(opcode_latency_dp, "%u,%u,%u,%u,%u", + &dp_latency[0],&dp_latency[1],&dp_latency[2], + &dp_latency[3],&dp_latency[4]); + sscanf(opcode_latency_sfu, "%u", + &sfu_latency); + sscanf(opcode_latency_tensor, "%u", + &tensor_latency); + + // all div operation are executed on sfu + // assume that the max latency are dp div or normal sfu_latency + max_sfu_latency = std::max(dp_latency[4],sfu_latency); + // assume that the max operation has the max latency + max_sp_latency = fp_latency[1]; + max_int_latency = std::max(int_latency[1],int_latency[5]); + max_dp_latency = dp_latency[1]; + max_tensor_core_latency = tensor_latency; + + // Fermi GPUs have SP units that perform both FP and int arith. + if (gpgpu_num_int_units == 0) + max_sp_latency = std::max(max_sp_latency, max_int_latency); } void shader_core_ctx::cycle() From ac7bc416d3b676bfca0b151d7d3d0c3f62f07e2e Mon Sep 17 00:00:00 2001 From: Roy Spliet Date: Fri, 18 Oct 2019 11:30:25 +0100 Subject: [PATCH 2/5] libopencl: Define no_of_ptx The simplest fix to issue #138. Fixes linking error when building with OpenCL. For now this variable is unused. Signed-off-by: Roy Spliet --- libopencl/opencl_runtime_api.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libopencl/opencl_runtime_api.cc b/libopencl/opencl_runtime_api.cc index 752bfdf2e..7f24901ce 100644 --- a/libopencl/opencl_runtime_api.cc +++ b/libopencl/opencl_runtime_api.cc @@ -102,6 +102,9 @@ #include #include +/* Defined in src/cuda-sim/ptx_loader.h. Unused for OpenCL for now. */ +int no_of_ptx=0; + static void setErrCode(cl_int *errcode_ret, cl_int err_code) { if ( errcode_ret ) { *errcode_ret = err_code; From e8dc59c3319c2941373abb3ba1c60da3d672e85e Mon Sep 17 00:00:00 2001 From: Andrea Picciau Date: Tue, 7 Jun 2016 16:21:44 +0100 Subject: [PATCH 3/5] Add some OpenCL functions so that the C++ API can be used. --- libopencl/opencl_runtime_api.cc | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/libopencl/opencl_runtime_api.cc b/libopencl/opencl_runtime_api.cc index 7f24901ce..d631ec6c5 100644 --- a/libopencl/opencl_runtime_api.cc +++ b/libopencl/opencl_runtime_api.cc @@ -1170,6 +1170,7 @@ clGetDeviceInfo(cl_device_id device, case CL_DEVICE_GLOBAL_MEM_SIZE: CL_ULONG_CASE( 1024*1024*1024 ); break; case CL_DEVICE_MAX_COMPUTE_UNITS: CL_UINT_CASE( device->the_device()->get_config().num_shader() ); break; case CL_DEVICE_MAX_CLOCK_FREQUENCY: CL_UINT_CASE( device->the_device()->shader_clock() ); break; + case CL_DEVICE_PLATFORM: CL_STRING_CASE("GPGPU-Sim OpenCL platform"); break; case CL_DEVICE_VENDOR:CL_STRING_CASE("GPGPU-Sim.org"); break; case CL_DEVICE_VERSION: CL_STRING_CASE("OpenCL 1.0"); break; case CL_DRIVER_VERSION: CL_STRING_CASE("1.0"); break; @@ -1228,6 +1229,30 @@ clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0 return CL_SUCCESS; } +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 +{ + return CL_SUCCESS; +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 +{ + return CL_SUCCESS; +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 +{ + return CL_SUCCESS; +} + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_0 +{ + return CL_SUCCESS; +} + extern CL_API_ENTRY cl_int CL_API_CALL clGetProgramInfo(cl_program program, cl_program_info param_name, From a7f2d3164eba8d73627299e49e292db35c094f01 Mon Sep 17 00:00:00 2001 From: Roy Spliet Date: Fri, 26 Jul 2019 13:34:51 +0100 Subject: [PATCH 4/5] libopencl: Amend clGetDeviceInfo, make missing information non-fatal. Unbreaks clinfo largely. Signed-off-by: Roy Spliet --- libopencl/opencl_runtime_api.cc | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/libopencl/opencl_runtime_api.cc b/libopencl/opencl_runtime_api.cc index d631ec6c5..412112361 100644 --- a/libopencl/opencl_runtime_api.cc +++ b/libopencl/opencl_runtime_api.cc @@ -1169,12 +1169,15 @@ clGetDeviceInfo(cl_device_id device, case CL_DEVICE_NAME: CL_STRING_CASE( "GPGPU-Sim" ); break; case CL_DEVICE_GLOBAL_MEM_SIZE: CL_ULONG_CASE( 1024*1024*1024 ); break; case CL_DEVICE_MAX_COMPUTE_UNITS: CL_UINT_CASE( device->the_device()->get_config().num_shader() ); break; - case CL_DEVICE_MAX_CLOCK_FREQUENCY: CL_UINT_CASE( device->the_device()->shader_clock() ); break; + case CL_DEVICE_MAX_CLOCK_FREQUENCY: CL_UINT_CASE( device->the_device()->shader_clock() / 1000 ); break; case CL_DEVICE_PLATFORM: CL_STRING_CASE("GPGPU-Sim OpenCL platform"); break; case CL_DEVICE_VENDOR:CL_STRING_CASE("GPGPU-Sim.org"); break; + case CL_DEVICE_VENDOR_ID:CL_UINT_CASE( 0x1337 ); break; case CL_DEVICE_VERSION: CL_STRING_CASE("OpenCL 1.0"); break; + case CL_DEVICE_OPENCL_C_VERSION: CL_STRING_CASE("OpenCL C 1.0"); break; case CL_DRIVER_VERSION: CL_STRING_CASE("1.0"); break; case CL_DEVICE_TYPE: CL_CASE(cl_device_type, CL_DEVICE_TYPE_GPU); break; + case CL_DEVICE_PROFILE: CL_STRING_CASE("FULL_PROFILE"); break; case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: CL_INT_CASE( 3 ); break; case CL_DEVICE_MAX_WORK_ITEM_SIZES: if( param_value && param_value_size < 3*sizeof(size_t) ) return CL_INVALID_VALUE; \ @@ -1184,10 +1187,11 @@ clGetDeviceInfo(cl_device_id device, ((size_t*)param_value)[1] = n_thread_per_shader; ((size_t*)param_value)[2] = n_thread_per_shader; } - if( param_value_size_ret ) *param_value_size_ret = 3*sizeof(cl_uint); + if( param_value_size_ret ) *param_value_size_ret = 3*sizeof(size_t); break; case CL_DEVICE_MAX_WORK_GROUP_SIZE: CL_INT_CASE( device->the_device()->threads_per_core() ); break; case CL_DEVICE_ADDRESS_BITS: CL_INT_CASE( 32 ); break; + case CL_DEVICE_ENDIAN_LITTLE: CL_BOOL_CASE( CL_TRUE ); break; case CL_DEVICE_AVAILABLE: CL_BOOL_CASE( CL_TRUE ); break; case CL_DEVICE_COMPILER_AVAILABLE: CL_BOOL_CASE( CL_TRUE ); break; case CL_DEVICE_IMAGE_SUPPORT: CL_INT_CASE( CL_TRUE ); break; @@ -1209,16 +1213,17 @@ clGetDeviceInfo(cl_device_id device, if( param_value ) buf[0]=0; if( param_value_size_ret ) *param_value_size_ret = 1; break; - case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: CL_INT_CASE(1); break; - case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: CL_INT_CASE(1); break; - case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: CL_INT_CASE(1); break; - case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: CL_INT_CASE(1); break; - case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: CL_INT_CASE(1); break; - case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: CL_INT_CASE(0); break; + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: CL_UINT_CASE(1); break; + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: CL_UINT_CASE(1); break; + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: CL_UINT_CASE(1); break; + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: CL_UINT_CASE(1); break; + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: CL_UINT_CASE(1); break; + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: CL_UINT_CASE(0); break; + case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF: CL_UINT_CASE(1); break; case CL_DEVICE_SINGLE_FP_CONFIG: CL_INT_CASE(0); break; case CL_DEVICE_MEM_BASE_ADDR_ALIGN: CL_INT_CASE(256*8); break; default: - opencl_not_implemented(__my_func__,__LINE__); + return CL_INVALID_VALUE; } return CL_SUCCESS; } From 88fa886ebadf017ba126d039edaa53d773983bbf Mon Sep 17 00:00:00 2001 From: Roy Spliet Date: Tue, 30 Jul 2019 16:58:48 +0100 Subject: [PATCH 5/5] libopencl: Implement just enough event for getEventProfilingInfo() Helps programs report their own simulated time. Signed-off-by: Roy Spliet --- libopencl/opencl_runtime_api.cc | 105 ++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 11 deletions(-) diff --git a/libopencl/opencl_runtime_api.cc b/libopencl/opencl_runtime_api.cc index 412112361..4902e819f 100644 --- a/libopencl/opencl_runtime_api.cc +++ b/libopencl/opencl_runtime_api.cc @@ -172,6 +172,21 @@ struct _cl_mem { size_t m_size; }; +struct _cl_event { +private: + cl_ulong start; + cl_ulong cmd_end; + size_t refcount; +public: + _cl_event() : start(0), cmd_end(0), refcount(1) {} + cl_ulong getCmdEnd(void); + void setCmdEnd(cl_ulong); + cl_ulong getStart(void); + void setStart(cl_ulong); + void retain(void); + bool release(void); +}; + struct pgm_info { std::string m_source; std::string m_asm; @@ -358,6 +373,36 @@ _cl_mem::_cl_mem( } } +cl_ulong _cl_event::getCmdEnd( void ) +{ + return cmd_end; +} + +void _cl_event::setCmdEnd( cl_ulong e ) +{ + cmd_end = e; +} + +cl_ulong _cl_event::getStart( void ) +{ + return start; +} + +void _cl_event::setStart( cl_ulong s ) +{ + start = s; +} + +void _cl_event::retain( void ) +{ + refcount++; +} + +bool _cl_event::release( void ) +{ + return ((--refcount) <= 0); +} + _cl_context::_cl_context( struct _cl_device_id *gpu ) { m_uid = sm_context_uid++; @@ -735,15 +780,6 @@ clCreateProgramWithBinary(cl_context /* context */, return cl_program(); } -extern CL_API_ENTRY cl_int CL_API_CALL -clGetEventProfilingInfo(cl_event /* event */, - cl_profiling_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0{ - gpgpusim_opencl_warning(__my_func__,__LINE__, "GPGPUsim - OpenCLFunction is not implemented. Returning CL_SUCCESS"); - return CL_SUCCESS; -} /*******************************************************************************************************/ @@ -956,7 +992,13 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, if ( err_val != CL_SUCCESS ) return err_val; - gpgpu_t *gpu = command_queue->get_device()->the_device(); + gpgpu_sim *gpu = command_queue->get_device()->the_device(); + + if ( event ) { + *event = new _cl_event(); + event[0]->setStart((gpu_tot_sim_cycle * 1000000) / gpu->shader_clock()); + } + if (kernel->get_implementation()->get_ptx_version().ver() <3.0){ gpgpu_ptx_sim_memcpy_symbol( "%_global_size", _global_size, 3 * sizeof(int), 0, 1, gpu ); gpgpu_ptx_sim_memcpy_symbol( "%_work_dim", &work_dim, 1 * sizeof(int), 0, 1, gpu ); @@ -980,6 +1022,10 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, gpgpu_opencl_ptx_sim_main_func( grid ); else gpgpu_opencl_ptx_sim_main_perf( grid ); + + if ( event ) { + event[0]->setCmdEnd((gpu_tot_sim_cycle * 1000000) / gpu->shader_clock()); + } return CL_SUCCESS; } @@ -1243,6 +1289,10 @@ clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 extern CL_API_ENTRY cl_int CL_API_CALL clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 { + if ( !event ) + return CL_INVALID_EVENT; + + event->retain(); return CL_SUCCESS; } @@ -1258,6 +1308,33 @@ clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_0 return CL_SUCCESS; } +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0{ + if ( !event ) + return CL_INVALID_EVENT; + + switch (param_name) { + case CL_PROFILING_COMMAND_QUEUED: + case CL_PROFILING_COMMAND_SUBMIT: + case CL_PROFILING_COMMAND_START: + CL_ULONG_CASE( event->getStart() ); + break; + case CL_PROFILING_COMMAND_END: + CL_ULONG_CASE( event->getCmdEnd() ); + break; + default: + return CL_INVALID_VALUE; + break; + } + + //gpgpusim_opencl_warning(__my_func__,__LINE__, "GPGPUsim - OpenCLFunction is not implemented. Returning CL_SUCCESS"); + return CL_SUCCESS; +} + extern CL_API_ENTRY cl_int CL_API_CALL clGetProgramInfo(cl_program program, cl_program_info param_name, @@ -1407,8 +1484,14 @@ clWaitForEvents(cl_uint /* num_events */, } extern CL_API_ENTRY cl_int CL_API_CALL -clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0 +clReleaseEvent(cl_event e) CL_API_SUFFIX__VERSION_1_0 { + if ( e == nullptr ) + return CL_INVALID_EVENT; + + if ( e->release() ) + delete e; + return CL_SUCCESS; }