diff --git a/brian2cuda/brianlib/cudaVector.h b/brian2cuda/brianlib/cudaVector.h index 1bf30fd8..959abe56 100644 --- a/brian2cuda/brianlib/cudaVector.h +++ b/brian2cuda/brianlib/cudaVector.h @@ -3,6 +3,7 @@ #include #include +#include "cuda_utils.h" /* * current memory allocation strategy: @@ -35,7 +36,8 @@ class cudaVector } else { - printf("ERROR while creating cudaVector with size %ld in cudaVector.h (constructor)\n", sizeof(scalar)*INITIAL_SIZE); + LOG_CUDA_ERROR("While creating cudaVector with size %ld in cudaVector.h" + "(constructor)\n", sizeof(scalar)*INITIAL_SIZE); assert(m_data != NULL); } } @@ -56,7 +58,8 @@ class cudaVector if (index < 0 || index >= m_size) { // TODO: check for proper exception throwing in cuda kernels - printf("ERROR returning a reference to index %d in cudaVector::at() (size = %u)\n", index, m_size); + LOG_CUDA_ERROR("Returning a reference to index %d in cudaVector::at()" + "(size = %u)\n", index, m_size); assert(index < m_size); } return m_data[index]; @@ -85,7 +88,7 @@ class cudaVector } else { - printf("ERROR invalid index %d, must be in range 0 - %d\n", pos, m_size); + LOG_CUDA_ERROR("Invalid index %d, must be in range 0 - %d\n", pos, m_size); assert(pos <= m_size); } }; @@ -134,7 +137,8 @@ class cudaVector } else { - printf("ERROR while allocating %ld bytes in cudaVector.h/reserve()\n", sizeof(scalar)*new_capacity); + LOG_CUDA_ERROR("While allocating %ld bytes in cudaVector.h/reserve()\n", + sizeof(scalar)*new_capacity); assert(new_data != NULL); } } diff --git a/brian2cuda/brianlib/cuda_utils.h b/brian2cuda/brianlib/cuda_utils.h index b8154d23..b4db094d 100644 --- a/brian2cuda/brianlib/cuda_utils.h +++ b/brian2cuda/brianlib/cuda_utils.h @@ -1,10 +1,5 @@ #ifndef BRIAN2CUDA_ERROR_CHECK_H #define BRIAN2CUDA_ERROR_CHECK_H -#include -#include -#include "objects.h" -#include "curand.h" - // Define this to turn on error checking #define BRIAN2CUDA_ERROR_CHECK // Define this to synchronize device before checking errors @@ -15,6 +10,80 @@ // Define this to synchronize device before checking memory //#define BRIAN2CUDA_MEMORY_CHECK_BLOCKING +// Choose which LOG macros to define based on LOG_LEVEL_ macro defined +// during compilation, is one of DEBUG, INFO, WARNING, ERROR, CRITICAL +// For now we treat CRITICAL the same as ERROR + +// TODO Could make this with less code with if !defined? Though this is easier +// to understand. +#if defined LOG_LEVEL_CRITICAL || defined LOG_LEVEL_ERROR + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR +#elif defined LOG_LEVEL_WARNING + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR + #define DEF_LOG_WARNING +#elif defined LOG_LEVEL_INFO + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR + #define DEF_LOG_WARNING + #define DEF_LOG_INFO +#elif defined LOG_LEVEL_DEBUG + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR + #define DEF_LOG_WARNING + #define DEF_LOG_INFO + #define DEF_LOG_DEBUG +#elif defined LOG_LEVEL_DIAGNOSTIC + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR + #define DEF_LOG_WARNING + #define DEF_LOG_INFO + #define DEF_LOG_DEBUG + #define DEF_LOG_DIAGNOSTIC +#endif + +// DEFINE the LOG macros as printf statements or no_ops if not defined +// LOG_CUDA_ERROR is the only macro usable in device code currently and will +// be printed to stdout when CUDA ring buffer is flushed at host/device +// serialization (this sometimes does not happen when the program crashes). +// TODO: All other LOG macros could in principle be redirected to the Brian2 +// log file via fprintf (not implemented yet) +#ifdef DEF_LOG_CUDA_ERROR + #define LOG_CUDA_ERROR(fmt, ...) printf("GPU ERROR\t" fmt, __VA_ARGS__) +#else + #define LOG_CUDA_ERROR(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_ERROR + #define LOG_ERROR(fmt, ...) printf("CUDA ERROR\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_ERROR(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_WARNING + #define LOG_WARNING(fmt, ...) printf("CUDA WARNING\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_WARNING(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_INFO + #define LOG_INFO(fmt, ...) printf("CUDA INFO\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_INFO(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_DEBUG + #define LOG_DEBUG(fmt, ...) printf("CUDA DEBUG\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_DEBUG(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_DIAGNOSTIC + #define LOG_DIAGNOSTIC(fmt, ...) printf("CUDA DIAGNOSTIC\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_DIAGNOSTIC(fmt, ...) do {} while(0) +#endif // partly adapted from https://gist.github.com/ashwin/2652488 #define CUDA_SAFE_CALL(err) _cudaSafeCall(err, __FILE__, __LINE__, #err) @@ -23,6 +92,11 @@ #define THRUST_CHECK_ERROR(code) { try {code;} \ catch(...) {_thrustCheckError(__FILE__, __LINE__, #code);} } +// Place includes after macro definitions to avoid circular includes +#include +#include +#include "objects.h" +#include "curand.h" // adapted from NVIDIA cuda samples, shipped with cuda 10.1 (common/inc/helper_cuda.h) #ifdef CURAND_H_ @@ -79,8 +153,8 @@ inline void _cudaSafeCall(cudaError err, const char *file, const int line, const #ifdef BRIAN2CUDA_ERROR_CHECK if (cudaSuccess != err) { - fprintf(stderr, "ERROR: %s failed at %s:%i : %s\n", - call, file, line, cudaGetErrorString(err)); + LOG_ERROR("%s failed at %s:%i : %s\n", + call, file, line, cudaGetErrorString(err)); exit(-1); } #endif @@ -94,8 +168,8 @@ inline void _cudaSafeCall(curandStatus_t err, const char *file, const int line, #ifdef BRIAN2CUDA_ERROR_CHECK if (CURAND_STATUS_SUCCESS != err) { - fprintf(stderr, "ERROR: %s failed at %s:%i : %s\n", - call, file, line, _curandGetErrorEnum(err)); + LOG_ERROR("%s failed at %s:%i : %s\n", + call, file, line, _curandGetErrorEnum(err)); exit(-1); } #endif @@ -111,8 +185,8 @@ inline void _cudaCheckError(const char *file, const int line, const char *msg) cudaError err = cudaDeviceSynchronize(); if(cudaSuccess != err) { - fprintf(stderr, "ERROR: CUDA_CHECK_ERROR() failed after %s at %s:%i : %s\n", - msg, file, line, cudaGetErrorString(err)); + LOG_ERROR("CUDA_CHECK_ERROR() failed after %s at %s:%i : %s\n", + msg, file, line, cudaGetErrorString(err)); exit(-1); } #else @@ -120,8 +194,8 @@ inline void _cudaCheckError(const char *file, const int line, const char *msg) cudaError err = cudaGetLastError(); if (cudaSuccess != err) { - fprintf(stderr, "ERROR: CUDA_CHECK_ERROR() failed at %s:%i : %s\n", - file, line, cudaGetErrorString(err)); + LOG_ERROR("CUDA_CHECK_ERROR() failed at %s:%i : %s\n", + file, line, cudaGetErrorString(err)); exit(-1); } @@ -153,16 +227,16 @@ inline void _cudaCheckMemory(const char *file, const int line) // newly requested allocation. if (diff > 0) { - fprintf(stdout, "INFO: cuda device memory usage in %s:%i\n" - "\t used: \t %f MB\n" - "\t avail: \t %f MB\n" - "\t total: \t %f MB\n" - "\t diff: \t %f MB \t (%zu bytes)\n", - file, line, - double(used) * to_MB, - double(avail) * to_MB, - double(total) * to_MB, - double(diff) * to_MB, diff); + LOG_DEBUG("CUDA device memory usage in %s:%i\n" + "\t\t\t used: \t %f MB\n" + "\t\t\t avail: \t %f MB\n" + "\t\t\t total: \t %f MB\n" + "\t\t\t diff: \t %f MB \t (%zu bytes)\n", + file, line, + double(used) * to_MB, + double(avail) * to_MB, + double(total) * to_MB, + double(diff) * to_MB, diff); brian::used_device_memory = used; } #endif @@ -172,7 +246,7 @@ inline void _cudaCheckMemory(const char *file, const int line) inline void _thrustCheckError(const char *file, const int line, const char *code) { - fprintf(stderr, "ERROR: THRUST_CHECK_ERROR() caught an exception from %s at %s:%i\n", + LOG_ERROR("THRUST_CHECK_ERROR() caught an exception from %s at %s:%i\n", code, file, line); throw; } diff --git a/brian2cuda/brianlib/curand_buffer.h b/brian2cuda/brianlib/curand_buffer.h index 54f1a379..0b7c5529 100644 --- a/brian2cuda/brianlib/curand_buffer.h +++ b/brian2cuda/brianlib/curand_buffer.h @@ -39,9 +39,9 @@ class CurandBuffer { if (current_idx != buffer_size && memory_allocated) { - printf("WARNING: CurandBuffer::generate_numbers() called before " - "buffer was empty (current_idx = %u, buffer_size = %u)", - current_idx, buffer_size); + LOG_WARNING("CurandBuffer::generate_numbers() called before " + "buffer was empty (current_idx = %u, buffer_size = %u)", + current_idx, buffer_size); } // TODO: should we allocate the memory in the constructor (even if we end up not using it)? if (!memory_allocated) @@ -50,16 +50,16 @@ class CurandBuffer host_data = new randomNumber_t[buffer_size]; if (!host_data) { - printf("ERROR allocating host_data for CurandBuffer (size %ld)\n", sizeof(randomNumber_t)*buffer_size); + LOG_ERROR("Allocating host_data for CurandBuffer (size %ld)\n", sizeof(randomNumber_t)*buffer_size); exit(EXIT_FAILURE); } // allocate device memory cudaError_t status = cudaMalloc((void **)&dev_data, buffer_size*sizeof(randomNumber_t)); if (status != cudaSuccess) { - printf("ERROR allocating memory on device (size = %ld) in %s(%d):\n\t%s\n", - buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, - cudaGetErrorString(status)); + LOG_ERROR("Allocating memory on device (size = %ld) in %s(%d):\n\t\t\t%s\n", + buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, + cudaGetErrorString(status)); exit(EXIT_FAILURE); } memory_allocated = true; @@ -70,7 +70,7 @@ class CurandBuffer curandStatus_t status = generateUniform(*generator, dev_data, buffer_size); if (status != CURAND_STATUS_SUCCESS) { - printf("ERROR generating random numbers in %s(%d):\n", __FILE__, __LINE__); + LOG_ERROR("Generating random numbers in %s(%d):\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } } @@ -79,8 +79,8 @@ class CurandBuffer curandStatus_t status = generateNormal(*generator, dev_data, buffer_size, 0, 1); if (status != CURAND_STATUS_SUCCESS) { - printf("ERROR generating normal distributed random numbers in %s(%d):\n", - __FILE__, __LINE__); + LOG_ERROR("Generating normal distributed random numbers in %s(%d):\n", + __FILE__, __LINE__); exit(EXIT_FAILURE); } } @@ -88,9 +88,9 @@ class CurandBuffer cudaError_t status = cudaMemcpy(host_data, dev_data, buffer_size*sizeof(randomNumber_t), cudaMemcpyDeviceToHost); if (status != cudaSuccess) { - printf("ERROR copying device to host memory (size = %ld) in %s(%d):\n\t%s\n", - buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, - cudaGetErrorString(status)); + LOG_ERROR("Copying device to host memory (size = %ld) in %s(%d):\n\t\t\t%s\n", + buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, + cudaGetErrorString(status)); exit(EXIT_FAILURE); } // reset buffer index @@ -99,14 +99,14 @@ class CurandBuffer curandStatus_t generateUniform(curandGenerator_t generator, randomNumber_t *outputPtr, size_t num) { - printf("ERROR curand can only generate random numbers as 'float' or 'double' types.\n"); + LOG_ERROR("%s", "Curand can only generate random numbers as 'float' or 'double' types.\n"); exit(EXIT_FAILURE); } curandStatus_t generateNormal(curandGenerator_t generator, randomNumber_t *outputPtr, size_t n, randomNumber_t mean, randomNumber_t stddev) { - printf("ERROR curand can only generate random numbers as 'float' or 'double' types.\n"); + LOG_ERROR("%s", "Curand can only generate random numbers as 'float' or 'double' types.\n"); exit(EXIT_FAILURE); } @@ -140,7 +140,7 @@ class CurandBuffer cudaError_t status = cudaFree(dev_data); if (status != cudaSuccess) { - printf("ERROR freeing device memory in %s(%d):%s\n", + LOG_ERROR("Freeing device memory in %s(%d):%s\n", __FILE__, __LINE__, cudaGetErrorString(status)); exit(EXIT_FAILURE); } diff --git a/brian2cuda/brianlib/spikequeue.h b/brian2cuda/brianlib/spikequeue.h index b26ee63a..ae4233e8 100644 --- a/brian2cuda/brianlib/spikequeue.h +++ b/brian2cuda/brianlib/spikequeue.h @@ -118,9 +118,9 @@ class CudaSpikeQueue synapses_queue = new cuda_vector*[required_num_queues]; if (!synapses_queue) { - printf("ERROR while allocating memory with size %ld in" - " spikequeue.h/prepare()\n", - sizeof(cuda_vector*) * required_num_queues); + LOG_CUDA_ERROR("While allocating memory with size %ld in" + " spikequeue.h/prepare()\n", + sizeof(cuda_vector*) * required_num_queues); } // only reset queue offset if we require new queues, in which // case we copy the old queues such that the offset is reset @@ -191,9 +191,9 @@ class CudaSpikeQueue synapses_queue[i] = new cuda_vector[num_blocks]; if (!synapses_queue[i]) { - printf("ERROR while allocating memory with size %ld in" - " spikequeue.h/prepare()\n", - sizeof(cuda_vector)*num_blocks); + LOG_CUDA_ERROR("While allocating memory with size %ld in" + " spikequeue.h/prepare()\n", + sizeof(cuda_vector)*num_blocks); } } } diff --git a/brian2cuda/cuda_prefs.py b/brian2cuda/cuda_prefs.py index 7bcb3998..331878ec 100644 --- a/brian2cuda/cuda_prefs.py +++ b/brian2cuda/cuda_prefs.py @@ -40,6 +40,11 @@ def validate_bundle_size_expression(string): 'devices.cuda_standalone', 'Brian2CUDA preferences', + helpful=BrianPreference( + default=False, + docs="Give basic information message to help new Brian2CUDA users", + validator=lambda v: v in [True, False]), + SM_multiplier = BrianPreference( default=1, docs='The number of blocks per SM. By default, this value is set to 1.', diff --git a/brian2cuda/device.py b/brian2cuda/device.py index 1c36d067..70d728c2 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -18,13 +18,13 @@ from brian2.core.preferences import prefs, PreferenceError from brian2.core.variables import ArrayVariable, DynamicArrayVariable, Constant from brian2.parsing.rendering import CPPNodeRenderer -from brian2.devices.device import all_devices +from brian2.devices.device import all_devices, get_device from brian2.synapses.synapses import Synapses, SynapticPathway from brian2.utils.filetools import copy_directory, ensure_directory from brian2.utils.stringtools import get_identifiers, stripped_deindented_lines from brian2.codegen.generators.cpp_generator import c_data_type from brian2.utils.logger import get_logger -from brian2.units import second +from brian2.units import second, msecond from brian2.monitors import SpikeMonitor, StateMonitor, EventMonitor from brian2.groups import Subgroup @@ -80,10 +80,6 @@ def __init__(self): self.include_dirs.remove('brianlib/randomkit') self.library_dirs.remove('brianlib/randomkit') - # Add code line slots used in our benchmarks - # TODO: Add to brian2 and remove here - self.code_lines.update({'before_network_run': [], - 'after_network_run': []}) ### Attributes specific to CUDAStandaloneDevice: # only true during first run call (relevant for synaptic pre/post ID deletion) self.first_run = True @@ -132,6 +128,13 @@ def __init__(self): # List of names of all variables which are only required on host and will not # be copied to device memory self.variables_on_host_only = [] + # Report self.timers + self.report_timers = True + self.timers_file = None + self.timers['run_binary'] = { + 'initialization': None, 'simulation_loop': None, 'finalization': None + } + def get_array_name(self, var, access_data=True, prefix=None): ''' @@ -702,7 +705,8 @@ def generate_main_source(self, writer): report_func=self.report_func, dt=float(defaultclock.dt), user_headers=user_headers, - gpu_heap_size=prefs['devices.cuda_standalone.cuda_backend.gpu_heap_size'] + gpu_heap_size=prefs['devices.cuda_standalone.cuda_backend.gpu_heap_size'], + helpful=prefs.devices.cuda_standalone.helpful ) writer.write('main.cu', main_tmp) @@ -1178,14 +1182,14 @@ def generate_makefile(self, writer, cpp_compiler, cpp_compiler_flags, cpp_linker # Log compiled GPU architecture if self.compute_capability is None: - logger.info( + logger.debug( f"Compiling device code with manually set architecture flags " f"({gpu_arch_flags}). Be aware that the minimal supported compute " f"capability is {self.minimal_compute_capability} " "(we are not checking your compile flags)" ) else: - logger.info( + logger.debug( f"Compiling device code for compute capability " f"{self.compute_capability} (compiler flags: {gpu_arch_flags})" ) @@ -1208,10 +1212,14 @@ def generate_makefile(self, writer, cpp_compiler, cpp_compiler_flags, cpp_linker compiler_debug_flags = '' linker_debug_flags = '' + # TODO: CHECK THIS IS WORKING? if disable_asserts: # NDEBUG precompiler macro disables asserts (both for C++ and CUDA) nvcc_compiler_flags += ['-NDEBUG'] + # Set brian2cuda standalone log leven based ot Brian2 log level + nvcc_compiler_flags += [f'-DLOG_LEVEL_{prefs["logging.console_log_level"].upper()}'] + makefile_tmp = self.code_object_class().templater.makefile( None, None, source_files=' '.join(sorted(writer.source_files)), @@ -1470,10 +1478,10 @@ def build(self, directory='output', for net in self.networks: net.after_run() - logger.info("Using the following preferences for CUDA standalone:") + logger.debug("Using the following preferences for CUDA standalone:") for pref_name in prefs: if "devices.cuda_standalone" in pref_name: - logger.info(f"\t{pref_name} = {prefs[pref_name]}") + logger.debug(f"\t{pref_name} = {prefs[pref_name]}") logger.debug("Using the following brian preferences:") for pref_name in prefs: @@ -1481,9 +1489,51 @@ def build(self, directory='output', logger.debug(f"\t{pref_name} = {prefs[pref_name]}") if compile: + if prefs.devices.cuda_standalone.helpful: + logger.info("Compiling CUDA standalone project...") + else: + logger.debug("Compiling CUDA standalone project...") self.compile_source(directory, cpp_compiler, debug, clean) if run: + if prefs.devices.cuda_standalone.helpful: + logger.info("Running CUDA standalone simulation...") + else: + logger.debug("Running CUDA standalone simulation...") self.run(directory, with_output, run_args) + if self.report_timers: + # Read standalone timers from file, using same code we used for + # benchmarks in brian2/tests/features/base.py (brian2.diff) + assert self.timers_file is not None + # Load timers from standalone project + cpp_timers = {} + timers_file_path = os.path.join(directory, self.timers_file) + if os.path.exists(timers_file_path): + with open(timers_file_path, "r") as f: + for line in f.readlines(): + name, time = line.split() + if time == "None": + time = None + else: + # We record in microseconds, convert to seconds + time = float(time) / 1e6 + cpp_timers[name] = time + else: + logger.error( + f"timers_file_path not found at {timers_file_path}" + ) + + self.timers['run_binary']['initialization'] = ( + cpp_timers['before_network_run'] - cpp_timers['before_start'] + ) + self.timers['run_binary']['simulation_loop'] = ( + cpp_timers['after_network_run'] + - cpp_timers['before_network_run'] + ) + self.timers['run_binary']['finalization'] = ( + cpp_timers['after_end'] - cpp_timers['after_network_run'] + ) + + print(f"\n{computation_time_summary()}") def network_run(self, net, duration, report=None, report_period=10*second, namespace=None, profile=False, level=0, **kwds): @@ -1510,6 +1560,9 @@ def network_run(self, net, duration, report=None, report_period=10*second, for clock in net._clocks: clock.set_interval(net.t, t_end) + if 'report_timers' in self.build_options: + self.report_timers = self.build_options.pop('report_timers') + # Get the local namespace if namespace is None: namespace = get_local_namespace(level=level+2) @@ -1671,11 +1724,46 @@ def network_run(self, net, duration, report=None, report_period=10*second, if clock not in all_clocks: run_lines.append(f'{net.name}.add(&{clock.name}, NULL);') + # Insert timer code when reporting timers is enabled (this needs to happen + # before the before/after_network_run slots are added to run_lines + if self.report_timers: + # Need chrono header for timing functions + prefs.codegen.cpp.headers += [""] + from .tests.features.cuda_configuration import ( + SETUP_TIMER, TIME_DIFF, CLOSE_TIMER + ) + # Insert code for timers, file path is relative to main.cu + self.timers_file = os.path.join('results', 'timers') + self.insert_code("before_start", SETUP_TIMER.format(fname=self.timers_file)) + self.insert_code("before_start", TIME_DIFF.format(name="before_start")) + self.insert_code( + "before_network_run", TIME_DIFF.format(name="before_network_run") + ) + self.insert_code( + "after_network_run", TIME_DIFF.format(name="after_network_run") + ) + self.insert_code("after_end", TIME_DIFF.format(name="after_end")) + self.insert_code("after_end", CLOSE_TIMER) + run_lines.extend(self.code_lines['before_network_run']) + + if prefs.devices.cuda_standalone.helpful: + start_sim = r'LOG_INFO("%s", "Starting simulation loop...\n");' + else: + start_sim = r'LOG_DEBUG("%s", "Starting simulation loop...\n");' + run_lines.append(start_sim) + # run everything that is run on a clock run_lines.append( f'{net.name}.run({float(duration)!r}, {report_call}, {float(report_period)!r});' ) + + if prefs.devices.cuda_standalone.helpful: + start_fin = r'LOG_INFO("%s", "Finalizing standalone simulation...\n");' + else: + start_fin = r'LOG_DEBUG("%s", "Finalizing standalone simulation...\n");' + run_lines.append(start_fin) + run_lines.extend(self.code_lines['after_network_run']) # for multiple runs, the random number buffer needs to be reset run_lines.append('random_number_buffer.run_finished();') @@ -1732,6 +1820,7 @@ def network_run(self, net, duration, report=None, report_period=10*second, self.build(direct_call=False, **self.build_options) self.first_run = False + def fill_with_array(self, var, *args, **kwargs): # If the delay variable is set after the first run call, do not delete it on the # device (which is happening by default) @@ -1758,6 +1847,75 @@ def network_restore(self, net, *args, **kwds): 'supported in CUDA standalone')) +class ComputationTimeSummary(object): + """ + Class to nicely display the contribution of different computations times. Objects of + this class are returned by `computation_time_summary`. + """ + def __init__(self, timers): + if timers is not None: + names = timers.keys() + times = timers.values() + else: # Happens if report_timers is False + # Use a dummy entry to prevent problems with empty lists later + names = ['no computation tims have been recorded'] + times = [0*second] + self.total_time = sum(times) + self.time_unit = msecond + if self.total_time>1*second: + self.time_unit = second + if self.total_time>0*second: + self.percentages = [100.0*time/self.total_time for time in times] + else: + self.percentages = [0. for _ in times] + self.names_maxlen = max(len(name) for name in names) + self.names = [name+' '*(self.names_maxlen-len(name)) for name in names] + self.times = times + + def __repr__(self): + times = [f'{time / self.time_unit:.2f} {self.time_unit}' for time in self.times] + times_maxlen = max(len(time) for time in times) + times = [' '*(times_maxlen-len(time))+time for time in times] + percentages = [f'{percentage:.2f} %' for percentage in self.percentages] + percentages_maxlen = max(len(percentage) for percentage in percentages) + percentages = [(' '*(percentages_maxlen-len(percentage)))+percentage for percentage in percentages] + + s = 'Computation time summary' + s += f"\n{'=' * len(s)}\n" + for name, time, percentage in zip(self.names, times, percentages): + s += f'{name} {time} {percentage}\n' + return s + + def _repr_html_(self): + times = [f'{time / self.time_unit:.2f} {self.time_unit}' for time in self.times] + percentages = [f'{percentage:.2f} %' for percentage in self.percentages] + s = '

Computation time summary

\n' + s += '\n' + for name, time, percentage in zip(self.names, times, percentages): + s += '' + s += f'' + s += f'' + s += f'' + s += '\n' + s += '
{name}{time}{percentage}
' + return s + + +def computation_time_summary(): + """ + Returns a `ComputationTimeSummary` of the profiling info for a run. This object + can be transformed to a string explicitly but on an interactive console + simply calling `profiling_summary` is enough since it will + automatically convert the `ProfilingSummary` object. + """ + device_timers = get_device().timers + pretty_times = { + 'Compilation': sum(filter(None, device_timers['compile'].values())) * second, + 'Initialization': device_timers['run_binary']['initialization'] * second, + 'Simulation loop': device_timers['run_binary']['simulation_loop'] * second, + 'Finalization': device_timers['run_binary']['finalization'] * second + } + return ComputationTimeSummary(pretty_times) def prepare_codeobj_code_for_rng(codeobj): ''' diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu index 447db27b..0bb0e368 100644 --- a/brian2cuda/templates/common_group.cu +++ b/brian2cuda/templates/common_group.cu @@ -79,19 +79,19 @@ namespace { // functions works. Hacky, hacky ... randomNumber_t _host_rand(const int _vectorisation_idx) { - printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + LOG_ERROR("Called dummy function `_host_rand` in %s:%d\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } randomNumber_t _host_randn(const int _vectorisation_idx) { - printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + LOG_ERROR("Called dummy function `_host_rand` in %s:%d\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } int32_t _host_poisson(double _lambda, const int _vectorisation_idx) { - printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + LOG_ERROR("Called dummy function `_host_poisson` in %s:%d\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } @@ -246,17 +246,17 @@ void _run_{{codeobj_name}}() { // use the max num_threads before launch failure num_threads = funcAttrib.maxThreadsPerBlock; - printf("WARNING Not enough ressources available to call " - "_run_kernel_{{codeobj_name}} " - "with maximum possible threads per block (%u). " - "Reducing num_threads to %u. (Kernel needs %i " - "registers per block, %i bytes of " - "statically-allocated shared memory per block, %i " - "bytes of local memory per thread and a total of %i " - "bytes of user-allocated constant memory)\n", - max_threads_per_block, num_threads, funcAttrib.numRegs, - funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, - funcAttrib.constSizeBytes); + LOG_WARNING("Not enough ressources available to call " + "_run_kernel_{{codeobj_name}} " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); {% block update_occupancy %} // calculate theoretical occupancy for new num_threads @@ -276,26 +276,26 @@ void _run_{{codeobj_name}}() {% block kernel_info %} else { - printf("INFO _run_kernel_{{codeobj_name}}\n" - {% block kernel_info_num_blocks_str %} - "\t%u blocks\n" - {% endblock %} - "\t%u threads\n" - "\t%i registers per thread\n" - "\t%i bytes statically-allocated shared memory per block\n" - "\t%i bytes local memory per thread\n" - "\t%i bytes user-allocated constant memory\n" - {% if calc_occupancy %} - "\t%.3f theoretical occupancy\n", - {% else %} - "", - {% endif %} - {% block kernel_info_num_blocks_var %} - num_blocks, - {% endblock %} - num_threads, funcAttrib.numRegs, - funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, - funcAttrib.constSizeBytes{% if calc_occupancy %}, occupancy{% endif %}); + LOG_DEBUG("_run_kernel_{{codeobj_name}}\n" + {% block kernel_info_num_blocks_str %} + "\t\t\t%u blocks\n" + {% endblock %} + "\t\t\t%u threads\n" + "\t\t\t%i registers per thread\n" + "\t\t\t%i bytes statically-allocated shared memory per block\n" + "\t\t\t%i bytes local memory per thread\n" + "\t\t\t%i bytes user-allocated constant memory\n" + {% if calc_occupancy %} + "\t\t\t%.3f theoretical occupancy\n", + {% else %} + "", + {% endif %} + {% block kernel_info_num_blocks_var %} + num_blocks, + {% endblock %} + num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes{% if calc_occupancy %}, occupancy{% endif %}); } {% endblock %} first_run = false; diff --git a/brian2cuda/templates/main.cu b/brian2cuda/templates/main.cu index eb617c5b..dd9e08b5 100644 --- a/brian2cuda/templates/main.cu +++ b/brian2cuda/templates/main.cu @@ -27,14 +27,17 @@ int main(int argc, char **argv) { + {% if helpful %} + LOG_INFO("%s", "Initializing standalone simulation...\n"); + {% else %} + LOG_DEBUG("%s", "Initializing standalone simulation...\n"); {{'\n'.join(code_lines['before_start'])|autoindent}} + {% endif %} // seed variable set in Python through brian2.seed() calls can use this // variable (see device.py CUDAStandaloneDevice.generate_main_source()) unsigned long long seed; - //const std::clock_t _start_time = std::clock(); - CUDA_SAFE_CALL( cudaSetDevice({{gpu_id}}) ); @@ -51,9 +54,6 @@ int main(int argc, char **argv) cudaDeviceSynchronize() ); - //const double _run_time2 = (double)(std::clock() -_start_time)/CLOCKS_PER_SEC; - //printf("INFO: setting cudaDevice stuff took %f seconds\n", _run_time2); - brian_start(); {{'\n'.join(code_lines['after_start'])|autoindent}} @@ -65,16 +65,9 @@ int main(int argc, char **argv) {{main_lines|autoindent}} } - //const double _run_time3 = (double)(std::clock() -_start_time3)/CLOCKS_PER_SEC; - //printf("INFO: main_lines took %f seconds\n", _run_time3); - {{'\n'.join(code_lines['before_end'])|autoindent}} brian_end(); {{'\n'.join(code_lines['after_end'])|autoindent}} - // Profiling - //const double _run_time = (double)(std::clock() -_start_time)/CLOCKS_PER_SEC; - //printf("INFO: main function took %f seconds\n", _run_time); - return 0; } diff --git a/brian2cuda/templates/objects.cu b/brian2cuda/templates/objects.cu index 1eed57df..d6f1e880 100644 --- a/brian2cuda/templates/objects.cu +++ b/brian2cuda/templates/objects.cu @@ -326,13 +326,15 @@ void _init_arrays() {% endfor %} CUDA_CHECK_MEMORY(); +#ifdef DEF_LOG_DEBUG const double to_MB = 1.0 / (1024.0 * 1024.0); double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; - std::cout << "INFO: _init_arrays() took " << time_passed << "s"; + std::cout << "CUDA DEBUG: _init_arrays() took " << time_passed << "s"; if (tot_memory_MB > 0) std::cout << " and used " << tot_memory_MB << "MB of device memory."; std::cout << std::endl; +#endif } void _load_arrays() @@ -351,7 +353,7 @@ void _load_arrays() {% endif %} } else { - std::cout << "Error opening static array {{name}}." << endl; + LOG_ERROR("%s", "Error opening static array {{name}}.\n"); } {% if not (name in dynamic_array_specs.values()) %} CUDA_SAFE_CALL( @@ -389,7 +391,7 @@ void _write_arrays() outfile_{{varname}}.close(); } else { - std::cout << "Error writing output file for {{varname}}." << endl; + LOG_ERROR("%s", "Error writing output file for {{varname}}.\n"); } {% endif %} {% endfor %} @@ -406,7 +408,7 @@ void _write_arrays() outfile_{{varname}}.close(); } else { - std::cout << "Error writing output file for {{varname}}." << endl; + LOG_ERROR("%s", "Error writing output file for {{varname}}.\n"); } {% endfor %} @@ -443,7 +445,7 @@ void _write_arrays() outfile_{{varname}}.close(); } else { - std::cout << "Error writing output file for {{varname}}." << endl; + LOG_ERROR("%s", "Error writing output file for {{varname}}.\n"); } {% endfor %} @@ -471,7 +473,7 @@ void _write_arrays() outfile_profiling_info.close(); } else { - std::cout << "Error writing profiling info to file." << std::endl; + LOG_ERROR("%s", "Error writing profiling info to file.\n"); } {% endif %} // Write last run info to disk @@ -483,7 +485,7 @@ void _write_arrays() outfile_last_run_info.close(); } else { - std::cout << "Error writing last run info to file." << std::endl; + LOG_ERROR("%s", "Error writing last run info to file.\n"); } } diff --git a/brian2cuda/templates/rand.cu b/brian2cuda/templates/rand.cu index 08208d5a..793f7d16 100644 --- a/brian2cuda/templates/rand.cu +++ b/brian2cuda/templates/rand.cu @@ -116,16 +116,16 @@ void RandomNumberBuffer::init() num_per_gen_{{type}}_{{co.name}} = num_per_gen_{{type}}_{{co.name}} + 1; } - // make sure that we don't use more memory then available + // make sure that we don't use more memory than available // this checks per codeobject the number of generated floats against total available floats while (num_free_floats < num_per_gen_{{type}}_{{co.name}}) { - printf("INFO not enough memory available to generate %i random numbers for {{co.name}}, reducing the buffer size\n", num_free_floats); + LOG_DEBUG("Not enough memory available to generate %i random numbers for {{co.name}}, reducing the buffer size\n", num_free_floats); if (num_per_gen_{{type}}_{{co.name}} < num_per_cycle_{{type}}_{{co.name}}) { if (num_free_floats < num_per_cycle_{{type}}_{{co.name}}) { - printf("ERROR not enough memory to generate random numbers for {{co.name}} %s:%d\n", __FILE__, __LINE__); + LOG_ERROR("Not enough memory to generate random numbers for {{co.name}} %s:%d\n", __FILE__, __LINE__); _dealloc_arrays(); exit(1); } @@ -137,7 +137,7 @@ void RandomNumberBuffer::init() } num_per_gen_{{type}}_{{co.name}} /= 2; } - printf("INFO generating %i {{type}} every %i clock cycles for {{co.name}}\n", num_per_gen_{{type}}_{{co.name}}, {{type}}_interval_{{co.name}}); + LOG_DEBUG("Generating %i {{type}} every %i clock cycles for {{co.name}}\n", num_per_gen_{{type}}_{{co.name}}, {{type}}_interval_{{co.name}}); {% if type in ['rand', 'randn'] %} {% set dtype = "randomNumber_t" %} @@ -162,9 +162,9 @@ void RandomNumberBuffer::init() { // TODO: find a way to deal with this? E.g. looping over buffers sorted // by buffer size and reducing them until it fits. - printf("MEMORY ERROR: Trying to generate more random numbers than fit " - "into available memory. Please report this as an issue on " - "GitHub: https://github.com/brian-team/brian2cuda/issues/new"); + LOG_ERROR("%s", "MEMORY ERROR: Trying to generate more random numbers than fit " + "into available memory. Please report this as an issue on " + "GitHub: https://github.com/brian-team/brian2cuda/issues/new"); _dealloc_arrays(); exit(1); } @@ -384,7 +384,6 @@ void RandomNumberBuffer::refill_poisson_numbers( { // generate poisson distributed random numbers and reset buffer index - printf("num_per_gen_poisson %d, lambda %f\n", num_per_gen_poisson, lambda); CUDA_SAFE_CALL( curandGeneratePoisson(curand_generator, dev_poisson_allocator, num_per_gen_poisson, lambda) ); diff --git a/brian2cuda/templates/spatialstateupdate.cu b/brian2cuda/templates/spatialstateupdate.cu index e10c5d82..4eb99ccb 100644 --- a/brian2cuda/templates/spatialstateupdate.cu +++ b/brian2cuda/templates/spatialstateupdate.cu @@ -510,14 +510,14 @@ __global__ void _currents_kernel_{{codeobj_name}}( float occupancy_currents = (max_active_blocks_currents * num_threads_currents / num_threads_per_warp) / (float)(max_threads_per_sm / num_threads_per_warp); - printf("INFO _currents\n_kernel_{{codeobj_name}}" - "\t%u blocks\n" - "\t%u threads\n" - "\t%i registers per thread\n" - "\t%i bytes statically-allocated shared memory per block\n" - "\t%i bytes local memory per thread\n" - "\t%i bytes user-allocated constant memory\n" - "\t%.3f theoretical occupancy\n", + LOG_DEBUG("_currents\n_kernel_{{codeobj_name}}" + "\t\t\t%u blocks\n" + "\t\t\t%u threads\n" + "\t\t\t%i registers per thread\n" + "\t\t\t%i bytes statically-allocated shared memory per block\n" + "\t\t\t%i bytes local memory per thread\n" + "\t\t\t%i bytes user-allocated constant memory\n" + "\t\t\t%.3f theoretical occupancy\n", num_blocks_currents, num_threads_currents, funcAttrib_currents.numRegs, funcAttrib_currents.sharedSizeBytes, funcAttrib_currents.localSizeBytes, funcAttrib_currents.constSizeBytes, occupancy_currents); diff --git a/brian2cuda/templates/spikemonitor.cu b/brian2cuda/templates/spikemonitor.cu index a7bc390d..55ca990c 100644 --- a/brian2cuda/templates/spikemonitor.cu +++ b/brian2cuda/templates/spikemonitor.cu @@ -204,7 +204,7 @@ void _debugmsg_{{codeobj_name}}() // HOST_CONSTANTS %HOST_CONSTANTS% - printf("Number of spikes: %d\n", _array_{{owner.name}}_N[0]); + LOG_DEBUG("Number of spikes: %d\n", _array_{{owner.name}}_N[0]); } {% endblock %} diff --git a/brian2cuda/templates/synapses.cu b/brian2cuda/templates/synapses.cu index 3c1aee84..eea4008f 100644 --- a/brian2cuda/templates/synapses.cu +++ b/brian2cuda/templates/synapses.cu @@ -211,7 +211,7 @@ num_threads = max_threads_per_block; {% if bundle_mode %} //num_threads_per_bundle = {{pathway.name}}_bundle_size_max; num_threads_per_bundle = getThreadsPerBundle(); -printf("INFO _run_kernel_{{codeobj_name}}: Using %d threads per bundle\n", num_threads_per_bundle); +LOG_DEBUG("_run_kernel_{{codeobj_name}}: Using %d threads per bundle\n", num_threads_per_bundle); {% endif %} num_loops = 1; @@ -231,12 +231,12 @@ if (!{{owner.name}}_multiple_pre_post){ {% endif %} } else { - printf("WARNING: Detected multiple synapses for same (pre, post) neuron " - "pair in Synapses object ``{{owner.name}}`` and no atomic operations are used. " - "Falling back to serialised effect application for SynapticPathway " - "``{{pathway.name}}``. This will be slow. You can avoid serialisation " - "by separating this Synapses object into multiple Synapses objects " - "with at most one connection between the same (pre, post) neuron pair.\n"); + LOG_WARNING("%s", "Detected multiple synapses for same (pre, post) neuron " + "pair in Synapses object ``{{owner.name}}`` and no atomic operations are used. " + "Falling back to serialised effect application for SynapticPathway " + "``{{pathway.name}}``. This will be slow. You can avoid serialisation " + "by separating this Synapses object into multiple Synapses objects " + "with at most one connection between the same (pre, post) neuron pair.\n"); } if (num_threads > max_threads_per_block) num_threads = max_threads_per_block; @@ -255,7 +255,7 @@ num_threads_per_bundle = 1; num_loops = num_parallel_blocks; {% else %} -printf("ERROR: got unknown 'synaptic_effects' mode ({{synaptic_effects}})\n"); +LOG_ERROR("%s", "Got unknown 'synaptic_effects' mode ({{synaptic_effects}})\n"); _dealloc_arrays(); exit(1); {% endif %} @@ -265,7 +265,7 @@ exit(1); {% block extra_info_msg %} else if ({{pathway.name}}_max_size <= 0) { - printf("INFO there are no synapses in the {{pathway.name}} pathway. Skipping synapses_push and synapses kernels.\n"); + LOG_DEBUG("%s", "There are no synapses in the {{pathway.name}} pathway. Skipping synapses_push and synapses kernels.\n"); } {% endblock %} @@ -325,7 +325,7 @@ if ({{pathway.name}}_max_size > 0) void _debugmsg_{{codeobj_name}}() { using namespace brian; - std::cout << "Number of synapses: " << {{constant_or_scalar('N', variables['N'])}} << endl; + LOG_DEBUG("Number of synapses: %d\n", {{constant_or_scalar('N', variables['N'])}}); } {% endblock %} diff --git a/brian2cuda/templates/synapses_create_array.cu b/brian2cuda/templates/synapses_create_array.cu index 712054e2..a5262a49 100644 --- a/brian2cuda/templates/synapses_create_array.cu +++ b/brian2cuda/templates/synapses_create_array.cu @@ -35,13 +35,15 @@ size_t used_device_memory_start = used_device_memory; {% block profiling_stop %} CUDA_CHECK_MEMORY(); +#ifdef DEF_LOG_DEBUG const double to_MB = 1.0 / (1024.0 * 1024.0); double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; -std::cout << "INFO: {{owner.name}} creation took " << time_passed << "s"; +std::cout << "CUDA DEBUG: {{owner.name}} creation took " << time_passed << "s"; if (tot_memory_MB > 0) std::cout << " and used " << tot_memory_MB << "MB of device memory."; std::cout << std::endl; +#endif {% endblock %} {% block host_maincode %} @@ -112,7 +114,6 @@ for (int _i=0; _i 1) { {{owner.name}}_multiple_pre_post = true; diff --git a/brian2cuda/templates/synapses_create_generator.cu b/brian2cuda/templates/synapses_create_generator.cu index 7056d635..32ecfae6 100644 --- a/brian2cuda/templates/synapses_create_generator.cu +++ b/brian2cuda/templates/synapses_create_generator.cu @@ -152,13 +152,15 @@ size_t used_device_memory_start = used_device_memory; {% block profiling_stop %} CUDA_CHECK_MEMORY(); +#ifdef DEF_LOG_DEBUG const double to_MB = 1.0 / (1024.0 * 1024.0); double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; -std::cout << "INFO: {{owner.name}} creation took " << time_passed << "s"; +std::cout << "CUDA DEBUG: {{owner.name}} creation took " << time_passed << "s"; if (tot_memory_MB > 0) std::cout << " and used " << tot_memory_MB << "MB of memory."; std::cout << std::endl; +#endif {% endblock %} {% block host_maincode %} @@ -260,8 +262,8 @@ std::cout << std::endl; {% if skip_if_invalid %} _uiter_size = _n_total; {% else %} - cout << "Error: Requested sample size " << _uiter_size << " is bigger than the " << - "population size " << _n_total << "." << endl; + LOG_ERROR("Error: Requested sample size %ld is bigger than the population " + "size %d.\n", _uiter_size, _n_total); exit(1); {% endif %} } else if (_uiter_size < 0) @@ -269,7 +271,7 @@ std::cout << std::endl; {% if skip_if_invalid %} continue; {% else %} - cout << "Error: Requested sample size " << _uiter_size << " is negative." << endl; + LOG_ERROR("Error: Requested sample size %ld is negative.\n", _uiter_size); exit(1); {% endif %} } else if (_uiter_size == 0) @@ -352,8 +354,9 @@ std::cout << std::endl; {% if skip_if_invalid %} continue; {% else %} - cout << "Error: tried to create synapse to neuron {{result_index}}=" << _{{result_index}} << " outside range 0 to " << - _{{result_index_size}}-1 << endl; + LOG_ERROR("Error: tried to create synapse to neuron " + "{{result_index}}=%ld outside range 0 to %d\n", + _{{result_index}}, _{{result_index_size}}-1); exit(1); {% endif %} } @@ -376,8 +379,9 @@ std::cout << std::endl; {% if skip_if_invalid %} continue; {% else %} - cout << "Error: tried to create synapse to neuron {{result_index}}=" << _{{result_index}} << - " outside range 0 to " << _{{result_index_size}}-1 << endl; + LOG_ERROR("Error: tried to create synapse to neuron " + "{{result_index}}=%ld outside range 0 to %d\n", + _{{result_index}}, _{{result_index_size}}-1); exit(1); {% endif %} } @@ -438,7 +442,6 @@ std::cout << std::endl; {{dynamic_multisynaptic_idx}}[_i] = source_target_count[source_target]; {% endif %} source_target_count[source_target]++; - //printf("source target count = %i\n", source_target_count[source_target]); if (source_target_count[source_target] > 1) { {{owner.name}}_multiple_pre_post = true; diff --git a/brian2cuda/templates/synapses_push_spikes.cu b/brian2cuda/templates/synapses_push_spikes.cu index 08dee42e..c15c480e 100644 --- a/brian2cuda/templates/synapses_push_spikes.cu +++ b/brian2cuda/templates/synapses_push_spikes.cu @@ -188,8 +188,8 @@ __global__ void _before_run_kernel_{{codeobj_name}}( return; } else if (syn_N_check > INT_MAX){ - printf("ERROR: There are more Synapses (%lu) than an int can " - "hold on this system (%u).\n", syn_N_check, INT_MAX); + LOG_ERROR("There are more Synapses (%lu) than an int can " + "hold on this system (%u).\n", syn_N_check, INT_MAX); } // total number of synapses int syn_N = (int)syn_N_check; @@ -573,8 +573,8 @@ __global__ void _before_run_kernel_{{codeobj_name}}( sum_num_elements += num_elements; updateMeanStd(count_num_elements, mean_num_elements, M2_num_elements, num_elements); } // end for loop through connectivity matrix - printf("INFO connectivity matrix has size %i, number of (pre neuron ID, post neuron block) pairs is %u\n", - size_connectivity_matrix, num_pre_post_blocks); + LOG_DEBUG("Connectivity matrix has size %i, number of (pre neuron ID, post neuron block) pairs is %u\n", + size_connectivity_matrix, num_pre_post_blocks); {# If we have don't have heterogeneous delays, we just need to copy the synapse IDs and number of synapses per (preID, postBlock) to the device #} @@ -799,29 +799,30 @@ __global__ void _before_run_kernel_{{codeobj_name}}( double std_num_unique_elements = getStd(count_num_unique_elements, M2_num_unique_elements); {% endif %}{# not no_or_const_delay_mode #} +#ifdef DEF_LOG_DEBUG // print memory information std::cout.precision(1); std::cout.setf(std::ios::fixed, std::ios::floatfield); - std::cout << "INFO: synapse statistics and memory usage for {{owner.name}}:\n" - << "\tnumber of synapses: " << syn_N << "\n" + std::cout << "CUDA DEBUG\tSynapse statistics and memory usage for {{owner.name}}:\n" + << "\t\t\tnumber of synapses: " << syn_N << "\n" {% if not no_or_const_delay_mode and bundle_mode %} - << "\tnumber of bundles: " << num_bundle_ids << "\n" + << "\t\t\tnumber of bundles: " << num_bundle_ids << "\n" {% endif %} - << "\tnumber of pre/post blocks: " << num_pre_post_blocks << "\n" - << "\tnumber of synapses over all pre/post blocks:\n" - << "\t\tmean: " << mean_num_elements << "\tstd: " + << "\t\t\tnumber of pre/post blocks: " << num_pre_post_blocks << "\n" + << "\t\t\tnumber of synapses over all pre/post blocks:\n" + << "\t\t\t\tmean: " << mean_num_elements << "\tstd: " << std_num_elements << "\n" {% if not no_or_const_delay_mode %} - << "\tnumber of unique delays over all pre/post blocks:\n" - << "\t\tmean: " << mean_num_unique_elements << "\tstd: " + << "\t\t\tnumber of unique delays over all pre/post blocks:\n" + << "\t\t\t\tmean: " << mean_num_unique_elements << "\tstd: " << std_num_unique_elements << "\n" {% if bundle_mode %} - << "\tbundle size over all bundles:\n" - << "\t\tmean: " << mean_bundle_sizes << "\tstd: " + << "\t\t\tbundle size over all bundles:\n" + << "\t\t\t\tmean: " << mean_bundle_sizes << "\tstd: " << std_bundle_sizes << "\n" {% endif %}{# bundle_mode #} {% endif %}{# not no_or_const_delay_mode #} - << "\n\tmemory usage: TOTAL: " << total_memory_MB << " MB (~" + << "\n\t\t\tmemory usage: TOTAL: " << total_memory_MB << " MB (~" << total_memory_MB / syn_N * 1024.0 * 1024.0 << " byte per synapse)" << std::endl; @@ -832,10 +833,11 @@ __global__ void _before_run_kernel_{{codeobj_name}}( std::tie(name, bytes, num_elements) = tuple; double memory = bytes * to_MB; double fraction = memory / total_memory_MB * 100; - std::cout << "\t\t" << std::setprecision(1) << std::fixed << fraction + std::cout << "\t\t\t\t" << std::setprecision(1) << std::fixed << fraction << "%\t" << std::setprecision(3) << std::fixed << memory << " MB\t" << name << " [" << num_elements << "]" << std::endl; } +#endif // Create circular eventspaces in no_or_const_delay_mode @@ -895,31 +897,31 @@ __global__ void _before_run_kernel_{{codeobj_name}}( { // use the max num_threads before launch failure num_threads = funcAttrib.maxThreadsPerBlock; - printf("WARNING Not enough ressources available to call " - "_before_run_kernel_{{codeobj_name}}" - "with maximum possible threads per block (%u). " - "Reducing num_threads to %u. (Kernel needs %i " - "registers per block, %i bytes of " - "statically-allocated shared memory per block, %i " - "bytes of local memory per thread and a total of %i " - "bytes of user-allocated constant memory)\n", - max_threads_per_block, num_threads, funcAttrib.numRegs, - funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, - funcAttrib.constSizeBytes); + LOG_WARNING("Not enough ressources available to call " + "_before_run_kernel_{{codeobj_name}}" + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); } else { - printf("INFO _before_run_kernel_{{codeobj_name}}\n" - "\t%u blocks\n" - "\t%u threads\n" - "\t%i registers per thread\n" - "\t%i bytes statically-allocated shared memory per block\n" - "\t%i bytes local memory per thread\n" - "\t%i bytes user-allocated constant memory\n" - "", - num_blocks, num_threads, funcAttrib.numRegs, - funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, - funcAttrib.constSizeBytes); + LOG_DEBUG("_before_run_kernel_{{codeobj_name}}\n" + "\t\t\t%u blocks\n" + "\t\t\t%u threads\n" + "\t\t\t%i registers per thread\n" + "\t\t\t%i bytes statically-allocated shared memory per block\n" + "\t\t\t%i bytes local memory per thread\n" + "\t\t\t%i bytes user-allocated constant memory\n" + "", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); } _before_run_kernel_{{codeobj_name}}<<>>( @@ -970,15 +972,16 @@ __global__ void _before_run_kernel_{{codeobj_name}}( cudaError_t status = cudaGetLastError(); if (status != cudaSuccess) { - printf("ERROR initialising {{owner.name}} in %s:%d %s\n", - __FILE__, __LINE__, cudaGetErrorString(status)); + LOG_ERROR("ERROR initialising {{owner.name}} in %s:%d %s\n", + __FILE__, __LINE__, cudaGetErrorString(status)); _dealloc_arrays(); exit(status); } CUDA_CHECK_MEMORY(); +#ifdef DEF_LOG_DEBUG double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; - std::cout << "INFO: {{owner.name}} initialisation took " << time_passed << "s"; + std::cout << "CUDA DEBUG: {{owner.name}} initialisation took " << time_passed << "s"; if (used_device_memory_after_dealloc < used_device_memory_start){ size_t freed_bytes = used_device_memory_start - used_device_memory_after_dealloc; std::cout << ", freed " << freed_bytes * to_MB << "MB"; @@ -988,6 +991,7 @@ __global__ void _before_run_kernel_{{codeobj_name}}( std::cout << " and used " << used_bytes * to_MB << "MB of device memory."; } std::cout << std::endl; +#endif first_run = false; {% endblock before_run_host_maincode %} @@ -1084,7 +1088,7 @@ _run_kernel_{{codeobj_name}}( {% endblock host_maincode %} {% block kernel_info_num_blocks_str %} -"\tvariable number of blocks (depends on number of spiking neurons)\n" +"\t\t\tvariable number of blocks (depends on number of spiking neurons)\n" {% endblock %} {% block kernel_info_num_blocks_var %} {% endblock %} diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py index 77bd51e5..906d36b6 100644 --- a/brian2cuda/tests/features/cuda_configuration.py +++ b/brian2cuda/tests/features/cuda_configuration.py @@ -164,7 +164,7 @@ def after_run(self): with open(python_benchmark_path, "w") as file: # Timer for `brian.run()` call recorded in `TimedSpeedTest.timed_run()` file.write(f"run_brian {self.feature_test.runtime}\n") - # Timers for compilcation and binary execution recorded in + # Timers for compilation and binary execution recorded in # `device.compile_source()` and `device.run()` for key, value in device.timers.items(): if isinstance(value, float): diff --git a/brian2cuda/utils/gputools.py b/brian2cuda/utils/gputools.py index 44129338..3484d0ce 100644 --- a/brian2cuda/utils/gputools.py +++ b/brian2cuda/utils/gputools.py @@ -178,7 +178,7 @@ def _get_cuda_path(): # Use preference if set cuda_path_pref = prefs.devices.cuda_standalone.cuda_backend.cuda_path if cuda_path_pref is not None: - logger.info( + logger.debug( f"CUDA installation directory given via preference " f"`prefs.devices.cuda_standalone.cuda_backend.cuda_path={cuda_path_pref}`" ) @@ -189,7 +189,7 @@ def _get_cuda_path(): # Use environment variable if set cuda_path = os.environ.get("CUDA_PATH", "") # Nvidia default on Windows if os.path.exists(cuda_path): - logger.info( + logger.debug( "CUDA installation directory given via environment variable `CUDA_PATH={}`" "".format(cuda_path) ) @@ -199,7 +199,7 @@ def _get_cuda_path(): nvcc_path = shutil.which("nvcc") if nvcc_path is not None: cuda_path_nvcc = os.path.dirname(os.path.dirname(nvcc_path)) - logger.info( + logger.debug( "CUDA installation directory detected via location of `nvcc` binary: {}" "".format(cuda_path_nvcc) ) @@ -208,7 +208,7 @@ def _get_cuda_path(): # Use standard location /usr/local/cuda if os.path.exists("/usr/local/cuda"): cuda_path_usr = "/usr/local/cuda" - logger.info( + logger.debug( f"CUDA installation directory found in standard location: {cuda_path_usr}" ) return (cuda_path_usr, 'default') @@ -216,7 +216,7 @@ def _get_cuda_path(): # Use standard location /opt/cuda if os.path.exists("/opt/cuda"): cuda_path_opt = "/opt/cuda" - logger.info( + logger.debug( f"CUDA installation directory found in standard location: {cuda_path_opt}" ) return (cuda_path_opt, 'default') @@ -317,7 +317,7 @@ def _select_gpu(): compute_capability = get_compute_capability(gpu_id) gpu_list = get_available_gpus() else: - logger.info( + logger.debug( "Automatic detection of GPU names and compute capabilities disabled, using " "manual preferences" ) @@ -336,9 +336,11 @@ def _select_gpu(): if gpu_list is not None: gpu_name = f" ({gpu_list[gpu_id]})" - logger.info( - f"Compiling device code for GPU {gpu_id}{gpu_name}" - ) + msg = f"Compiling device code for GPU {gpu_id}{gpu_name}" + if prefs.devices.cuda_standalone.helpful: + logger.info(msg) + else: + logger.debug(msg) return gpu_id, compute_capability @@ -485,7 +487,7 @@ def _get_compute_capability_with_device_query(gpu_id): f"Brian2CUDA documentations for more details." ) else: - logger.info( + logger.debug( "Path to `deviceQuery` binary set via " "`prefs.devices.cuda_standalone.cuda_backend.device_query_path = " f"{device_query_path}`"