From 190733322b6b4883020c853094574974fa665f2c Mon Sep 17 00:00:00 2001 From: Denis Alevi Date: Fri, 28 Oct 2022 16:33:41 +0200 Subject: [PATCH 1/8] Remove `before/after_network_run` slot creation These are now created in Brian2 already --- brian2cuda/device.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/brian2cuda/device.py b/brian2cuda/device.py index 1c36d067..c5248c16 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -80,10 +80,6 @@ def __init__(self): self.include_dirs.remove('brianlib/randomkit') self.library_dirs.remove('brianlib/randomkit') - # Add code line slots used in our benchmarks - # TODO: Add to brian2 and remove here - self.code_lines.update({'before_network_run': [], - 'after_network_run': []}) ### Attributes specific to CUDAStandaloneDevice: # only true during first run call (relevant for synaptic pre/post ID deletion) self.first_run = True From 9009e9a4983a53b996efc03226dad52281a42027 Mon Sep 17 00:00:00 2001 From: Denis Alevi Date: Sun, 30 Oct 2022 22:12:22 +0100 Subject: [PATCH 2/8] Implement logging system in generated CUDA code --- brian2cuda/brianlib/cudaVector.h | 12 +- brian2cuda/brianlib/cuda_utils.h | 122 ++++++++++++++---- brian2cuda/brianlib/curand_buffer.h | 32 ++--- brian2cuda/brianlib/spikequeue.h | 12 +- brian2cuda/device.py | 3 + brian2cuda/templates/common_group.cu | 68 +++++----- brian2cuda/templates/main.cu | 12 -- brian2cuda/templates/objects.cu | 16 ++- brian2cuda/templates/rand.cu | 15 +-- brian2cuda/templates/spatialstateupdate.cu | 16 +-- brian2cuda/templates/spikemonitor.cu | 2 +- brian2cuda/templates/synapses.cu | 20 +-- brian2cuda/templates/synapses_create_array.cu | 5 +- .../templates/synapses_create_generator.cu | 21 +-- brian2cuda/templates/synapses_push_spikes.cu | 88 +++++++------ 15 files changed, 261 insertions(+), 183 deletions(-) diff --git a/brian2cuda/brianlib/cudaVector.h b/brian2cuda/brianlib/cudaVector.h index 1bf30fd8..959abe56 100644 --- a/brian2cuda/brianlib/cudaVector.h +++ b/brian2cuda/brianlib/cudaVector.h @@ -3,6 +3,7 @@ #include #include +#include "cuda_utils.h" /* * current memory allocation strategy: @@ -35,7 +36,8 @@ class cudaVector } else { - printf("ERROR while creating cudaVector with size %ld in cudaVector.h (constructor)\n", sizeof(scalar)*INITIAL_SIZE); + LOG_CUDA_ERROR("While creating cudaVector with size %ld in cudaVector.h" + "(constructor)\n", sizeof(scalar)*INITIAL_SIZE); assert(m_data != NULL); } } @@ -56,7 +58,8 @@ class cudaVector if (index < 0 || index >= m_size) { // TODO: check for proper exception throwing in cuda kernels - printf("ERROR returning a reference to index %d in cudaVector::at() (size = %u)\n", index, m_size); + LOG_CUDA_ERROR("Returning a reference to index %d in cudaVector::at()" + "(size = %u)\n", index, m_size); assert(index < m_size); } return m_data[index]; @@ -85,7 +88,7 @@ class cudaVector } else { - printf("ERROR invalid index %d, must be in range 0 - %d\n", pos, m_size); + LOG_CUDA_ERROR("Invalid index %d, must be in range 0 - %d\n", pos, m_size); assert(pos <= m_size); } }; @@ -134,7 +137,8 @@ class cudaVector } else { - printf("ERROR while allocating %ld bytes in cudaVector.h/reserve()\n", sizeof(scalar)*new_capacity); + LOG_CUDA_ERROR("While allocating %ld bytes in cudaVector.h/reserve()\n", + sizeof(scalar)*new_capacity); assert(new_data != NULL); } } diff --git a/brian2cuda/brianlib/cuda_utils.h b/brian2cuda/brianlib/cuda_utils.h index b8154d23..b4db094d 100644 --- a/brian2cuda/brianlib/cuda_utils.h +++ b/brian2cuda/brianlib/cuda_utils.h @@ -1,10 +1,5 
@@ #ifndef BRIAN2CUDA_ERROR_CHECK_H #define BRIAN2CUDA_ERROR_CHECK_H -#include -#include -#include "objects.h" -#include "curand.h" - // Define this to turn on error checking #define BRIAN2CUDA_ERROR_CHECK // Define this to synchronize device before checking errors @@ -15,6 +10,80 @@ // Define this to synchronize device before checking memory //#define BRIAN2CUDA_MEMORY_CHECK_BLOCKING +// Choose which LOG macros to define based on LOG_LEVEL_ macro defined +// during compilation, is one of DEBUG, INFO, WARNING, ERROR, CRITICAL +// For now we treat CRITICAL the same as ERROR + +// TODO Could make this with less code with if !defined? Though this is easier +// to understand. +#if defined LOG_LEVEL_CRITICAL || defined LOG_LEVEL_ERROR + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR +#elif defined LOG_LEVEL_WARNING + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR + #define DEF_LOG_WARNING +#elif defined LOG_LEVEL_INFO + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR + #define DEF_LOG_WARNING + #define DEF_LOG_INFO +#elif defined LOG_LEVEL_DEBUG + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR + #define DEF_LOG_WARNING + #define DEF_LOG_INFO + #define DEF_LOG_DEBUG +#elif defined LOG_LEVEL_DIAGNOSTIC + #define DEF_LOG_CUDA_ERROR + #define DEF_LOG_ERROR + #define DEF_LOG_WARNING + #define DEF_LOG_INFO + #define DEF_LOG_DEBUG + #define DEF_LOG_DIAGNOSTIC +#endif + +// DEFINE the LOG macros as printf statements or no_ops if not defined +// LOG_CUDA_ERROR is the only macro usable in device code currently and will +// be printed to stdout when CUDA ring buffer is flushed at host/device +// serialization (this sometimes does not happen when the program crashes). +// TODO: All other LOG macros could in principle be redirected to the Brian2 +// log file via fprintf (not implemented yet) +#ifdef DEF_LOG_CUDA_ERROR + #define LOG_CUDA_ERROR(fmt, ...) printf("GPU ERROR\t" fmt, __VA_ARGS__) +#else + #define LOG_CUDA_ERROR(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_ERROR + #define LOG_ERROR(fmt, ...) printf("CUDA ERROR\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_ERROR(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_WARNING + #define LOG_WARNING(fmt, ...) printf("CUDA WARNING\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_WARNING(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_INFO + #define LOG_INFO(fmt, ...) printf("CUDA INFO\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_INFO(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_DEBUG + #define LOG_DEBUG(fmt, ...) printf("CUDA DEBUG\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_DEBUG(fmt, ...) do {} while(0) +#endif + +#ifdef DEF_LOG_DIAGNOSTIC + #define LOG_DIAGNOSTIC(fmt, ...) printf("CUDA DIAGNOSTIC\t" fmt, __VA_ARGS__); fflush(stdout); +#else + #define LOG_DIAGNOSTIC(fmt, ...) do {} while(0) +#endif // partly adapted from https://gist.github.com/ashwin/2652488 #define CUDA_SAFE_CALL(err) _cudaSafeCall(err, __FILE__, __LINE__, #err) @@ -23,6 +92,11 @@ #define THRUST_CHECK_ERROR(code) { try {code;} \ catch(...) 
{_thrustCheckError(__FILE__, __LINE__, #code);} } +// Place includes after macro definitions to avoid circular includes +#include +#include +#include "objects.h" +#include "curand.h" // adapted from NVIDIA cuda samples, shipped with cuda 10.1 (common/inc/helper_cuda.h) #ifdef CURAND_H_ @@ -79,8 +153,8 @@ inline void _cudaSafeCall(cudaError err, const char *file, const int line, const #ifdef BRIAN2CUDA_ERROR_CHECK if (cudaSuccess != err) { - fprintf(stderr, "ERROR: %s failed at %s:%i : %s\n", - call, file, line, cudaGetErrorString(err)); + LOG_ERROR("%s failed at %s:%i : %s\n", + call, file, line, cudaGetErrorString(err)); exit(-1); } #endif @@ -94,8 +168,8 @@ inline void _cudaSafeCall(curandStatus_t err, const char *file, const int line, #ifdef BRIAN2CUDA_ERROR_CHECK if (CURAND_STATUS_SUCCESS != err) { - fprintf(stderr, "ERROR: %s failed at %s:%i : %s\n", - call, file, line, _curandGetErrorEnum(err)); + LOG_ERROR("%s failed at %s:%i : %s\n", + call, file, line, _curandGetErrorEnum(err)); exit(-1); } #endif @@ -111,8 +185,8 @@ inline void _cudaCheckError(const char *file, const int line, const char *msg) cudaError err = cudaDeviceSynchronize(); if(cudaSuccess != err) { - fprintf(stderr, "ERROR: CUDA_CHECK_ERROR() failed after %s at %s:%i : %s\n", - msg, file, line, cudaGetErrorString(err)); + LOG_ERROR("CUDA_CHECK_ERROR() failed after %s at %s:%i : %s\n", + msg, file, line, cudaGetErrorString(err)); exit(-1); } #else @@ -120,8 +194,8 @@ inline void _cudaCheckError(const char *file, const int line, const char *msg) cudaError err = cudaGetLastError(); if (cudaSuccess != err) { - fprintf(stderr, "ERROR: CUDA_CHECK_ERROR() failed at %s:%i : %s\n", - file, line, cudaGetErrorString(err)); + LOG_ERROR("CUDA_CHECK_ERROR() failed at %s:%i : %s\n", + file, line, cudaGetErrorString(err)); exit(-1); } @@ -153,16 +227,16 @@ inline void _cudaCheckMemory(const char *file, const int line) // newly requested allocation. 
if (diff > 0) { - fprintf(stdout, "INFO: cuda device memory usage in %s:%i\n" - "\t used: \t %f MB\n" - "\t avail: \t %f MB\n" - "\t total: \t %f MB\n" - "\t diff: \t %f MB \t (%zu bytes)\n", - file, line, - double(used) * to_MB, - double(avail) * to_MB, - double(total) * to_MB, - double(diff) * to_MB, diff); + LOG_DEBUG("CUDA device memory usage in %s:%i\n" + "\t\t\t used: \t %f MB\n" + "\t\t\t avail: \t %f MB\n" + "\t\t\t total: \t %f MB\n" + "\t\t\t diff: \t %f MB \t (%zu bytes)\n", + file, line, + double(used) * to_MB, + double(avail) * to_MB, + double(total) * to_MB, + double(diff) * to_MB, diff); brian::used_device_memory = used; } #endif @@ -172,7 +246,7 @@ inline void _cudaCheckMemory(const char *file, const int line) inline void _thrustCheckError(const char *file, const int line, const char *code) { - fprintf(stderr, "ERROR: THRUST_CHECK_ERROR() caught an exception from %s at %s:%i\n", + LOG_ERROR("THRUST_CHECK_ERROR() caught an exception from %s at %s:%i\n", code, file, line); throw; } diff --git a/brian2cuda/brianlib/curand_buffer.h b/brian2cuda/brianlib/curand_buffer.h index 54f1a379..0b7c5529 100644 --- a/brian2cuda/brianlib/curand_buffer.h +++ b/brian2cuda/brianlib/curand_buffer.h @@ -39,9 +39,9 @@ class CurandBuffer { if (current_idx != buffer_size && memory_allocated) { - printf("WARNING: CurandBuffer::generate_numbers() called before " - "buffer was empty (current_idx = %u, buffer_size = %u)", - current_idx, buffer_size); + LOG_WARNING("CurandBuffer::generate_numbers() called before " + "buffer was empty (current_idx = %u, buffer_size = %u)", + current_idx, buffer_size); } // TODO: should we allocate the memory in the constructor (even if we end up not using it)? if (!memory_allocated) @@ -50,16 +50,16 @@ class CurandBuffer host_data = new randomNumber_t[buffer_size]; if (!host_data) { - printf("ERROR allocating host_data for CurandBuffer (size %ld)\n", sizeof(randomNumber_t)*buffer_size); + LOG_ERROR("Allocating host_data for CurandBuffer (size %ld)\n", sizeof(randomNumber_t)*buffer_size); exit(EXIT_FAILURE); } // allocate device memory cudaError_t status = cudaMalloc((void **)&dev_data, buffer_size*sizeof(randomNumber_t)); if (status != cudaSuccess) { - printf("ERROR allocating memory on device (size = %ld) in %s(%d):\n\t%s\n", - buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, - cudaGetErrorString(status)); + LOG_ERROR("Allocating memory on device (size = %ld) in %s(%d):\n\t\t\t%s\n", + buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, + cudaGetErrorString(status)); exit(EXIT_FAILURE); } memory_allocated = true; @@ -70,7 +70,7 @@ class CurandBuffer curandStatus_t status = generateUniform(*generator, dev_data, buffer_size); if (status != CURAND_STATUS_SUCCESS) { - printf("ERROR generating random numbers in %s(%d):\n", __FILE__, __LINE__); + LOG_ERROR("Generating random numbers in %s(%d):\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } } @@ -79,8 +79,8 @@ class CurandBuffer curandStatus_t status = generateNormal(*generator, dev_data, buffer_size, 0, 1); if (status != CURAND_STATUS_SUCCESS) { - printf("ERROR generating normal distributed random numbers in %s(%d):\n", - __FILE__, __LINE__); + LOG_ERROR("Generating normal distributed random numbers in %s(%d):\n", + __FILE__, __LINE__); exit(EXIT_FAILURE); } } @@ -88,9 +88,9 @@ class CurandBuffer cudaError_t status = cudaMemcpy(host_data, dev_data, buffer_size*sizeof(randomNumber_t), cudaMemcpyDeviceToHost); if (status != cudaSuccess) { - printf("ERROR copying device to host memory (size = %ld) in 
%s(%d):\n\t%s\n", - buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, - cudaGetErrorString(status)); + LOG_ERROR("Copying device to host memory (size = %ld) in %s(%d):\n\t\t\t%s\n", + buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, + cudaGetErrorString(status)); exit(EXIT_FAILURE); } // reset buffer index @@ -99,14 +99,14 @@ class CurandBuffer curandStatus_t generateUniform(curandGenerator_t generator, randomNumber_t *outputPtr, size_t num) { - printf("ERROR curand can only generate random numbers as 'float' or 'double' types.\n"); + LOG_ERROR("%s", "Curand can only generate random numbers as 'float' or 'double' types.\n"); exit(EXIT_FAILURE); } curandStatus_t generateNormal(curandGenerator_t generator, randomNumber_t *outputPtr, size_t n, randomNumber_t mean, randomNumber_t stddev) { - printf("ERROR curand can only generate random numbers as 'float' or 'double' types.\n"); + LOG_ERROR("%s", "Curand can only generate random numbers as 'float' or 'double' types.\n"); exit(EXIT_FAILURE); } @@ -140,7 +140,7 @@ class CurandBuffer cudaError_t status = cudaFree(dev_data); if (status != cudaSuccess) { - printf("ERROR freeing device memory in %s(%d):%s\n", + LOG_ERROR("Freeing device memory in %s(%d):%s\n", __FILE__, __LINE__, cudaGetErrorString(status)); exit(EXIT_FAILURE); } diff --git a/brian2cuda/brianlib/spikequeue.h b/brian2cuda/brianlib/spikequeue.h index b26ee63a..ae4233e8 100644 --- a/brian2cuda/brianlib/spikequeue.h +++ b/brian2cuda/brianlib/spikequeue.h @@ -118,9 +118,9 @@ class CudaSpikeQueue synapses_queue = new cuda_vector*[required_num_queues]; if (!synapses_queue) { - printf("ERROR while allocating memory with size %ld in" - " spikequeue.h/prepare()\n", - sizeof(cuda_vector*) * required_num_queues); + LOG_CUDA_ERROR("While allocating memory with size %ld in" + " spikequeue.h/prepare()\n", + sizeof(cuda_vector*) * required_num_queues); } // only reset queue offset if we require new queues, in which // case we copy the old queues such that the offset is reset @@ -191,9 +191,9 @@ class CudaSpikeQueue synapses_queue[i] = new cuda_vector[num_blocks]; if (!synapses_queue[i]) { - printf("ERROR while allocating memory with size %ld in" - " spikequeue.h/prepare()\n", - sizeof(cuda_vector)*num_blocks); + LOG_CUDA_ERROR("While allocating memory with size %ld in" + " spikequeue.h/prepare()\n", + sizeof(cuda_vector)*num_blocks); } } } diff --git a/brian2cuda/device.py b/brian2cuda/device.py index c5248c16..6ee84b02 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -1208,6 +1208,9 @@ def generate_makefile(self, writer, cpp_compiler, cpp_compiler_flags, cpp_linker # NDEBUG precompiler macro disables asserts (both for C++ and CUDA) nvcc_compiler_flags += ['-NDEBUG'] + # Set brian2cuda standalone log leven based ot Brian2 log level + nvcc_compiler_flags += [f'-DLOG_LEVEL_{prefs["logging.console_log_level"].upper()}'] + makefile_tmp = self.code_object_class().templater.makefile( None, None, source_files=' '.join(sorted(writer.source_files)), diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu index 447db27b..0bb0e368 100644 --- a/brian2cuda/templates/common_group.cu +++ b/brian2cuda/templates/common_group.cu @@ -79,19 +79,19 @@ namespace { // functions works. Hacky, hacky ... 
randomNumber_t _host_rand(const int _vectorisation_idx) { - printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + LOG_ERROR("Called dummy function `_host_rand` in %s:%d\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } randomNumber_t _host_randn(const int _vectorisation_idx) { - printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + LOG_ERROR("Called dummy function `_host_rand` in %s:%d\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } int32_t _host_poisson(double _lambda, const int _vectorisation_idx) { - printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + LOG_ERROR("Called dummy function `_host_poisson` in %s:%d\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } @@ -246,17 +246,17 @@ void _run_{{codeobj_name}}() { // use the max num_threads before launch failure num_threads = funcAttrib.maxThreadsPerBlock; - printf("WARNING Not enough ressources available to call " - "_run_kernel_{{codeobj_name}} " - "with maximum possible threads per block (%u). " - "Reducing num_threads to %u. (Kernel needs %i " - "registers per block, %i bytes of " - "statically-allocated shared memory per block, %i " - "bytes of local memory per thread and a total of %i " - "bytes of user-allocated constant memory)\n", - max_threads_per_block, num_threads, funcAttrib.numRegs, - funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, - funcAttrib.constSizeBytes); + LOG_WARNING("Not enough ressources available to call " + "_run_kernel_{{codeobj_name}} " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); {% block update_occupancy %} // calculate theoretical occupancy for new num_threads @@ -276,26 +276,26 @@ void _run_{{codeobj_name}}() {% block kernel_info %} else { - printf("INFO _run_kernel_{{codeobj_name}}\n" - {% block kernel_info_num_blocks_str %} - "\t%u blocks\n" - {% endblock %} - "\t%u threads\n" - "\t%i registers per thread\n" - "\t%i bytes statically-allocated shared memory per block\n" - "\t%i bytes local memory per thread\n" - "\t%i bytes user-allocated constant memory\n" - {% if calc_occupancy %} - "\t%.3f theoretical occupancy\n", - {% else %} - "", - {% endif %} - {% block kernel_info_num_blocks_var %} - num_blocks, - {% endblock %} - num_threads, funcAttrib.numRegs, - funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, - funcAttrib.constSizeBytes{% if calc_occupancy %}, occupancy{% endif %}); + LOG_DEBUG("_run_kernel_{{codeobj_name}}\n" + {% block kernel_info_num_blocks_str %} + "\t\t\t%u blocks\n" + {% endblock %} + "\t\t\t%u threads\n" + "\t\t\t%i registers per thread\n" + "\t\t\t%i bytes statically-allocated shared memory per block\n" + "\t\t\t%i bytes local memory per thread\n" + "\t\t\t%i bytes user-allocated constant memory\n" + {% if calc_occupancy %} + "\t\t\t%.3f theoretical occupancy\n", + {% else %} + "", + {% endif %} + {% block kernel_info_num_blocks_var %} + num_blocks, + {% endblock %} + num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes{% if calc_occupancy %}, occupancy{% endif %}); } {% endblock %} first_run = false; diff --git a/brian2cuda/templates/main.cu 
b/brian2cuda/templates/main.cu index eb617c5b..5ae31f4e 100644 --- a/brian2cuda/templates/main.cu +++ b/brian2cuda/templates/main.cu @@ -33,8 +33,6 @@ int main(int argc, char **argv) // variable (see device.py CUDAStandaloneDevice.generate_main_source()) unsigned long long seed; - //const std::clock_t _start_time = std::clock(); - CUDA_SAFE_CALL( cudaSetDevice({{gpu_id}}) ); @@ -51,9 +49,6 @@ int main(int argc, char **argv) cudaDeviceSynchronize() ); - //const double _run_time2 = (double)(std::clock() -_start_time)/CLOCKS_PER_SEC; - //printf("INFO: setting cudaDevice stuff took %f seconds\n", _run_time2); - brian_start(); {{'\n'.join(code_lines['after_start'])|autoindent}} @@ -65,16 +60,9 @@ int main(int argc, char **argv) {{main_lines|autoindent}} } - //const double _run_time3 = (double)(std::clock() -_start_time3)/CLOCKS_PER_SEC; - //printf("INFO: main_lines took %f seconds\n", _run_time3); - {{'\n'.join(code_lines['before_end'])|autoindent}} brian_end(); {{'\n'.join(code_lines['after_end'])|autoindent}} - // Profiling - //const double _run_time = (double)(std::clock() -_start_time)/CLOCKS_PER_SEC; - //printf("INFO: main function took %f seconds\n", _run_time); - return 0; } diff --git a/brian2cuda/templates/objects.cu b/brian2cuda/templates/objects.cu index 1eed57df..d6f1e880 100644 --- a/brian2cuda/templates/objects.cu +++ b/brian2cuda/templates/objects.cu @@ -326,13 +326,15 @@ void _init_arrays() {% endfor %} CUDA_CHECK_MEMORY(); +#ifdef DEF_LOG_DEBUG const double to_MB = 1.0 / (1024.0 * 1024.0); double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; - std::cout << "INFO: _init_arrays() took " << time_passed << "s"; + std::cout << "CUDA DEBUG: _init_arrays() took " << time_passed << "s"; if (tot_memory_MB > 0) std::cout << " and used " << tot_memory_MB << "MB of device memory."; std::cout << std::endl; +#endif } void _load_arrays() @@ -351,7 +353,7 @@ void _load_arrays() {% endif %} } else { - std::cout << "Error opening static array {{name}}." << endl; + LOG_ERROR("%s", "Error opening static array {{name}}.\n"); } {% if not (name in dynamic_array_specs.values()) %} CUDA_SAFE_CALL( @@ -389,7 +391,7 @@ void _write_arrays() outfile_{{varname}}.close(); } else { - std::cout << "Error writing output file for {{varname}}." << endl; + LOG_ERROR("%s", "Error writing output file for {{varname}}.\n"); } {% endif %} {% endfor %} @@ -406,7 +408,7 @@ void _write_arrays() outfile_{{varname}}.close(); } else { - std::cout << "Error writing output file for {{varname}}." << endl; + LOG_ERROR("%s", "Error writing output file for {{varname}}.\n"); } {% endfor %} @@ -443,7 +445,7 @@ void _write_arrays() outfile_{{varname}}.close(); } else { - std::cout << "Error writing output file for {{varname}}." << endl; + LOG_ERROR("%s", "Error writing output file for {{varname}}.\n"); } {% endfor %} @@ -471,7 +473,7 @@ void _write_arrays() outfile_profiling_info.close(); } else { - std::cout << "Error writing profiling info to file." << std::endl; + LOG_ERROR("%s", "Error writing profiling info to file.\n"); } {% endif %} // Write last run info to disk @@ -483,7 +485,7 @@ void _write_arrays() outfile_last_run_info.close(); } else { - std::cout << "Error writing last run info to file." 
<< std::endl; + LOG_ERROR("%s", "Error writing last run info to file.\n"); } } diff --git a/brian2cuda/templates/rand.cu b/brian2cuda/templates/rand.cu index 08208d5a..793f7d16 100644 --- a/brian2cuda/templates/rand.cu +++ b/brian2cuda/templates/rand.cu @@ -116,16 +116,16 @@ void RandomNumberBuffer::init() num_per_gen_{{type}}_{{co.name}} = num_per_gen_{{type}}_{{co.name}} + 1; } - // make sure that we don't use more memory then available + // make sure that we don't use more memory than available // this checks per codeobject the number of generated floats against total available floats while (num_free_floats < num_per_gen_{{type}}_{{co.name}}) { - printf("INFO not enough memory available to generate %i random numbers for {{co.name}}, reducing the buffer size\n", num_free_floats); + LOG_DEBUG("Not enough memory available to generate %i random numbers for {{co.name}}, reducing the buffer size\n", num_free_floats); if (num_per_gen_{{type}}_{{co.name}} < num_per_cycle_{{type}}_{{co.name}}) { if (num_free_floats < num_per_cycle_{{type}}_{{co.name}}) { - printf("ERROR not enough memory to generate random numbers for {{co.name}} %s:%d\n", __FILE__, __LINE__); + LOG_ERROR("Not enough memory to generate random numbers for {{co.name}} %s:%d\n", __FILE__, __LINE__); _dealloc_arrays(); exit(1); } @@ -137,7 +137,7 @@ void RandomNumberBuffer::init() } num_per_gen_{{type}}_{{co.name}} /= 2; } - printf("INFO generating %i {{type}} every %i clock cycles for {{co.name}}\n", num_per_gen_{{type}}_{{co.name}}, {{type}}_interval_{{co.name}}); + LOG_DEBUG("Generating %i {{type}} every %i clock cycles for {{co.name}}\n", num_per_gen_{{type}}_{{co.name}}, {{type}}_interval_{{co.name}}); {% if type in ['rand', 'randn'] %} {% set dtype = "randomNumber_t" %} @@ -162,9 +162,9 @@ void RandomNumberBuffer::init() { // TODO: find a way to deal with this? E.g. looping over buffers sorted // by buffer size and reducing them until it fits. - printf("MEMORY ERROR: Trying to generate more random numbers than fit " - "into available memory. Please report this as an issue on " - "GitHub: https://github.com/brian-team/brian2cuda/issues/new"); + LOG_ERROR("%s", "MEMORY ERROR: Trying to generate more random numbers than fit " + "into available memory. 
Please report this as an issue on " + "GitHub: https://github.com/brian-team/brian2cuda/issues/new"); _dealloc_arrays(); exit(1); } @@ -384,7 +384,6 @@ void RandomNumberBuffer::refill_poisson_numbers( { // generate poisson distributed random numbers and reset buffer index - printf("num_per_gen_poisson %d, lambda %f\n", num_per_gen_poisson, lambda); CUDA_SAFE_CALL( curandGeneratePoisson(curand_generator, dev_poisson_allocator, num_per_gen_poisson, lambda) ); diff --git a/brian2cuda/templates/spatialstateupdate.cu b/brian2cuda/templates/spatialstateupdate.cu index e10c5d82..4eb99ccb 100644 --- a/brian2cuda/templates/spatialstateupdate.cu +++ b/brian2cuda/templates/spatialstateupdate.cu @@ -510,14 +510,14 @@ __global__ void _currents_kernel_{{codeobj_name}}( float occupancy_currents = (max_active_blocks_currents * num_threads_currents / num_threads_per_warp) / (float)(max_threads_per_sm / num_threads_per_warp); - printf("INFO _currents\n_kernel_{{codeobj_name}}" - "\t%u blocks\n" - "\t%u threads\n" - "\t%i registers per thread\n" - "\t%i bytes statically-allocated shared memory per block\n" - "\t%i bytes local memory per thread\n" - "\t%i bytes user-allocated constant memory\n" - "\t%.3f theoretical occupancy\n", + LOG_DEBUG("_currents\n_kernel_{{codeobj_name}}" + "\t\t\t%u blocks\n" + "\t\t\t%u threads\n" + "\t\t\t%i registers per thread\n" + "\t\t\t%i bytes statically-allocated shared memory per block\n" + "\t\t\t%i bytes local memory per thread\n" + "\t\t\t%i bytes user-allocated constant memory\n" + "\t\t\t%.3f theoretical occupancy\n", num_blocks_currents, num_threads_currents, funcAttrib_currents.numRegs, funcAttrib_currents.sharedSizeBytes, funcAttrib_currents.localSizeBytes, funcAttrib_currents.constSizeBytes, occupancy_currents); diff --git a/brian2cuda/templates/spikemonitor.cu b/brian2cuda/templates/spikemonitor.cu index a7bc390d..55ca990c 100644 --- a/brian2cuda/templates/spikemonitor.cu +++ b/brian2cuda/templates/spikemonitor.cu @@ -204,7 +204,7 @@ void _debugmsg_{{codeobj_name}}() // HOST_CONSTANTS %HOST_CONSTANTS% - printf("Number of spikes: %d\n", _array_{{owner.name}}_N[0]); + LOG_DEBUG("Number of spikes: %d\n", _array_{{owner.name}}_N[0]); } {% endblock %} diff --git a/brian2cuda/templates/synapses.cu b/brian2cuda/templates/synapses.cu index 3c1aee84..eea4008f 100644 --- a/brian2cuda/templates/synapses.cu +++ b/brian2cuda/templates/synapses.cu @@ -211,7 +211,7 @@ num_threads = max_threads_per_block; {% if bundle_mode %} //num_threads_per_bundle = {{pathway.name}}_bundle_size_max; num_threads_per_bundle = getThreadsPerBundle(); -printf("INFO _run_kernel_{{codeobj_name}}: Using %d threads per bundle\n", num_threads_per_bundle); +LOG_DEBUG("_run_kernel_{{codeobj_name}}: Using %d threads per bundle\n", num_threads_per_bundle); {% endif %} num_loops = 1; @@ -231,12 +231,12 @@ if (!{{owner.name}}_multiple_pre_post){ {% endif %} } else { - printf("WARNING: Detected multiple synapses for same (pre, post) neuron " - "pair in Synapses object ``{{owner.name}}`` and no atomic operations are used. " - "Falling back to serialised effect application for SynapticPathway " - "``{{pathway.name}}``. This will be slow. You can avoid serialisation " - "by separating this Synapses object into multiple Synapses objects " - "with at most one connection between the same (pre, post) neuron pair.\n"); + LOG_WARNING("%s", "Detected multiple synapses for same (pre, post) neuron " + "pair in Synapses object ``{{owner.name}}`` and no atomic operations are used. 
" + "Falling back to serialised effect application for SynapticPathway " + "``{{pathway.name}}``. This will be slow. You can avoid serialisation " + "by separating this Synapses object into multiple Synapses objects " + "with at most one connection between the same (pre, post) neuron pair.\n"); } if (num_threads > max_threads_per_block) num_threads = max_threads_per_block; @@ -255,7 +255,7 @@ num_threads_per_bundle = 1; num_loops = num_parallel_blocks; {% else %} -printf("ERROR: got unknown 'synaptic_effects' mode ({{synaptic_effects}})\n"); +LOG_ERROR("%s", "Got unknown 'synaptic_effects' mode ({{synaptic_effects}})\n"); _dealloc_arrays(); exit(1); {% endif %} @@ -265,7 +265,7 @@ exit(1); {% block extra_info_msg %} else if ({{pathway.name}}_max_size <= 0) { - printf("INFO there are no synapses in the {{pathway.name}} pathway. Skipping synapses_push and synapses kernels.\n"); + LOG_DEBUG("%s", "There are no synapses in the {{pathway.name}} pathway. Skipping synapses_push and synapses kernels.\n"); } {% endblock %} @@ -325,7 +325,7 @@ if ({{pathway.name}}_max_size > 0) void _debugmsg_{{codeobj_name}}() { using namespace brian; - std::cout << "Number of synapses: " << {{constant_or_scalar('N', variables['N'])}} << endl; + LOG_DEBUG("Number of synapses: %d\n", {{constant_or_scalar('N', variables['N'])}}); } {% endblock %} diff --git a/brian2cuda/templates/synapses_create_array.cu b/brian2cuda/templates/synapses_create_array.cu index 712054e2..a5262a49 100644 --- a/brian2cuda/templates/synapses_create_array.cu +++ b/brian2cuda/templates/synapses_create_array.cu @@ -35,13 +35,15 @@ size_t used_device_memory_start = used_device_memory; {% block profiling_stop %} CUDA_CHECK_MEMORY(); +#ifdef DEF_LOG_DEBUG const double to_MB = 1.0 / (1024.0 * 1024.0); double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; -std::cout << "INFO: {{owner.name}} creation took " << time_passed << "s"; +std::cout << "CUDA DEBUG: {{owner.name}} creation took " << time_passed << "s"; if (tot_memory_MB > 0) std::cout << " and used " << tot_memory_MB << "MB of device memory."; std::cout << std::endl; +#endif {% endblock %} {% block host_maincode %} @@ -112,7 +114,6 @@ for (int _i=0; _i 1) { {{owner.name}}_multiple_pre_post = true; diff --git a/brian2cuda/templates/synapses_create_generator.cu b/brian2cuda/templates/synapses_create_generator.cu index 7056d635..32ecfae6 100644 --- a/brian2cuda/templates/synapses_create_generator.cu +++ b/brian2cuda/templates/synapses_create_generator.cu @@ -152,13 +152,15 @@ size_t used_device_memory_start = used_device_memory; {% block profiling_stop %} CUDA_CHECK_MEMORY(); +#ifdef DEF_LOG_DEBUG const double to_MB = 1.0 / (1024.0 * 1024.0); double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; -std::cout << "INFO: {{owner.name}} creation took " << time_passed << "s"; +std::cout << "CUDA DEBUG: {{owner.name}} creation took " << time_passed << "s"; if (tot_memory_MB > 0) std::cout << " and used " << tot_memory_MB << "MB of memory."; std::cout << std::endl; +#endif {% endblock %} {% block host_maincode %} @@ -260,8 +262,8 @@ std::cout << std::endl; {% if skip_if_invalid %} _uiter_size = _n_total; {% else %} - cout << "Error: Requested sample size " << _uiter_size << " is bigger than the " << - "population size " << _n_total << "." 
<< endl; + LOG_ERROR("Error: Requested sample size %ld is bigger than the population " + "size %d.\n", _uiter_size, _n_total); exit(1); {% endif %} } else if (_uiter_size < 0) @@ -269,7 +271,7 @@ std::cout << std::endl; {% if skip_if_invalid %} continue; {% else %} - cout << "Error: Requested sample size " << _uiter_size << " is negative." << endl; + LOG_ERROR("Error: Requested sample size %ld is negative.\n", _uiter_size); exit(1); {% endif %} } else if (_uiter_size == 0) @@ -352,8 +354,9 @@ std::cout << std::endl; {% if skip_if_invalid %} continue; {% else %} - cout << "Error: tried to create synapse to neuron {{result_index}}=" << _{{result_index}} << " outside range 0 to " << - _{{result_index_size}}-1 << endl; + LOG_ERROR("Error: tried to create synapse to neuron " + "{{result_index}}=%ld outside range 0 to %d\n", + _{{result_index}}, _{{result_index_size}}-1); exit(1); {% endif %} } @@ -376,8 +379,9 @@ std::cout << std::endl; {% if skip_if_invalid %} continue; {% else %} - cout << "Error: tried to create synapse to neuron {{result_index}}=" << _{{result_index}} << - " outside range 0 to " << _{{result_index_size}}-1 << endl; + LOG_ERROR("Error: tried to create synapse to neuron " + "{{result_index}}=%ld outside range 0 to %d\n", + _{{result_index}}, _{{result_index_size}}-1); exit(1); {% endif %} } @@ -438,7 +442,6 @@ std::cout << std::endl; {{dynamic_multisynaptic_idx}}[_i] = source_target_count[source_target]; {% endif %} source_target_count[source_target]++; - //printf("source target count = %i\n", source_target_count[source_target]); if (source_target_count[source_target] > 1) { {{owner.name}}_multiple_pre_post = true; diff --git a/brian2cuda/templates/synapses_push_spikes.cu b/brian2cuda/templates/synapses_push_spikes.cu index 08dee42e..c15c480e 100644 --- a/brian2cuda/templates/synapses_push_spikes.cu +++ b/brian2cuda/templates/synapses_push_spikes.cu @@ -188,8 +188,8 @@ __global__ void _before_run_kernel_{{codeobj_name}}( return; } else if (syn_N_check > INT_MAX){ - printf("ERROR: There are more Synapses (%lu) than an int can " - "hold on this system (%u).\n", syn_N_check, INT_MAX); + LOG_ERROR("There are more Synapses (%lu) than an int can " + "hold on this system (%u).\n", syn_N_check, INT_MAX); } // total number of synapses int syn_N = (int)syn_N_check; @@ -573,8 +573,8 @@ __global__ void _before_run_kernel_{{codeobj_name}}( sum_num_elements += num_elements; updateMeanStd(count_num_elements, mean_num_elements, M2_num_elements, num_elements); } // end for loop through connectivity matrix - printf("INFO connectivity matrix has size %i, number of (pre neuron ID, post neuron block) pairs is %u\n", - size_connectivity_matrix, num_pre_post_blocks); + LOG_DEBUG("Connectivity matrix has size %i, number of (pre neuron ID, post neuron block) pairs is %u\n", + size_connectivity_matrix, num_pre_post_blocks); {# If we have don't have heterogeneous delays, we just need to copy the synapse IDs and number of synapses per (preID, postBlock) to the device #} @@ -799,29 +799,30 @@ __global__ void _before_run_kernel_{{codeobj_name}}( double std_num_unique_elements = getStd(count_num_unique_elements, M2_num_unique_elements); {% endif %}{# not no_or_const_delay_mode #} +#ifdef DEF_LOG_DEBUG // print memory information std::cout.precision(1); std::cout.setf(std::ios::fixed, std::ios::floatfield); - std::cout << "INFO: synapse statistics and memory usage for {{owner.name}}:\n" - << "\tnumber of synapses: " << syn_N << "\n" + std::cout << "CUDA DEBUG\tSynapse statistics and memory usage for 
{{owner.name}}:\n" + << "\t\t\tnumber of synapses: " << syn_N << "\n" {% if not no_or_const_delay_mode and bundle_mode %} - << "\tnumber of bundles: " << num_bundle_ids << "\n" + << "\t\t\tnumber of bundles: " << num_bundle_ids << "\n" {% endif %} - << "\tnumber of pre/post blocks: " << num_pre_post_blocks << "\n" - << "\tnumber of synapses over all pre/post blocks:\n" - << "\t\tmean: " << mean_num_elements << "\tstd: " + << "\t\t\tnumber of pre/post blocks: " << num_pre_post_blocks << "\n" + << "\t\t\tnumber of synapses over all pre/post blocks:\n" + << "\t\t\t\tmean: " << mean_num_elements << "\tstd: " << std_num_elements << "\n" {% if not no_or_const_delay_mode %} - << "\tnumber of unique delays over all pre/post blocks:\n" - << "\t\tmean: " << mean_num_unique_elements << "\tstd: " + << "\t\t\tnumber of unique delays over all pre/post blocks:\n" + << "\t\t\t\tmean: " << mean_num_unique_elements << "\tstd: " << std_num_unique_elements << "\n" {% if bundle_mode %} - << "\tbundle size over all bundles:\n" - << "\t\tmean: " << mean_bundle_sizes << "\tstd: " + << "\t\t\tbundle size over all bundles:\n" + << "\t\t\t\tmean: " << mean_bundle_sizes << "\tstd: " << std_bundle_sizes << "\n" {% endif %}{# bundle_mode #} {% endif %}{# not no_or_const_delay_mode #} - << "\n\tmemory usage: TOTAL: " << total_memory_MB << " MB (~" + << "\n\t\t\tmemory usage: TOTAL: " << total_memory_MB << " MB (~" << total_memory_MB / syn_N * 1024.0 * 1024.0 << " byte per synapse)" << std::endl; @@ -832,10 +833,11 @@ __global__ void _before_run_kernel_{{codeobj_name}}( std::tie(name, bytes, num_elements) = tuple; double memory = bytes * to_MB; double fraction = memory / total_memory_MB * 100; - std::cout << "\t\t" << std::setprecision(1) << std::fixed << fraction + std::cout << "\t\t\t\t" << std::setprecision(1) << std::fixed << fraction << "%\t" << std::setprecision(3) << std::fixed << memory << " MB\t" << name << " [" << num_elements << "]" << std::endl; } +#endif // Create circular eventspaces in no_or_const_delay_mode @@ -895,31 +897,31 @@ __global__ void _before_run_kernel_{{codeobj_name}}( { // use the max num_threads before launch failure num_threads = funcAttrib.maxThreadsPerBlock; - printf("WARNING Not enough ressources available to call " - "_before_run_kernel_{{codeobj_name}}" - "with maximum possible threads per block (%u). " - "Reducing num_threads to %u. (Kernel needs %i " - "registers per block, %i bytes of " - "statically-allocated shared memory per block, %i " - "bytes of local memory per thread and a total of %i " - "bytes of user-allocated constant memory)\n", - max_threads_per_block, num_threads, funcAttrib.numRegs, - funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, - funcAttrib.constSizeBytes); + LOG_WARNING("Not enough ressources available to call " + "_before_run_kernel_{{codeobj_name}}" + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); } else { - printf("INFO _before_run_kernel_{{codeobj_name}}\n" - "\t%u blocks\n" - "\t%u threads\n" - "\t%i registers per thread\n" - "\t%i bytes statically-allocated shared memory per block\n" - "\t%i bytes local memory per thread\n" - "\t%i bytes user-allocated constant memory\n" - "", - num_blocks, num_threads, funcAttrib.numRegs, - funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, - funcAttrib.constSizeBytes); + LOG_DEBUG("_before_run_kernel_{{codeobj_name}}\n" + "\t\t\t%u blocks\n" + "\t\t\t%u threads\n" + "\t\t\t%i registers per thread\n" + "\t\t\t%i bytes statically-allocated shared memory per block\n" + "\t\t\t%i bytes local memory per thread\n" + "\t\t\t%i bytes user-allocated constant memory\n" + "", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); } _before_run_kernel_{{codeobj_name}}<<>>( @@ -970,15 +972,16 @@ __global__ void _before_run_kernel_{{codeobj_name}}( cudaError_t status = cudaGetLastError(); if (status != cudaSuccess) { - printf("ERROR initialising {{owner.name}} in %s:%d %s\n", - __FILE__, __LINE__, cudaGetErrorString(status)); + LOG_ERROR("ERROR initialising {{owner.name}} in %s:%d %s\n", + __FILE__, __LINE__, cudaGetErrorString(status)); _dealloc_arrays(); exit(status); } CUDA_CHECK_MEMORY(); +#ifdef DEF_LOG_DEBUG double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; - std::cout << "INFO: {{owner.name}} initialisation took " << time_passed << "s"; + std::cout << "CUDA DEBUG: {{owner.name}} initialisation took " << time_passed << "s"; if (used_device_memory_after_dealloc < used_device_memory_start){ size_t freed_bytes = used_device_memory_start - used_device_memory_after_dealloc; std::cout << ", freed " << freed_bytes * to_MB << "MB"; @@ -988,6 +991,7 @@ __global__ void _before_run_kernel_{{codeobj_name}}( std::cout << " and used " << used_bytes * to_MB << "MB of device memory."; } std::cout << std::endl; +#endif first_run = false; {% endblock before_run_host_maincode %} @@ -1084,7 +1088,7 @@ _run_kernel_{{codeobj_name}}( {% endblock host_maincode %} {% block kernel_info_num_blocks_str %} -"\tvariable number of blocks (depends on number of spiking neurons)\n" +"\t\t\tvariable number of blocks (depends on number of spiking neurons)\n" {% endblock %} {% block kernel_info_num_blocks_var %} {% endblock %} From 888aaf178219fb60cfee32d7b47eed9d8a32087f Mon Sep 17 00:00:00 2001 From: Denis Alevi Date: Sun, 30 Oct 2022 22:23:37 +0100 Subject: [PATCH 3/8] Turn Brian2CUDA info logs into debug logs --- brian2cuda/device.py | 8 ++++---- brian2cuda/utils/gputools.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/brian2cuda/device.py b/brian2cuda/device.py index 6ee84b02..8f4a2f42 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -1174,14 +1174,14 @@ def generate_makefile(self, writer, cpp_compiler, cpp_compiler_flags, cpp_linker # Log compiled GPU architecture if self.compute_capability is None: - logger.info( + logger.debug( f"Compiling device code with manually set architecture flags " f"({gpu_arch_flags}). 
Be aware that the minimal supported compute " f"capability is {self.minimal_compute_capability} " "(we are not checking your compile flags)" ) else: - logger.info( + logger.debug( f"Compiling device code for compute capability " f"{self.compute_capability} (compiler flags: {gpu_arch_flags})" ) @@ -1469,10 +1469,10 @@ def build(self, directory='output', for net in self.networks: net.after_run() - logger.info("Using the following preferences for CUDA standalone:") + logger.debug("Using the following preferences for CUDA standalone:") for pref_name in prefs: if "devices.cuda_standalone" in pref_name: - logger.info(f"\t{pref_name} = {prefs[pref_name]}") + logger.debug(f"\t{pref_name} = {prefs[pref_name]}") logger.debug("Using the following brian preferences:") for pref_name in prefs: diff --git a/brian2cuda/utils/gputools.py b/brian2cuda/utils/gputools.py index 44129338..65f27139 100644 --- a/brian2cuda/utils/gputools.py +++ b/brian2cuda/utils/gputools.py @@ -178,7 +178,7 @@ def _get_cuda_path(): # Use preference if set cuda_path_pref = prefs.devices.cuda_standalone.cuda_backend.cuda_path if cuda_path_pref is not None: - logger.info( + logger.debug( f"CUDA installation directory given via preference " f"`prefs.devices.cuda_standalone.cuda_backend.cuda_path={cuda_path_pref}`" ) @@ -189,7 +189,7 @@ def _get_cuda_path(): # Use environment variable if set cuda_path = os.environ.get("CUDA_PATH", "") # Nvidia default on Windows if os.path.exists(cuda_path): - logger.info( + logger.debug( "CUDA installation directory given via environment variable `CUDA_PATH={}`" "".format(cuda_path) ) @@ -199,7 +199,7 @@ def _get_cuda_path(): nvcc_path = shutil.which("nvcc") if nvcc_path is not None: cuda_path_nvcc = os.path.dirname(os.path.dirname(nvcc_path)) - logger.info( + logger.debug( "CUDA installation directory detected via location of `nvcc` binary: {}" "".format(cuda_path_nvcc) ) @@ -208,7 +208,7 @@ def _get_cuda_path(): # Use standard location /usr/local/cuda if os.path.exists("/usr/local/cuda"): cuda_path_usr = "/usr/local/cuda" - logger.info( + logger.debug( f"CUDA installation directory found in standard location: {cuda_path_usr}" ) return (cuda_path_usr, 'default') @@ -216,7 +216,7 @@ def _get_cuda_path(): # Use standard location /opt/cuda if os.path.exists("/opt/cuda"): cuda_path_opt = "/opt/cuda" - logger.info( + logger.debug( f"CUDA installation directory found in standard location: {cuda_path_opt}" ) return (cuda_path_opt, 'default') @@ -317,7 +317,7 @@ def _select_gpu(): compute_capability = get_compute_capability(gpu_id) gpu_list = get_available_gpus() else: - logger.info( + logger.debug( "Automatic detection of GPU names and compute capabilities disabled, using " "manual preferences" ) @@ -336,7 +336,7 @@ def _select_gpu(): if gpu_list is not None: gpu_name = f" ({gpu_list[gpu_id]})" - logger.info( + logger.debug( f"Compiling device code for GPU {gpu_id}{gpu_name}" ) @@ -485,7 +485,7 @@ def _get_compute_capability_with_device_query(gpu_id): f"Brian2CUDA documentations for more details." 
) else: - logger.info( + logger.debug( "Path to `deviceQuery` binary set via " "`prefs.devices.cuda_standalone.cuda_backend.device_query_path = " f"{device_query_path}`" From e731ce38558469f4b71188461fc50e8415860a38 Mon Sep 17 00:00:00 2001 From: Denis Alevi Date: Sun, 30 Oct 2022 22:30:34 +0100 Subject: [PATCH 4/8] Add INFO logs for compilation and execution --- brian2cuda/device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/brian2cuda/device.py b/brian2cuda/device.py index 8f4a2f42..e34cf9cd 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -1480,8 +1480,10 @@ def build(self, directory='output', logger.debug(f"\t{pref_name} = {prefs[pref_name]}") if compile: + logger.info("Compiling CUDA standalone project...") self.compile_source(directory, cpp_compiler, debug, clean) if run: + logger.info("Running CUDA standalone simulation...") self.run(directory, with_output, run_args) def network_run(self, net, duration, report=None, report_period=10*second, From 6b96167d0b145a3cfa0dbd4bf5719c8708115398 Mon Sep 17 00:00:00 2001 From: Denis Alevi Date: Sun, 30 Oct 2022 22:53:40 +0100 Subject: [PATCH 5/8] Implement `ComputationTimeSummary` For now, this can be enabled via `set_device(..., report_timers=True`, but should eventually be moved into brian2 --- brian2cuda/device.py | 153 +++++++++++++++++- .../tests/features/cuda_configuration.py | 2 +- 2 files changed, 152 insertions(+), 3 deletions(-) diff --git a/brian2cuda/device.py b/brian2cuda/device.py index e34cf9cd..26353831 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -18,13 +18,13 @@ from brian2.core.preferences import prefs, PreferenceError from brian2.core.variables import ArrayVariable, DynamicArrayVariable, Constant from brian2.parsing.rendering import CPPNodeRenderer -from brian2.devices.device import all_devices +from brian2.devices.device import all_devices, get_device from brian2.synapses.synapses import Synapses, SynapticPathway from brian2.utils.filetools import copy_directory, ensure_directory from brian2.utils.stringtools import get_identifiers, stripped_deindented_lines from brian2.codegen.generators.cpp_generator import c_data_type from brian2.utils.logger import get_logger -from brian2.units import second +from brian2.units import second, msecond from brian2.monitors import SpikeMonitor, StateMonitor, EventMonitor from brian2.groups import Subgroup @@ -128,6 +128,13 @@ def __init__(self): # List of names of all variables which are only required on host and will not # be copied to device memory self.variables_on_host_only = [] + # Report self.timers + self.report_timers = False + self.timers_file = None + self.timers['run_binary'] = { + 'initialization': None, 'simulation_loop': None, 'finalization': None + } + def get_array_name(self, var, access_data=True, prefix=None): ''' @@ -1204,6 +1211,7 @@ def generate_makefile(self, writer, cpp_compiler, cpp_compiler_flags, cpp_linker compiler_debug_flags = '' linker_debug_flags = '' + # TODO: CHECK THIS IS WORKING? 
if disable_asserts: # NDEBUG precompiler macro disables asserts (both for C++ and CUDA) nvcc_compiler_flags += ['-NDEBUG'] @@ -1485,6 +1493,41 @@ def build(self, directory='output', if run: logger.info("Running CUDA standalone simulation...") self.run(directory, with_output, run_args) + if self.report_timers: + # Read standalone timers from file, using same code we used for + # benchmarks in brian2/tests/features/base.py (brian2.diff) + assert self.timers_file is not None + # Load timers from standalone project + cpp_timers = {} + timers_file_path = os.path.join(directory, self.timers_file) + if os.path.exists(timers_file_path): + with open(timers_file_path, "r") as f: + for line in f.readlines(): + name, time = line.split() + if time == "None": + time = None + else: + # We record in microseconds, convert to seconds + time = float(time) / 1e6 + cpp_timers[name] = time + else: + logger.error( + f"timers_file_path not found at {timers_file_path}" + ) + + self.timers['run_binary']['initialization'] = ( + cpp_timers['before_network_run'] - cpp_timers['before_start'] + ) + self.timers['run_binary']['simulation_loop'] = ( + cpp_timers['after_network_run'] + - cpp_timers['before_network_run'] + ) + self.timers['run_binary']['finalization'] = ( + cpp_timers['after_end'] - cpp_timers['after_network_run'] + ) + + print() + print(computation_time_summary()) def network_run(self, net, duration, report=None, report_period=10*second, namespace=None, profile=False, level=0, **kwds): @@ -1511,6 +1554,9 @@ def network_run(self, net, duration, report=None, report_period=10*second, for clock in net._clocks: clock.set_interval(net.t, t_end) + if 'report_timers' in self.build_options: + self.report_timers = self.build_options.pop('report_timers') + # Get the local namespace if namespace is None: namespace = get_local_namespace(level=level+2) @@ -1672,6 +1718,39 @@ def network_run(self, net, duration, report=None, report_period=10*second, if clock not in all_clocks: run_lines.append(f'{net.name}.add(&{clock.name}, NULL);') + # Insert timer code when reporting timers is enabled (this needs to happen + # before the before/after_network_run slots are added to run_lines + if self.report_timers: + # Need chrono header for timing functions + prefs.codegen.cpp.headers += [""] + from .tests.features.cuda_configuration import ( + SETUP_TIMER, TIME_DIFF, CLOSE_TIMER + ) + # Insert code for timers, file path is relative to main.cu + self.timers_file = os.path.join('results', 'timers') + self.insert_code("before_start", SETUP_TIMER.format(fname=self.timers_file)) + self.insert_code("before_start", TIME_DIFF.format(name="before_start")) + self.insert_code( + "before_start", + r'LOG_INFO("%s", "Initializing standalone simulation...\n");' + ) + self.insert_code( + "before_network_run", TIME_DIFF.format(name="before_network_run") + ) + self.insert_code( + "before_network_run", + r'LOG_INFO("%s", "Starting simulation loop...\n");' + ) + self.insert_code( + "after_network_run", TIME_DIFF.format(name="after_network_run") + ) + self.insert_code( + "before_network_run", + r'LOG_INFO("%s", "Finalizing standalone simulation...\n");' + ) + self.insert_code("after_end", TIME_DIFF.format(name="after_end")) + self.insert_code("after_end", CLOSE_TIMER) + run_lines.extend(self.code_lines['before_network_run']) # run everything that is run on a clock run_lines.append( @@ -1733,6 +1812,7 @@ def network_run(self, net, duration, report=None, report_period=10*second, self.build(direct_call=False, **self.build_options) self.first_run = False + 
def fill_with_array(self, var, *args, **kwargs): # If the delay variable is set after the first run call, do not delete it on the # device (which is happening by default) @@ -1759,6 +1839,75 @@ def network_restore(self, net, *args, **kwds): 'supported in CUDA standalone')) +class ComputationTimeSummary(object): + """ + Class to nicely display the contribution of different computations times. Objects of + this class are returned by `computation_time_summary`. + """ + def __init__(self, timers): + if timers is not None: + names = timers.keys() + times = timers.values() + else: # Happens if report_timers is False + # Use a dummy entry to prevent problems with empty lists later + names = ['no computation tims have been recorded'] + times = [0*second] + self.total_time = sum(times) + self.time_unit = msecond + if self.total_time>1*second: + self.time_unit = second + if self.total_time>0*second: + self.percentages = [100.0*time/self.total_time for time in times] + else: + self.percentages = [0. for _ in times] + self.names_maxlen = max(len(name) for name in names) + self.names = [name+' '*(self.names_maxlen-len(name)) for name in names] + self.times = times + + def __repr__(self): + times = [f'{time / self.time_unit:.2f} {self.time_unit}' for time in self.times] + times_maxlen = max(len(time) for time in times) + times = [' '*(times_maxlen-len(time))+time for time in times] + percentages = [f'{percentage:.2f} %' for percentage in self.percentages] + percentages_maxlen = max(len(percentage) for percentage in percentages) + percentages = [(' '*(percentages_maxlen-len(percentage)))+percentage for percentage in percentages] + + s = 'Computation time summary' + s += f"\n{'=' * len(s)}\n" + for name, time, percentage in zip(self.names, times, percentages): + s += f'{name} {time} {percentage}\n' + return s + + def _repr_html_(self): + times = [f'{time / self.time_unit:.2f} {self.time_unit}' for time in self.times] + percentages = [f'{percentage:.2f} %' for percentage in self.percentages] + s = '
<table><tr><th colspan="3">Computation time summary</th></tr>\n'
+        s += '\n'
+        for name, time, percentage in zip(self.names, times, percentages):
+            s += '<tr>'
+            s += f'<td>{name}</td>'
+            s += f'<td>{time}</td>'
+            s += f'<td>{percentage}</td>'
+            s += '</tr>\n'
+        s += '</table>
{name}{time}{percentage}
' + return s + + +def computation_time_summary(): + """ + Returns a `ComputationTimeSummary` of the profiling info for a run. This object + can be transformed to a string explicitly but on an interactive console + simply calling `profiling_summary` is enough since it will + automatically convert the `ProfilingSummary` object. + """ + device_timers = get_device().timers + pretty_times = { + 'Compilation': sum(filter(None, device_timers['compile'].values())) * second, + 'Initialization': device_timers['run_binary']['initialization'] * second, + 'Simulation loop': device_timers['run_binary']['simulation_loop'] * second, + 'Finalization': device_timers['run_binary']['finalization'] * second + } + return ComputationTimeSummary(pretty_times) def prepare_codeobj_code_for_rng(codeobj): ''' diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py index 77bd51e5..906d36b6 100644 --- a/brian2cuda/tests/features/cuda_configuration.py +++ b/brian2cuda/tests/features/cuda_configuration.py @@ -164,7 +164,7 @@ def after_run(self): with open(python_benchmark_path, "w") as file: # Timer for `brian.run()` call recorded in `TimedSpeedTest.timed_run()` file.write(f"run_brian {self.feature_test.runtime}\n") - # Timers for compilcation and binary execution recorded in + # Timers for compilation and binary execution recorded in # `device.compile_source()` and `device.run()` for key, value in device.timers.items(): if isinstance(value, float): From 37d69e78c88e3ce952a0642555a291833799b2ff Mon Sep 17 00:00:00 2001 From: Denis Alevi Date: Sun, 30 Oct 2022 23:02:30 +0100 Subject: [PATCH 6/8] Report standalone section starts independent of `report_timers` --- brian2cuda/device.py | 14 ++------------ brian2cuda/templates/main.cu | 1 + 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/brian2cuda/device.py b/brian2cuda/device.py index 26353831..fcd2fe9d 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -1730,32 +1730,22 @@ def network_run(self, net, duration, report=None, report_period=10*second, self.timers_file = os.path.join('results', 'timers') self.insert_code("before_start", SETUP_TIMER.format(fname=self.timers_file)) self.insert_code("before_start", TIME_DIFF.format(name="before_start")) - self.insert_code( - "before_start", - r'LOG_INFO("%s", "Initializing standalone simulation...\n");' - ) self.insert_code( "before_network_run", TIME_DIFF.format(name="before_network_run") ) - self.insert_code( - "before_network_run", - r'LOG_INFO("%s", "Starting simulation loop...\n");' - ) self.insert_code( "after_network_run", TIME_DIFF.format(name="after_network_run") ) - self.insert_code( - "before_network_run", - r'LOG_INFO("%s", "Finalizing standalone simulation...\n");' - ) self.insert_code("after_end", TIME_DIFF.format(name="after_end")) self.insert_code("after_end", CLOSE_TIMER) run_lines.extend(self.code_lines['before_network_run']) + run_lines.append(r'LOG_INFO("%s", "Starting simulation loop...\n");') # run everything that is run on a clock run_lines.append( f'{net.name}.run({float(duration)!r}, {report_call}, {float(report_period)!r});' ) + run_lines.append(r'LOG_INFO("%s", "Finalizing standalone simulation...\n");') run_lines.extend(self.code_lines['after_network_run']) # for multiple runs, the random number buffer needs to be reset run_lines.append('random_number_buffer.run_finished();') diff --git a/brian2cuda/templates/main.cu b/brian2cuda/templates/main.cu index 5ae31f4e..e1dd2e55 100644 --- a/brian2cuda/templates/main.cu +++ 
b/brian2cuda/templates/main.cu @@ -27,6 +27,7 @@ int main(int argc, char **argv) { + LOG_INFO("%s", "Initializing standalone simulation...\n"); {{'\n'.join(code_lines['before_start'])|autoindent}} // seed variable set in Python through brian2.seed() calls can use this From 4b21313a1a9924a6c04b45885278ad99c706bce9 Mon Sep 17 00:00:00 2001 From: Denis Alevi Date: Tue, 1 Nov 2022 14:17:14 +0100 Subject: [PATCH 7/8] Implement `helpful` prefs for info prints --- brian2cuda/cuda_prefs.py | 5 +++++ brian2cuda/device.py | 32 +++++++++++++++++++++++++------- brian2cuda/templates/main.cu | 4 ++++ brian2cuda/utils/gputools.py | 8 +++++--- 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/brian2cuda/cuda_prefs.py b/brian2cuda/cuda_prefs.py index 7bcb3998..331878ec 100644 --- a/brian2cuda/cuda_prefs.py +++ b/brian2cuda/cuda_prefs.py @@ -40,6 +40,11 @@ def validate_bundle_size_expression(string): 'devices.cuda_standalone', 'Brian2CUDA preferences', + helpful=BrianPreference( + default=False, + docs="Give basic information message to help new Brian2CUDA users", + validator=lambda v: v in [True, False]), + SM_multiplier = BrianPreference( default=1, docs='The number of blocks per SM. By default, this value is set to 1.', diff --git a/brian2cuda/device.py b/brian2cuda/device.py index fcd2fe9d..3c9a9219 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -705,7 +705,8 @@ def generate_main_source(self, writer): report_func=self.report_func, dt=float(defaultclock.dt), user_headers=user_headers, - gpu_heap_size=prefs['devices.cuda_standalone.cuda_backend.gpu_heap_size'] + gpu_heap_size=prefs['devices.cuda_standalone.cuda_backend.gpu_heap_size'], + helpful=prefs.devices.cuda_standalone.helpful ) writer.write('main.cu', main_tmp) @@ -1488,10 +1489,16 @@ def build(self, directory='output', logger.debug(f"\t{pref_name} = {prefs[pref_name]}") if compile: - logger.info("Compiling CUDA standalone project...") + if prefs.devices.cuda_standalone.helpful: + logger.info("Compiling CUDA standalone project...") + else: + logger.debug("Compiling CUDA standalone project...") self.compile_source(directory, cpp_compiler, debug, clean) if run: - logger.info("Running CUDA standalone simulation...") + if prefs.devices.cuda_standalone.helpful: + logger.info("Running CUDA standalone simulation...") + else: + logger.debug("Running CUDA standalone simulation...") self.run(directory, with_output, run_args) if self.report_timers: # Read standalone timers from file, using same code we used for @@ -1526,8 +1533,7 @@ def build(self, directory='output', cpp_timers['after_end'] - cpp_timers['after_network_run'] ) - print() - print(computation_time_summary()) + print(f"\n{computation_time_summary()}") def network_run(self, net, duration, report=None, report_period=10*second, namespace=None, profile=False, level=0, **kwds): @@ -1740,12 +1746,24 @@ def network_run(self, net, duration, report=None, report_period=10*second, self.insert_code("after_end", CLOSE_TIMER) run_lines.extend(self.code_lines['before_network_run']) - run_lines.append(r'LOG_INFO("%s", "Starting simulation loop...\n");') + + if prefs.devices.cuda_standalone.helpful: + start_sim = r'LOG_INFO("%s", "Starting simulation loop...\n");' + else: + start_sim = r'LOG_DEBUG("%s", "Starting simulation loop...\n");' + run_lines.append(start_sim) + # run everything that is run on a clock run_lines.append( f'{net.name}.run({float(duration)!r}, {report_call}, {float(report_period)!r});' ) - run_lines.append(r'LOG_INFO("%s", "Finalizing standalone 
simulation...\n");') + + if prefs.devices.cuda_standalone.helpful: + start_fin = r'LOG_INFO("%s", "Finalizing standalone simulation...\n");' + else: + start_fin = r'LOG_DEBUG("%s", "Finalizing standalone simulation...\n");' + run_lines.append(start_fin) + run_lines.extend(self.code_lines['after_network_run']) # for multiple runs, the random number buffer needs to be reset run_lines.append('random_number_buffer.run_finished();') diff --git a/brian2cuda/templates/main.cu b/brian2cuda/templates/main.cu index e1dd2e55..dd9e08b5 100644 --- a/brian2cuda/templates/main.cu +++ b/brian2cuda/templates/main.cu @@ -27,8 +27,12 @@ int main(int argc, char **argv) { + {% if helpful %} LOG_INFO("%s", "Initializing standalone simulation...\n"); + {% else %} + LOG_DEBUG("%s", "Initializing standalone simulation...\n"); {{'\n'.join(code_lines['before_start'])|autoindent}} + {% endif %} // seed variable set in Python through brian2.seed() calls can use this // variable (see device.py CUDAStandaloneDevice.generate_main_source()) diff --git a/brian2cuda/utils/gputools.py b/brian2cuda/utils/gputools.py index 65f27139..3484d0ce 100644 --- a/brian2cuda/utils/gputools.py +++ b/brian2cuda/utils/gputools.py @@ -336,9 +336,11 @@ def _select_gpu(): if gpu_list is not None: gpu_name = f" ({gpu_list[gpu_id]})" - logger.debug( - f"Compiling device code for GPU {gpu_id}{gpu_name}" - ) + msg = f"Compiling device code for GPU {gpu_id}{gpu_name}" + if prefs.devices.cuda_standalone.helpful: + logger.info(msg) + else: + logger.debug(msg) return gpu_id, compute_capability From 2adc797f77b3dcb305da4b048fc0f26a7a85fb73 Mon Sep 17 00:00:00 2001 From: Denis Alevi Date: Tue, 1 Nov 2022 14:17:56 +0100 Subject: [PATCH 8/8] Report timers by default [temporarily] --- brian2cuda/device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brian2cuda/device.py b/brian2cuda/device.py index 3c9a9219..70d728c2 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -129,7 +129,7 @@ def __init__(self): # be copied to device memory self.variables_on_host_only = [] # Report self.timers - self.report_timers = False + self.report_timers = True self.timers_file = None self.timers['run_binary'] = { 'initialization': None, 'simulation_loop': None, 'finalization': None
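
---

For orientation, a minimal usage sketch of the features introduced by this series, seen from the Python side: the Brian2 console log level is forwarded to nvcc as a -DLOG_LEVEL_* define (patch 2), prefs.devices.cuda_standalone.helpful switches the INFO prints on (patch 7), and report_timers enables the ComputationTimeSummary report (patch 5). This is only a sketch based on the diffs above, not a released API; the model and the "output" directory name are arbitrary placeholders.

    from brian2 import *
    import brian2cuda  # registers the "cuda_standalone" device

    # The Brian2 console log level is forwarded to nvcc as -DLOG_LEVEL_<LEVEL>,
    # selecting which LOG_* macros are compiled into the generated CUDA code.
    prefs.logging.console_log_level = "DEBUG"

    # Opt in to the beginner-friendly INFO messages (patch 7).
    prefs.devices.cuda_standalone.helpful = True

    # report_timers=True (patch 5, temporarily the default after patch 8) makes
    # the standalone binary write results/timers and prints a
    # ComputationTimeSummary after the run.
    set_device("cuda_standalone", directory="output", report_timers=True)

    # Placeholder model, only here so the generated project has something to simulate.
    G = NeuronGroup(100, "dv/dt = -v / (10*ms) : 1", threshold="v > 1", reset="v = 0")
    run(100 * ms)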