From 092ee677647a35e3967849635ebf9fb8f888d3b4 Mon Sep 17 00:00:00 2001
From: Tom Deakin
Date: Mon, 12 Jun 2023 15:49:59 +0100
Subject: [PATCH] Change CUDA DOT thread-blocks to 1024

This improves the performance on Ampere (A100) GPUs.
Fixes #137.
---
 src/cuda/CUDAStream.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h
index 83b8c665..bb3f8665 100644
--- a/src/cuda/CUDAStream.h
+++ b/src/cuda/CUDAStream.h
@@ -22,7 +22,7 @@
 #endif
 
 #define TBSIZE 1024
-#define DOT_NUM_BLOCKS 256
+#define DOT_NUM_BLOCKS 1024
 
 template <class T>
 class CUDAStream : public Stream<T>