From 4297ba58df5ec85085adcfeab2917dcd61d7e11e Mon Sep 17 00:00:00 2001
From: Amit Kumar
Date: Wed, 5 May 2021 16:30:34 +0530
Subject: [PATCH] Add more documentation

---
 sgkit/distance/api.py        |  2 +-
 sgkit/distance/metrics.py    | 15 +++++++++++++++
 sgkit/tests/test_distance.py |  2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/sgkit/distance/api.py b/sgkit/distance/api.py
index fa780189f..896f847fd 100644
--- a/sgkit/distance/api.py
+++ b/sgkit/distance/api.py
@@ -63,7 +63,7 @@ def pairwise_distance(
         Omit to let dask heuristically decide a good default.
         A default can also be set globally with the split_every key in dask.config.
     device
-        The architecture to run the calculation on, either of cpu or gpu
+        The architecture to run the calculation on, either of "cpu" or "gpu"
 
     Returns
     -------
diff --git a/sgkit/distance/metrics.py b/sgkit/distance/metrics.py
index 2e96472d3..5d45ce9e9 100644
--- a/sgkit/distance/metrics.py
+++ b/sgkit/distance/metrics.py
@@ -193,6 +193,18 @@ def call_metric_kernel(
     out = np.zeros((f.shape[0], g.shape[0], N_MAP_PARAM[metric]), dtype=f.dtype)
     d_out = cuda.to_device(out)
 
+    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications__technical-specifications-per-compute-capability
+    # These limits apply to compute capability 2.0 and higher, and all GPUs
+    # NVIDIA has shipped in the past 10+ years have compute capability > 3.0.
+    # One way to get the compute capability programmatically is via:
+    # from numba import cuda
+    # cuda.get_current_device().compute_capability
+
+    # In the future, when the average GPU supports a higher number of
+    # threads per block, we can increase this value, parameterise it
+    # from the pairwise function, or derive the maximum possible value
+    # for a given compute capability.
+
     threads_per_block = (32, 32)
     blocks_per_grid = (
         math.ceil(out.shape[0] / threads_per_block[0]),
@@ -211,6 +223,9 @@ def _correlation(
 ) -> None:  # pragma: no cover.
     # Note: assigning variable and only saving the final value in the
     # array made this significantly faster.
+
+    # Explicitly typing every variable makes this kernel
+    # roughly 2-3x faster.
     v0 = types.float32(0)
     v1 = types.float32(0)
     v2 = types.float32(0)
diff --git a/sgkit/tests/test_distance.py b/sgkit/tests/test_distance.py
index 97c7977fd..908109b7b 100644
--- a/sgkit/tests/test_distance.py
+++ b/sgkit/tests/test_distance.py
@@ -17,7 +17,7 @@ def detect_cuda_driver() -> bool:
     try:
-        return bool(len(cuda.list_devices()))
+        return len(cuda.list_devices()) > 0
     except cuda.CudaSupportError:
         return False
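
Note (not part of the patch): the launch configuration documented in call_metric_kernel above can be
illustrated with a minimal, self-contained sketch. The toy kernel and array shape below are hypothetical
stand-ins, not sgkit code; the sketch only shows how threads_per_block, the compute-capability query, and
the ceil-division blocks_per_grid fit together with numba.cuda.

    import math

    import numpy as np
    from numba import cuda


    @cuda.jit
    def add_one_kernel(x):
        # Toy 2-D kernel: each thread handles one element of x.
        i, j = cuda.grid(2)
        if i < x.shape[0] and j < x.shape[1]:
            x[i, j] += 1.0


    x = np.zeros((1000, 700), dtype=np.float32)
    d_x = cuda.to_device(x)

    # The per-block thread limit is 1024 for compute capability 2.0 and higher,
    # hence the 32 * 32 = 1024 block shape used in the patch.
    print(cuda.get_current_device().compute_capability)  # e.g. (7, 5)
    threads_per_block = (32, 32)

    # One block per 32x32 tile of the array, rounded up so the whole array is
    # covered; the bounds check inside the kernel handles the overhang.
    blocks_per_grid = (
        math.ceil(x.shape[0] / threads_per_block[0]),
        math.ceil(x.shape[1] / threads_per_block[1]),
    )
    add_one_kernel[blocks_per_grid, threads_per_block](d_x)
    x = d_x.copy_to_host()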