From 4297ba58df5ec85085adcfeab2917dcd61d7e11e Mon Sep 17 00:00:00 2001
From: Amit Kumar
Date: Wed, 5 May 2021 16:30:34 +0530
Subject: [PATCH] Add more documentation

---
 sgkit/distance/api.py        |  2 +-
 sgkit/distance/metrics.py    | 15 +++++++++++++++
 sgkit/tests/test_distance.py |  2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/sgkit/distance/api.py b/sgkit/distance/api.py
index fa780189f..896f847fd 100644
--- a/sgkit/distance/api.py
+++ b/sgkit/distance/api.py
@@ -63,7 +63,7 @@ def pairwise_distance(
         Omit to let dask heuristically decide a good default.
         A default can also be set globally with the split_every key in dask.config.
     device
-        The architecture to run the calculation on, either of cpu or gpu
+        The architecture to run the calculation on, either of "cpu" or "gpu"
 
     Returns
     -------
diff --git a/sgkit/distance/metrics.py b/sgkit/distance/metrics.py
index 2e96472d3..5d45ce9e9 100644
--- a/sgkit/distance/metrics.py
+++ b/sgkit/distance/metrics.py
@@ -193,6 +193,18 @@ def call_metric_kernel(
     out = np.zeros((f.shape[0], g.shape[0], N_MAP_PARAM[metric]), dtype=f.dtype)
     d_out = cuda.to_device(out)
 
+    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications__technical-specifications-per-compute-capability
+    # These limits apply to compute capability 2.0 and higher, and all GPUs
+    # NVIDIA has shipped in the past 10+ years have compute capability > 3.0.
+    # One way to get the compute capability programmatically is via:
+    # from numba import cuda
+    # cuda.get_current_device().compute_capability
+
+    # In the future, when the average GPU supports a higher number of
+    # threads per block, we can increase this value, parameterise it
+    # from the pairwise function, or derive the maximum possible value
+    # for a given compute capability.
+
     threads_per_block = (32, 32)
     blocks_per_grid = (
         math.ceil(out.shape[0] / threads_per_block[0]),
@@ -211,6 +223,9 @@ def _correlation(
 ) -> None:  # pragma: no cover.
     # Note: assigning variable and only saving the final value in the
     # array made this significantly faster.
+
+    # Explicitly typing every variable makes this kernel
+    # roughly 2-3x faster.
     v0 = types.float32(0)
     v1 = types.float32(0)
     v2 = types.float32(0)
diff --git a/sgkit/tests/test_distance.py b/sgkit/tests/test_distance.py
index 97c7977fd..908109b7b 100644
--- a/sgkit/tests/test_distance.py
+++ b/sgkit/tests/test_distance.py
@@ -17,7 +17,7 @@ def detect_cuda_driver() -> bool:
     try:
-        return bool(len(cuda.list_devices()))
+        return len(cuda.list_devices()) > 0
     except cuda.CudaSupportError:
         return False
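
Note (not part of the patch): the launch configuration documented in call_metric_kernel above can be
illustrated with a minimal, self-contained sketch. The toy kernel and array shape below are hypothetical
stand-ins, not sgkit code; the sketch only shows how threads_per_block, the compute-capability query, and
the ceil-division blocks_per_grid fit together with numba.cuda.

    import math

    import numpy as np
    from numba import cuda


    @cuda.jit
    def add_one_kernel(x):
        # Toy 2-D kernel: each thread handles one element of x.
        i, j = cuda.grid(2)
        if i < x.shape[0] and j < x.shape[1]:
            x[i, j] += 1.0


    x = np.zeros((1000, 700), dtype=np.float32)
    d_x = cuda.to_device(x)

    # The per-block thread limit is 1024 for compute capability 2.0 and higher,
    # hence the 32 * 32 = 1024 block shape used in the patch.
    print(cuda.get_current_device().compute_capability)  # e.g. (7, 5)
    threads_per_block = (32, 32)

    # One block per 32x32 tile of the array, rounded up so the whole array is
    # covered; the bounds check inside the kernel handles the overhang.
    blocks_per_grid = (
        math.ceil(x.shape[0] / threads_per_block[0]),
        math.ceil(x.shape[1] / threads_per_block[1]),
    )
    add_one_kernel[blocks_per_grid, threads_per_block](d_x)
    x = d_x.copy_to_host()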