From 59235bf537b2e477fd63c506aaf31a2b6402e08d Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Tue, 14 Jul 2020 15:45:26 -0600
Subject: [PATCH 01/14] WIP Test Commit

---
 cpp/src/tsne/barnes_hut.cuh     | 170 ++++++++++++++------------------
 cpp/src/tsne/bh_kernels.cuh     |  61 +++++++++---
 cpp/src/tsne/exact_kernels.cuh  |   8 +-
 cpp/src_prims/selection/knn.cuh |   1 +
 4 files changed, 125 insertions(+), 115 deletions(-)
diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index 56ea5b8a0a..c308da687b 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <common/cudart_utils.h>
+#include <common/device_buffer.hpp>
 #include <cuml/common/logger.hpp>
 #include "bh_kernels.cuh"
 #include "utils.cuh"
@@ -55,6 +56,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
                 const int max_iter = 1000, const float min_grad_norm = 1e-7,
                 const float pre_momentum = 0.5, const float post_momentum = 0.8,
                 const long long random_state = -1) {
+  using MLCommon::device_buffer;
   auto d_alloc = handle.getDeviceAllocator();
   cudaStream_t stream = handle.getStream();
 
@@ -71,71 +73,67 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   // Allocate more space
   //---------------------------------------------------
   //int *errl = (int *)d_alloc->allocate(sizeof(int), stream);
-  unsigned *limiter = (unsigned *)d_alloc->allocate(sizeof(unsigned), stream);
-  int *maxdepthd = (int *)d_alloc->allocate(sizeof(int), stream);
-  int *bottomd = (int *)d_alloc->allocate(sizeof(int), stream);
-  float *radiusd = (float *)d_alloc->allocate(sizeof(float), stream);
+  device_buffer<unsigned> limiter(d_alloc, stream, 1);
+  device_buffer<int> maxdepthd(d_alloc, stream, 1);
+  device_buffer<int> bottomd(d_alloc, stream, 1);
+  device_buffer<float> radiusd(d_alloc, stream, 1);
+
+  TSNE::InitializationKernel<<<1, 1, 0, stream>>>(/*errl,*/ limiter.data(), maxdepthd.data(),
+                                                  radiusd.data());
 
-  TSNE::InitializationKernel<<<1, 1, 0, stream>>>(/*errl,*/ limiter, maxdepthd,
-                                                  radiusd);
   CUDA_CHECK(cudaPeekAtLastError());
 
   const int FOUR_NNODES = 4 * nnodes;
   const int FOUR_N = 4 * n;
   const float theta_squared = theta * theta;
   const int NNODES = nnodes;
+  bool h_chk_nan;
 
   // Actual mallocs
-  int *startl = (int *)d_alloc->allocate(sizeof(int) * (nnodes + 1), stream);
-  int *childl =
-    (int *)d_alloc->allocate(sizeof(int) * (nnodes + 1) * 4, stream);
-  float *massl =
-    (float *)d_alloc->allocate(sizeof(float) * (nnodes + 1), stream);
-  thrust::device_ptr<float> begin_massl = thrust::device_pointer_cast(massl);
+  device_buffer<int> startl(d_alloc, stream, nnodes +1);
+  device_buffer<int> childl(d_alloc, stream, (nnodes + 1) * 4);
+  device_buffer<float> massl(d_alloc, stream, nnodes + 1);
+  thrust::device_ptr<float> begin_massl = thrust::device_pointer_cast(massl.data());
   thrust::fill(thrust::cuda::par.on(stream), begin_massl,
                begin_massl + (nnodes + 1), 1.0f);
 
-  float *maxxl =
-    (float *)d_alloc->allocate(sizeof(float) * blocks * FACTOR1, stream);
-  float *maxyl =
-    (float *)d_alloc->allocate(sizeof(float) * blocks * FACTOR1, stream);
-  float *minxl =
-    (float *)d_alloc->allocate(sizeof(float) * blocks * FACTOR1, stream);
-  float *minyl =
-    (float *)d_alloc->allocate(sizeof(float) * blocks * FACTOR1, stream);
+  device_buffer<float> maxxl(d_alloc, stream, blocks * FACTOR1);
+  device_buffer<float> maxyl(d_alloc, stream, blocks * FACTOR1);
+  device_buffer<float> minxl(d_alloc, stream, blocks * FACTOR1);
+  device_buffer<float> minyl(d_alloc, stream, blocks * FACTOR1);
 
   // SummarizationKernel
-  int *countl = (int *)d_alloc->allocate(sizeof(int) * (nnodes + 1), stream);
+  device_buffer<int> countl(d_alloc, stream, nnodes + 1);
 
   // SortKernel
-  int *sortl = (int *)d_alloc->allocate(sizeof(int) * (nnodes + 1), stream);
+  device_buffer<int> sortl(d_alloc, stream, nnodes + 1);
 
   // RepulsionKernel
-  float *rep_forces =
-    (float *)d_alloc->allocate(sizeof(float) * (nnodes + 1) * 2, stream);
-  float *attr_forces = (float *)d_alloc->allocate(
-    sizeof(float) * n * 2, stream);  // n*2 double for reduction sum
+  device_buffer<float> rep_forces(d_alloc, stream, (nnodes + 1) * 2);
+  device_buffer<float> attr_forces(d_alloc, stream, n * 2);
 
-  float *norm_add1 = (float *)d_alloc->allocate(sizeof(float) * n, stream);
-  float *norm = (float *)d_alloc->allocate(sizeof(float) * n, stream);
-  float *Z_norm = (float *)d_alloc->allocate(sizeof(float), stream);
+  device_buffer<float> norm_add1(d_alloc, stream, n);
+  device_buffer<float> norm(d_alloc, stream, n);
+  device_buffer<float> Z_norm(d_alloc, stream, 1);
 
-  float *radiusd_squared = (float *)d_alloc->allocate(sizeof(float), stream);
+  device_buffer<float> radiusd_squared(d_alloc, stream, 1);
 
   // Apply
-  float *gains_bh = (float *)d_alloc->allocate(sizeof(float) * n * 2, stream);
+  device_buffer<float> gains_bh(d_alloc, stream, n * 2);
   thrust::device_ptr<float> begin_gains_bh =
-    thrust::device_pointer_cast(gains_bh);
+    thrust::device_pointer_cast(gains_bh.data());
   thrust::fill(thrust::cuda::par.on(stream), begin_gains_bh,
                begin_gains_bh + (n * 2), 1.0f);
 
-  float *old_forces = (float *)d_alloc->allocate(sizeof(float) * n * 2, stream);
-  CUDA_CHECK(cudaMemsetAsync(old_forces, 0, sizeof(float) * n * 2, stream));
+  device_buffer<float> old_forces(d_alloc, stream, n * 2);
+  CUDA_CHECK(cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream));
 
-  float *YY =
-    (float *)d_alloc->allocate(sizeof(float) * (nnodes + 1) * 2, stream);
-  random_vector(YY, -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, random_state);
-  ASSERT(YY != NULL && rep_forces != NULL, "[ERROR] Possibly no more memory");
+  device_buffer<float> YY(d_alloc, stream, (nnodes + 1) * 2);
+  device_buffer<float> YY_prev(d_alloc, stream, (nnodes + 1) * 2);
+  random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, random_state);
+  ASSERT(YY.data() != NULL
+    && YY_prev.data() != NULL
+    && rep_forces.data() != NULL, "[ERROR] Possibly no more memory");
 
   // Set cache levels for faster algorithm execution
   //---------------------------------------------------
@@ -148,7 +146,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   cudaFuncSetCacheConfig(TSNE::RepulsionKernel, cudaFuncCachePreferL1);
   cudaFuncSetCacheConfig(TSNE::attractive_kernel_bh, cudaFuncCachePreferL1);
   cudaFuncSetCacheConfig(TSNE::IntegrationKernel, cudaFuncCachePreferL1);
-
   // Do gradient updates
   //---------------------------------------------------
   CUML_LOG_DEBUG("Start gradient updates!");
@@ -158,10 +155,13 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
 
   for (int iter = 0; iter < max_iter; iter++) {
     CUDA_CHECK(
-      cudaMemsetAsync(rep_forces, 0, sizeof(float) * (nnodes + 1) * 2, stream));
-    CUDA_CHECK(cudaMemsetAsync(attr_forces, 0, sizeof(float) * n * 2, stream));
-    TSNE::Reset_Normalization<<<1, 1, 0, stream>>>(Z_norm, radiusd_squared,
-                                                   bottomd, NNODES, radiusd);
+      cudaMemsetAsync(rep_forces.data(), 0, sizeof(float) * (nnodes + 1) * 2, stream));
+    CUDA_CHECK(cudaMemsetAsync(attr_forces.data(), 0, sizeof(float) * n * 2, stream));
+
+    MLCommon::copy(YY_prev.data(), YY.data(), (nnodes + 1) * 2, stream);
+    TSNE::Reset_Normalization<<<1, 1, 0, stream>>>(Z_norm.data(), radiusd_squared.data(),
+                                                   bottomd.data(),
+                                                   NNODES, radiusd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     if (iter == exaggeration_iter) {
@@ -173,14 +173,14 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
 
     START_TIMER;
     TSNE::BoundingBoxKernel<<<blocks * FACTOR1, THREADS1, 0, stream>>>(
-      startl, childl, massl, YY, YY + nnodes + 1, maxxl, maxyl, minxl, minyl,
-      FOUR_NNODES, NNODES, n, limiter, radiusd);
+      startl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, maxxl.data(), maxyl.data(),
+      minxl.data(), minyl.data(), FOUR_NNODES, NNODES, n, limiter.data(), radiusd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(BoundingBoxKernel_time);
 
     START_TIMER;
-    TSNE::ClearKernel1<<<blocks, 1024, 0, stream>>>(childl, FOUR_NNODES,
+    TSNE::ClearKernel1<<<blocks, 1024, 0, stream>>>(childl.data(), FOUR_NNODES,
                                                     FOUR_N);
     CUDA_CHECK(cudaPeekAtLastError());
 
@@ -188,113 +188,89 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
 
     START_TIMER;
     TSNE::TreeBuildingKernel<<<blocks * FACTOR2, THREADS2, 0, stream>>>(
-      /*errl,*/ childl, YY, YY + nnodes + 1, NNODES, n, maxdepthd, bottomd,
-      radiusd);
+      /*errl,*/ childl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n, maxdepthd.data(), bottomd.data(),
+      radiusd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(TreeBuildingKernel_time);
 
     START_TIMER;
-    TSNE::ClearKernel2<<<blocks * 1, 1024, 0, stream>>>(startl, massl, NNODES,
-                                                        bottomd);
+    TSNE::ClearKernel2<<<blocks * 1, 1024, 0, stream>>>(startl.data(), massl.data(), NNODES,
+                                                        bottomd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(ClearKernel2_time);
 
     START_TIMER;
     TSNE::SummarizationKernel<<<blocks * FACTOR3, THREADS3, 0, stream>>>(
-      countl, childl, massl, YY, YY + nnodes + 1, NNODES, n, bottomd);
+      countl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n, bottomd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(SummarizationKernel_time);
 
     START_TIMER;
     TSNE::SortKernel<<<blocks * FACTOR4, THREADS4, 0, stream>>>(
-      sortl, countl, startl, childl, NNODES, n, bottomd);
+      sortl.data(), countl.data(), startl.data(), childl.data(), NNODES, n, bottomd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(SortKernel_time);
 
     START_TIMER;
     TSNE::RepulsionKernel<<<blocks * FACTOR5, THREADS5, 0, stream>>>(
-      /*errl,*/ theta, epssq, sortl, childl, massl, YY, YY + nnodes + 1,
-      rep_forces, rep_forces + nnodes + 1, Z_norm, theta_squared, NNODES,
-      FOUR_NNODES, n, radiusd_squared, maxdepthd);
+      /*errl,*/ theta, epssq, sortl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1,
+      rep_forces.data(), rep_forces.data() + nnodes + 1, Z_norm.data(), theta_squared, NNODES,
+      FOUR_NNODES, n, radiusd_squared.data(), maxdepthd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(RepulsionTime);
 
     START_TIMER;
-    TSNE::Find_Normalization<<<1, 1, 0, stream>>>(Z_norm, n);
+    TSNE::Find_Normalization<<<1, 1, 0, stream>>>(Z_norm.data(), n);
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(Reduction_time);
 
     START_TIMER;
     TSNE::get_norm<<<MLCommon::ceildiv(n, 1024), 1024, 0, stream>>>(
-      YY, YY + nnodes + 1, norm, norm_add1, n);
+      YY.data(), YY.data() + nnodes + 1, norm.data(), norm_add1.data(), n);
     CUDA_CHECK(cudaPeekAtLastError());
 
     // TODO: Calculate Kullback-Leibler divergence
     // For general embedding dimensions
     TSNE::
       attractive_kernel_bh<<<MLCommon::ceildiv(NNZ, 1024), 1024, 0, stream>>>(
-        VAL, COL, ROW, YY, YY + nnodes + 1, norm, norm_add1, attr_forces,
-        attr_forces + n, NNZ);
+        VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(), norm_add1.data(), attr_forces.data(),
+        attr_forces.data() + n, NNZ);
     CUDA_CHECK(cudaPeekAtLastError());
     END_TIMER(attractive_time);
 
     START_TIMER;
     TSNE::IntegrationKernel<<<blocks * FACTOR6, THREADS6, 0, stream>>>(
-      learning_rate, momentum, early_exaggeration, YY, YY + nnodes + 1,
-      attr_forces, attr_forces + n, rep_forces, rep_forces + nnodes + 1,
-      gains_bh, gains_bh + n, old_forces, old_forces + n, Z_norm, n);
+      learning_rate, momentum, early_exaggeration, YY.data(), YY.data() + nnodes + 1,
+      attr_forces.data(), attr_forces.data() + n, rep_forces.data(), rep_forces.data() + nnodes + 1,
+      gains_bh.data(), gains_bh.data() + n, old_forces.data(), old_forces.data() + n, Z_norm.data(), n);
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(IntegrationKernel_time);
+
+    h_chk_nan = thrust::transform_reduce(thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2,
+                                         NaNTestKernel(), 0, thrust::plus<bool>());
+    if (h_chk_nan) {
+      CUML_LOG_DEBUG("NaN result detected during Barnes Hut iteration: %d, returning last known good positions.", iter);
+      MLCommon::copy(YY.data(), YY_prev.data(), (nnodes + 1) * 2, stream);
+      break;
+    }
   }
   PRINT_TIMES;
 
   // Copy final YY into true output Y
   thrust::device_ptr<float> Y_begin = thrust::device_pointer_cast(Y);
-  thrust::copy(thrust::cuda::par.on(stream), YY, YY + n, Y_begin);
-  CUDA_CHECK(cudaPeekAtLastError());
 
-  thrust::copy(thrust::cuda::par.on(stream), YY + nnodes + 1,
-               YY + nnodes + 1 + n, Y_begin + n);
+  MLCommon::copy(Y, YY.data(), n, stream);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  // Deallocate everything
-  //d_alloc->deallocate(errl, sizeof(int), stream);
-  d_alloc->deallocate(limiter, sizeof(unsigned), stream);
-  d_alloc->deallocate(maxdepthd, sizeof(int), stream);
-  d_alloc->deallocate(bottomd, sizeof(int), stream);
-  d_alloc->deallocate(radiusd, sizeof(float), stream);
-
-  d_alloc->deallocate(startl, sizeof(int) * (nnodes + 1), stream);
-  d_alloc->deallocate(childl, sizeof(int) * (nnodes + 1) * 4, stream);
-  d_alloc->deallocate(massl, sizeof(float) * (nnodes + 1), stream);
-
-  d_alloc->deallocate(maxxl, sizeof(float) * blocks * FACTOR1, stream);
-  d_alloc->deallocate(maxyl, sizeof(float) * blocks * FACTOR1, stream);
-  d_alloc->deallocate(minxl, sizeof(float) * blocks * FACTOR1, stream);
-  d_alloc->deallocate(minyl, sizeof(float) * blocks * FACTOR1, stream);
-
-  d_alloc->deallocate(countl, sizeof(int) * (nnodes + 1), stream);
-  d_alloc->deallocate(sortl, sizeof(int) * (nnodes + 1), stream);
-
-  d_alloc->deallocate(rep_forces, sizeof(float) * (nnodes + 1) * 2, stream);
-  d_alloc->deallocate(attr_forces, sizeof(float) * n * 2, stream);
-  d_alloc->deallocate(norm, sizeof(float) * n, stream);
-  d_alloc->deallocate(norm_add1, sizeof(float) * n, stream);
-
-  d_alloc->deallocate(Z_norm, sizeof(float), stream);
-  d_alloc->deallocate(radiusd_squared, sizeof(float), stream);
-
-  d_alloc->deallocate(gains_bh, sizeof(float) * n * 2, stream);
-  d_alloc->deallocate(old_forces, sizeof(float) * n * 2, stream);
-
-  d_alloc->deallocate(YY, sizeof(float) * (nnodes + 1) * 2, stream);
+  MLCommon::copy(Y + n, YY.data() + nnodes + 1, n, stream);
+  CUDA_CHECK(cudaPeekAtLastError());
 }
 
 }  // namespace TSNE
diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index 27f27a5ca8..53ae0b0e84 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -40,6 +40,16 @@
 namespace ML {
 namespace TSNE {
 
+/**
+ * Unary op intended to be used as a Thrust transform, check if array elements are NaNs.
+ */
+struct NaNTestKernel{
+  __host__ __device__ bool operator()(const float a) const
+  {
+    return isnan(a);
+  }
+};
+
 /**
  * Intializes the states of objects. This speeds the overall kernel up.
  */
@@ -171,7 +181,8 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(int *restrict childd,
 }
 
 /**
- * Build the actual KD Tree.
+ * Build the actual QuadTree.
+ * See: https://iss.oden.utexas.edu/Publications/Papers/burtscher11.pdf
  */
 __global__ __launch_bounds__(
   THREADS2, FACTOR2) void TreeBuildingKernel(/* int *restrict errd, */
@@ -194,10 +205,11 @@ __global__ __launch_bounds__(
 
   int localmaxdepth = 1;
   int skip = 1;
+
   const int inc = blockDim.x * gridDim.x;
   int i = threadIdx.x + blockIdx.x * blockDim.x;
 
-  // iterate over all bodies assigned to thread
+  // Iterate over all bodies assigned to thread
   while (i < N) {
     if (skip != 0) {
       // new body, so start traversing at root
@@ -206,28 +218,34 @@ __global__ __launch_bounds__(
       depth = 1;
       r = radius * 0.5f;
 
+      /* Select child node 'j'
+                    rootx < px  rootx > px
+       * rooty < py   1 -> 3    0 -> 2
+       * rooty > py   1 -> 1    0 -> 0
+       */
       x = rootx + ((rootx < (px = posxd[i])) ? (j = 1, r) : (j = 0, -r));
 
       y = rooty + ((rooty < (py = posyd[i])) ? (j |= 2, r) : (-r));
     }
 
-    // follow path to leaf cell
+    // Follow path to leaf cell
     while ((ch = childd[n * 4 + j]) >= N) {
       n = ch;
       depth++;
       r *= 0.5f;
 
-      // determine which child to follow
       x += ((x < px) ? (j = 1, r) : (j = 0, -r));
 
       y += ((y < py) ? (j |= 2, r) : (-r));
     }
 
-    if (ch != -2) {
-      // skip if child pointer is locked and try again later
-      locked = n * 4 + j;
+    // (ch)ild will be '-1' (nullptr), '-2' (locked), or an Integer corresponding to a body offset
+    // in the lower [0, N) blocks of childd
+    if (ch != -2) { // skip if child pointer was locked when we examined it, and try again later.
+      locked = n * 4 + j; // store the locked position in case we need to patch in a cell later.
 
       if (ch == -1) {
+        // Child is a nullptr ('-1'), so we write our body index to the leaf, and move on to the next body.
         if (atomicCAS(&childd[locked], -1, i) == -1) {
           if (depth > localmaxdepth) localmaxdepth = depth;
 
@@ -235,23 +253,34 @@ __global__ __launch_bounds__(
           skip = 1;
         }
       } else {
+        // Child node isn't empty, so we store the current value of the child, lock the leaf, and patch in a new cell
         if (ch == atomicCAS(&childd[locked], ch, -2)) {
-          // try to lock
           patch = -1;
 
           while (ch >= 0) {
             depth++;
 
             const int cell = atomicSub(bottomd, 1) - 1;
-            if (cell <= N) {
+            if (cell == N) {
               // atomicExch(errd, 1);
-              atomicExch(bottomd, NNODES);
+              int prev_bottom = atomicExch(bottomd, NNODES);
+              //printf("Cell Allocation Overflow, depth=%d, r=%f, rbound=%f, N=%d, NNODES=%d, bottomd=%d\n",
+              //    depth, r, radius, N, NNODES, prev_bottom);
+            }
+            else if (cell < N) {
+              depth--;
+              continue;
             }
 
-            if (patch != -1) childd[n * 4 + j] = cell;
+            if (patch != -1) {
+              childd[n * 4 + j] = cell;
+            }
 
-            if (cell > patch) patch = cell;
+            if (cell > patch) {
+              patch = cell;
+            }
 
+            // Insert migrated child node
             j = (x < posxd[ch]) ? 1 : 0;
             if (y < posyd[ch]) j |= 2;
 
@@ -259,12 +288,16 @@ __global__ __launch_bounds__(
             n = cell;
             r *= 0.5f;
 
+
             x += ((x < px) ? (j = 1, r) : (j = 0, -r));
 
             y += ((y < py) ? (j |= 2, r) : (-r));
 
+            // Check the value of the j'th child at offset 'n' N <= n < 2*N, in childd
             ch = childd[n * 4 + j];
-            if (r <= 1e-10) break;
+            if (r <= 1e-10) {
+              break;
+            }
           }
 
           childd[n * 4 + j] = i;
@@ -276,12 +309,12 @@ __global__ __launch_bounds__(
         }
       }
     }
+
     __threadfence();
 
     if (skip == 2) childd[locked] = patch;
   }
 
-  // record maximum tree depth
   // if (localmaxdepth >= THREADS5)
   //   localmaxdepth = THREADS5 - 1;
   if (localmaxdepth > 32) localmaxdepth = 32;
diff --git a/cpp/src/tsne/exact_kernels.cuh b/cpp/src/tsne/exact_kernels.cuh
index b16ade068f..df0117f334 100644
--- a/cpp/src/tsne/exact_kernels.cuh
+++ b/cpp/src/tsne/exact_kernels.cuh
@@ -26,7 +26,7 @@ namespace ML {
 namespace TSNE {
 
 /****************************************/
-/* Finds the best guassian bandwith for
+/* Finds the best Gaussian bandwidth for
     each row in the dataset             */
 __global__ void sigmas_kernel(const float *restrict distances,
                               float *restrict P, const float perplexity,
@@ -45,7 +45,7 @@ __global__ void sigmas_kernel(const float *restrict distances,
   for (int step = 0; step < epochs; step++) {
     float sum_Pi = FLT_EPSILON;
 
-    // Exponentiate to get guassian
+    // Exponentiate to get Gaussian
     for (int j = 0; j < k; j++) {
       P[ik + j] = __expf(-distances[ik + j] * beta);
       sum_Pi += P[ik + j];
@@ -84,7 +84,7 @@ __global__ void sigmas_kernel(const float *restrict distances,
 }
 
 /****************************************/
-/* Finds the best guassian bandwith for
+/* Finds the best Gaussian bandwith for
     each row in the dataset             */
 __global__ void sigmas_kernel_2d(const float *restrict distances,
                                  float *restrict P, const float perplexity,
@@ -101,7 +101,7 @@ __global__ void sigmas_kernel_2d(const float *restrict distances,
   register const int ik = i * 2;
 
   for (int step = 0; step < epochs; step++) {
-    // Exponentiate to get guassian
+    // Exponentiate to get Gaussian
     P[ik] = __expf(-distances[ik] * beta);
     P[ik + 1] = __expf(-distances[ik + 1] * beta);
     const float sum_Pi = FLT_EPSILON + P[ik] + P[ik + 1];
diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh
index 841090d78c..9386b71f78 100644
--- a/cpp/src_prims/selection/knn.cuh
+++ b/cpp/src_prims/selection/knn.cuh
@@ -216,6 +216,7 @@ void brute_force_knn(std::vector<float *> &input, std::vector<int> &sizes,
   ASSERT(input.size() == sizes.size(),
          "input and sizes vectors should be the same size");
 
+
   faiss::MetricType m = build_faiss_metric(metric);
 
   std::vector<int64_t> *id_ranges;

From 0d3884ee9abea1fd474948e66873e8cbc0ea208f Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Wed, 15 Jul 2020 13:51:31 -0600
Subject: [PATCH 02/14] Add clang formatting fixes

---
 CHANGELOG.md                    |  1 +
 cpp/src/tsne/barnes_hut.cuh     | 78 +++++++++++++++++++--------------
 cpp/src/tsne/bh_kernels.cuh     | 33 ++++++--------
 cpp/src_prims/selection/knn.cuh |  1 -
 4 files changed, 60 insertions(+), 53 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 901e75f46e..a81c93425a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -83,6 +83,7 @@
 - PR #2535: Fix issue with incorrect docker image being used in local build script
 - PR #2542: Fix small memory leak in TSNE
 - PR #2552: Fixed the length argument of updateDevice calls in RF test
+- PR #2358: Fix cell allocation code to avoid loops in quad-tree. Prevent NaNs causing infinite descent
 
 # cuML 0.14.0 (03 Jun 2020)
 
diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index c308da687b..c9e15feec4 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -78,8 +78,8 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   device_buffer<int> bottomd(d_alloc, stream, 1);
   device_buffer<float> radiusd(d_alloc, stream, 1);
 
-  TSNE::InitializationKernel<<<1, 1, 0, stream>>>(/*errl,*/ limiter.data(), maxdepthd.data(),
-                                                  radiusd.data());
+  TSNE::InitializationKernel<<<1, 1, 0, stream>>>(
+    /*errl,*/ limiter.data(), maxdepthd.data(), radiusd.data());
 
   CUDA_CHECK(cudaPeekAtLastError());
 
@@ -90,10 +90,11 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   bool h_chk_nan;
 
   // Actual mallocs
-  device_buffer<int> startl(d_alloc, stream, nnodes +1);
+  device_buffer<int> startl(d_alloc, stream, nnodes + 1);
   device_buffer<int> childl(d_alloc, stream, (nnodes + 1) * 4);
   device_buffer<float> massl(d_alloc, stream, nnodes + 1);
-  thrust::device_ptr<float> begin_massl = thrust::device_pointer_cast(massl.data());
+  thrust::device_ptr<float> begin_massl =
+    thrust::device_pointer_cast(massl.data());
   thrust::fill(thrust::cuda::par.on(stream), begin_massl,
                begin_massl + (nnodes + 1), 1.0f);
 
@@ -126,14 +127,16 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
                begin_gains_bh + (n * 2), 1.0f);
 
   device_buffer<float> old_forces(d_alloc, stream, n * 2);
-  CUDA_CHECK(cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream));
+  CUDA_CHECK(
+    cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream));
 
   device_buffer<float> YY(d_alloc, stream, (nnodes + 1) * 2);
   device_buffer<float> YY_prev(d_alloc, stream, (nnodes + 1) * 2);
-  random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, random_state);
-  ASSERT(YY.data() != NULL
-    && YY_prev.data() != NULL
-    && rep_forces.data() != NULL, "[ERROR] Possibly no more memory");
+  random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream,
+                random_state);
+  ASSERT(
+    YY.data() != NULL && YY_prev.data() != NULL && rep_forces.data() != NULL,
+    "[ERROR] Possibly no more memory");
 
   // Set cache levels for faster algorithm execution
   //---------------------------------------------------
@@ -154,14 +157,15 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   float learning_rate = pre_learning_rate;
 
   for (int iter = 0; iter < max_iter; iter++) {
+    CUDA_CHECK(cudaMemsetAsync(rep_forces.data(), 0,
+                               sizeof(float) * (nnodes + 1) * 2, stream));
     CUDA_CHECK(
-      cudaMemsetAsync(rep_forces.data(), 0, sizeof(float) * (nnodes + 1) * 2, stream));
-    CUDA_CHECK(cudaMemsetAsync(attr_forces.data(), 0, sizeof(float) * n * 2, stream));
+      cudaMemsetAsync(attr_forces.data(), 0, sizeof(float) * n * 2, stream));
 
     MLCommon::copy(YY_prev.data(), YY.data(), (nnodes + 1) * 2, stream);
-    TSNE::Reset_Normalization<<<1, 1, 0, stream>>>(Z_norm.data(), radiusd_squared.data(),
-                                                   bottomd.data(),
-                                                   NNODES, radiusd.data());
+    TSNE::Reset_Normalization<<<1, 1, 0, stream>>>(
+      Z_norm.data(), radiusd_squared.data(), bottomd.data(), NNODES,
+      radiusd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     if (iter == exaggeration_iter) {
@@ -173,8 +177,9 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
 
     START_TIMER;
     TSNE::BoundingBoxKernel<<<blocks * FACTOR1, THREADS1, 0, stream>>>(
-      startl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, maxxl.data(), maxyl.data(),
-      minxl.data(), minyl.data(), FOUR_NNODES, NNODES, n, limiter.data(), radiusd.data());
+      startl.data(), childl.data(), massl.data(), YY.data(),
+      YY.data() + nnodes + 1, maxxl.data(), maxyl.data(), minxl.data(),
+      minyl.data(), FOUR_NNODES, NNODES, n, limiter.data(), radiusd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(BoundingBoxKernel_time);
@@ -188,37 +193,40 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
 
     START_TIMER;
     TSNE::TreeBuildingKernel<<<blocks * FACTOR2, THREADS2, 0, stream>>>(
-      /*errl,*/ childl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n, maxdepthd.data(), bottomd.data(),
-      radiusd.data());
+      /*errl,*/ childl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n,
+      maxdepthd.data(), bottomd.data(), radiusd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(TreeBuildingKernel_time);
 
     START_TIMER;
-    TSNE::ClearKernel2<<<blocks * 1, 1024, 0, stream>>>(startl.data(), massl.data(), NNODES,
-                                                        bottomd.data());
+    TSNE::ClearKernel2<<<blocks * 1, 1024, 0, stream>>>(
+      startl.data(), massl.data(), NNODES, bottomd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(ClearKernel2_time);
 
     START_TIMER;
     TSNE::SummarizationKernel<<<blocks * FACTOR3, THREADS3, 0, stream>>>(
-      countl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n, bottomd.data());
+      countl.data(), childl.data(), massl.data(), YY.data(),
+      YY.data() + nnodes + 1, NNODES, n, bottomd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(SummarizationKernel_time);
 
     START_TIMER;
     TSNE::SortKernel<<<blocks * FACTOR4, THREADS4, 0, stream>>>(
-      sortl.data(), countl.data(), startl.data(), childl.data(), NNODES, n, bottomd.data());
+      sortl.data(), countl.data(), startl.data(), childl.data(), NNODES, n,
+      bottomd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(SortKernel_time);
 
     START_TIMER;
     TSNE::RepulsionKernel<<<blocks * FACTOR5, THREADS5, 0, stream>>>(
-      /*errl,*/ theta, epssq, sortl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1,
-      rep_forces.data(), rep_forces.data() + nnodes + 1, Z_norm.data(), theta_squared, NNODES,
+      /*errl,*/ theta, epssq, sortl.data(), childl.data(), massl.data(),
+      YY.data(), YY.data() + nnodes + 1, rep_forces.data(),
+      rep_forces.data() + nnodes + 1, Z_norm.data(), theta_squared, NNODES,
       FOUR_NNODES, n, radiusd_squared.data(), maxdepthd.data());
     CUDA_CHECK(cudaPeekAtLastError());
 
@@ -239,24 +247,30 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
     // For general embedding dimensions
     TSNE::
       attractive_kernel_bh<<<MLCommon::ceildiv(NNZ, 1024), 1024, 0, stream>>>(
-        VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(), norm_add1.data(), attr_forces.data(),
-        attr_forces.data() + n, NNZ);
+        VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(),
+        norm_add1.data(), attr_forces.data(), attr_forces.data() + n, NNZ);
     CUDA_CHECK(cudaPeekAtLastError());
     END_TIMER(attractive_time);
 
     START_TIMER;
     TSNE::IntegrationKernel<<<blocks * FACTOR6, THREADS6, 0, stream>>>(
-      learning_rate, momentum, early_exaggeration, YY.data(), YY.data() + nnodes + 1,
-      attr_forces.data(), attr_forces.data() + n, rep_forces.data(), rep_forces.data() + nnodes + 1,
-      gains_bh.data(), gains_bh.data() + n, old_forces.data(), old_forces.data() + n, Z_norm.data(), n);
+      learning_rate, momentum, early_exaggeration, YY.data(),
+      YY.data() + nnodes + 1, attr_forces.data(), attr_forces.data() + n,
+      rep_forces.data(), rep_forces.data() + nnodes + 1, gains_bh.data(),
+      gains_bh.data() + n, old_forces.data(), old_forces.data() + n,
+      Z_norm.data(), n);
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(IntegrationKernel_time);
 
-    h_chk_nan = thrust::transform_reduce(thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2,
-                                         NaNTestKernel(), 0, thrust::plus<bool>());
+    h_chk_nan = thrust::transform_reduce(
+      thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2,
+      NaNTestKernel(), 0, thrust::plus<bool>());
     if (h_chk_nan) {
-      CUML_LOG_DEBUG("NaN result detected during Barnes Hut iteration: %d, returning last known good positions.", iter);
+      CUML_LOG_DEBUG(
+        "NaN result detected during Barnes Hut iteration: %d, returning last "
+        "known good positions.",
+        iter);
       MLCommon::copy(YY.data(), YY_prev.data(), (nnodes + 1) * 2, stream);
       break;
     }
diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index 53ae0b0e84..de1cbc2920 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -43,11 +43,8 @@ namespace TSNE {
 /**
  * Unary op intended to be used as a Thrust transform, check if array elements are NaNs.
  */
-struct NaNTestKernel{
-  __host__ __device__ bool operator()(const float a) const
-  {
-    return isnan(a);
-  }
+struct NaNTestKernel {
+  __host__ __device__ bool operator()(const float a) const { return isnan(a); }
 };
 
 /**
@@ -209,7 +206,7 @@ __global__ __launch_bounds__(
   const int inc = blockDim.x * gridDim.x;
   int i = threadIdx.x + blockIdx.x * blockDim.x;
 
-  // Iterate over all bodies assigned to thread
+  // iterate over all bodies assigned to thread
   while (i < N) {
     if (skip != 0) {
       // new body, so start traversing at root
@@ -228,7 +225,7 @@ __global__ __launch_bounds__(
       y = rooty + ((rooty < (py = posyd[i])) ? (j |= 2, r) : (-r));
     }
 
-    // Follow path to leaf cell
+    // follow path to leaf cell
     while ((ch = childd[n * 4 + j]) >= N) {
       n = ch;
       depth++;
@@ -241,8 +238,10 @@ __global__ __launch_bounds__(
 
     // (ch)ild will be '-1' (nullptr), '-2' (locked), or an Integer corresponding to a body offset
     // in the lower [0, N) blocks of childd
-    if (ch != -2) { // skip if child pointer was locked when we examined it, and try again later.
-      locked = n * 4 + j; // store the locked position in case we need to patch in a cell later.
+    if (ch != -2) {
+      // skip if child pointer was locked when we examined it, and try again later.
+      locked = n * 4 + j;
+      // store the locked position in case we need to patch in a cell later.
 
       if (ch == -1) {
         // Child is a nullptr ('-1'), so we write our body index to the leaf, and move on to the next body.
@@ -263,22 +262,17 @@ __global__ __launch_bounds__(
             const int cell = atomicSub(bottomd, 1) - 1;
             if (cell == N) {
               // atomicExch(errd, 1);
-              int prev_bottom = atomicExch(bottomd, NNODES);
+              atomicExch(bottomd, NNODES);
               //printf("Cell Allocation Overflow, depth=%d, r=%f, rbound=%f, N=%d, NNODES=%d, bottomd=%d\n",
               //    depth, r, radius, N, NNODES, prev_bottom);
-            }
-            else if (cell < N) {
+            } else if (cell < N) {
               depth--;
               continue;
             }
 
-            if (patch != -1) {
-              childd[n * 4 + j] = cell;
-            }
+            if (patch != -1) childd[n * 4 + j] = cell;
 
-            if (cell > patch) {
-              patch = cell;
-            }
+            if (cell > patch) patch = cell;
 
             // Insert migrated child node
             j = (x < posxd[ch]) ? 1 : 0;
@@ -288,12 +282,10 @@ __global__ __launch_bounds__(
             n = cell;
             r *= 0.5f;
 
-
             x += ((x < px) ? (j = 1, r) : (j = 0, -r));
 
             y += ((y < py) ? (j |= 2, r) : (-r));
 
-            // Check the value of the j'th child at offset 'n' N <= n < 2*N, in childd
             ch = childd[n * 4 + j];
             if (r <= 1e-10) {
               break;
@@ -315,6 +307,7 @@ __global__ __launch_bounds__(
     if (skip == 2) childd[locked] = patch;
   }
 
+  // record maximum tree depth
   // if (localmaxdepth >= THREADS5)
   //   localmaxdepth = THREADS5 - 1;
   if (localmaxdepth > 32) localmaxdepth = 32;
diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh
index 552f8cc2ea..1a6dd50018 100644
--- a/cpp/src_prims/selection/knn.cuh
+++ b/cpp/src_prims/selection/knn.cuh
@@ -216,7 +216,6 @@ void brute_force_knn(std::vector<float *> &input, std::vector<int> &sizes,
   ASSERT(input.size() == sizes.size(),
          "input and sizes vectors should be the same size");
 
-
   faiss::MetricType m = build_faiss_metric(metric);
 
   std::vector<int64_t> *id_ranges;

From 06e5c0fb775b57d1baeb74e77994d1117d68a2fb Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Wed, 15 Jul 2020 16:03:00 -0600
Subject: [PATCH 03/14] Update to check for any non-finite values

---
 cpp/src/tsne/barnes_hut.cuh | 12 ++++++------
 cpp/src/tsne/bh_kernels.cuh |  6 ++++--
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index c9e15feec4..86e0ec2caa 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -87,7 +87,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   const int FOUR_N = 4 * n;
   const float theta_squared = theta * theta;
   const int NNODES = nnodes;
-  bool h_chk_nan;
+  bool h_chk_finite;
 
   // Actual mallocs
   device_buffer<int> startl(d_alloc, stream, nnodes + 1);
@@ -263,12 +263,12 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
 
     END_TIMER(IntegrationKernel_time);
 
-    h_chk_nan = thrust::transform_reduce(
+    h_chk_finite = thrust::transform_reduce(
       thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2,
-      NaNTestKernel(), 0, thrust::plus<bool>());
-    if (h_chk_nan) {
-      CUML_LOG_DEBUG(
-        "NaN result detected during Barnes Hut iteration: %d, returning last "
+      FiniteTestUnary(), 0, thrust::plus<bool>());
+    if (h_chk_finite) {
+      CUML_LOG_WARN(
+        "Non-finite result detected during Barnes Hut iteration: %d, returning last "
         "known good positions.",
         iter);
       MLCommon::copy(YY.data(), YY_prev.data(), (nnodes + 1) * 2, stream);
diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index de1cbc2920..3a811be7cd 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -40,13 +40,15 @@
 namespace ML {
 namespace TSNE {
 
+
 /**
  * Unary op intended to be used as a Thrust transform, check if array elements are NaNs.
  */
-struct NaNTestKernel {
-  __host__ __device__ bool operator()(const float a) const { return isnan(a); }
+struct FiniteTestUnary {
+  __host__ __device__ bool operator()(const float x) const { return !isfinite(x); }
 };
 
+
 /**
  * Intializes the states of objects. This speeds the overall kernel up.
  */

From d78833d1a966933c9cb4c5548129bf87bd0238fa Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Wed, 15 Jul 2020 16:05:42 -0600
Subject: [PATCH 04/14] Description tweak

---
 cpp/src/tsne/bh_kernels.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index 3a811be7cd..60749196e5 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -42,7 +42,7 @@ namespace TSNE {
 
 
 /**
- * Unary op intended to be used as a Thrust transform, check if array elements are NaNs.
+ * Unary op intended to be used as a Thrust transform, check if array elements non-finite.
  */
 struct FiniteTestUnary {
   __host__ __device__ bool operator()(const float x) const { return !isfinite(x); }

From a396a5273ee1e7d796c19a7f7909848603824e33 Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Wed, 15 Jul 2020 16:10:44 -0600
Subject: [PATCH 05/14] PR # update

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a81c93425a..356af27ced 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -83,7 +83,7 @@
 - PR #2535: Fix issue with incorrect docker image being used in local build script
 - PR #2542: Fix small memory leak in TSNE
 - PR #2552: Fixed the length argument of updateDevice calls in RF test
-- PR #2358: Fix cell allocation code to avoid loops in quad-tree. Prevent NaNs causing infinite descent
+- PR #2565: Fix cell allocation code to avoid loops in quad-tree. Prevent NaNs causing infinite descent
 
 # cuML 0.14.0 (03 Jun 2020)
 

From 6167b6c1b116bbc470503ab788ba81ee6e505124 Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Wed, 22 Jul 2020 15:23:44 -0600
Subject: [PATCH 06/14] remove unnecessary pointer cast

---
 cpp/src/tsne/barnes_hut.cuh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index 86e0ec2caa..482c547a1a 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -277,9 +277,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   }
   PRINT_TIMES;
 
-  // Copy final YY into true output Y
-  thrust::device_ptr<float> Y_begin = thrust::device_pointer_cast(Y);
-
   MLCommon::copy(Y, YY.data(), n, stream);
   CUDA_CHECK(cudaPeekAtLastError());
 

From 9131614ffba3b2ed353d6eaa6ca71403632ce9b7 Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Thu, 23 Jul 2020 15:14:21 -0600
Subject: [PATCH 07/14] Add double precision fallback, and final failure work
 around

---
 cpp/src/tsne/barnes_hut.cuh | 86 ++-----------------------------------
 cpp/src/tsne/bh_kernels.cuh | 21 +++++----
 2 files changed, 13 insertions(+), 94 deletions(-)

diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index 69015c04b4..7c068f9437 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -71,18 +71,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   CUML_LOG_DEBUG("N_nodes = %d blocks = %d", nnodes, blocks);
 
   // Allocate more space
-  //---------------------------------------------------
-<<<<<<< HEAD
-  //int *errl = (int *)d_alloc->allocate(sizeof(int), stream);
-  device_buffer<unsigned> limiter(d_alloc, stream, 1);
-  device_buffer<int> maxdepthd(d_alloc, stream, 1);
-  device_buffer<int> bottomd(d_alloc, stream, 1);
-  device_buffer<float> radiusd(d_alloc, stream, 1);
-
-  TSNE::InitializationKernel<<<1, 1, 0, stream>>>(
-    /*errl,*/ limiter.data(), maxdepthd.data(), radiusd.data());
-
-=======
   // MLCommon::device_buffer<unsigned> errl(d_alloc, stream, 1);
   MLCommon::device_buffer<unsigned> limiter(d_alloc, stream, 1);
   MLCommon::device_buffer<int> maxdepthd(d_alloc, stream, 1);
@@ -93,57 +81,23 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
                                                   limiter.data(),
                                                   maxdepthd.data(),
                                                   radiusd.data());
->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52
   CUDA_CHECK(cudaPeekAtLastError());
 
   const int FOUR_NNODES = 4 * nnodes;
   const int FOUR_N = 4 * n;
   const float theta_squared = theta * theta;
   const int NNODES = nnodes;
-  bool h_chk_finite;
-
-<<<<<<< HEAD
-  // Actual mallocs
-  device_buffer<int> startl(d_alloc, stream, nnodes + 1);
-  device_buffer<int> childl(d_alloc, stream, (nnodes + 1) * 4);
-  device_buffer<float> massl(d_alloc, stream, nnodes + 1);
-=======
+
   // Actual allocations
   MLCommon::device_buffer<int> startl(d_alloc, stream, nnodes + 1);
   MLCommon::device_buffer<int> childl(d_alloc, stream, (nnodes + 1) * 4);
   MLCommon::device_buffer<float> massl(d_alloc, stream, nnodes + 1);
 
->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52
   thrust::device_ptr<float> begin_massl =
     thrust::device_pointer_cast(massl.data());
   thrust::fill(thrust::cuda::par.on(stream), begin_massl,
                begin_massl + (nnodes + 1), 1.0f);
 
-<<<<<<< HEAD
-  device_buffer<float> maxxl(d_alloc, stream, blocks * FACTOR1);
-  device_buffer<float> maxyl(d_alloc, stream, blocks * FACTOR1);
-  device_buffer<float> minxl(d_alloc, stream, blocks * FACTOR1);
-  device_buffer<float> minyl(d_alloc, stream, blocks * FACTOR1);
-
-  // SummarizationKernel
-  device_buffer<int> countl(d_alloc, stream, nnodes + 1);
-
-  // SortKernel
-  device_buffer<int> sortl(d_alloc, stream, nnodes + 1);
-
-  // RepulsionKernel
-  device_buffer<float> rep_forces(d_alloc, stream, (nnodes + 1) * 2);
-  device_buffer<float> attr_forces(d_alloc, stream, n * 2);
-
-  device_buffer<float> norm_add1(d_alloc, stream, n);
-  device_buffer<float> norm(d_alloc, stream, n);
-  device_buffer<float> Z_norm(d_alloc, stream, 1);
-
-  device_buffer<float> radiusd_squared(d_alloc, stream, 1);
-
-  // Apply
-  device_buffer<float> gains_bh(d_alloc, stream, n * 2);
-=======
   MLCommon::device_buffer<float> maxxl(d_alloc, stream, blocks * FACTOR1);
   MLCommon::device_buffer<float> maxyl(d_alloc, stream, blocks * FACTOR1);
   MLCommon::device_buffer<float> minxl(d_alloc, stream, blocks * FACTOR1);
@@ -168,25 +122,12 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
 
   // Apply
   MLCommon::device_buffer<float> gains_bh(d_alloc, stream, n * 2);
->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52
+
   thrust::device_ptr<float> begin_gains_bh =
     thrust::device_pointer_cast(gains_bh.data());
   thrust::fill(thrust::cuda::par.on(stream), begin_gains_bh,
                begin_gains_bh + (n * 2), 1.0f);
 
-<<<<<<< HEAD
-  device_buffer<float> old_forces(d_alloc, stream, n * 2);
-  CUDA_CHECK(
-    cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream));
-
-  device_buffer<float> YY(d_alloc, stream, (nnodes + 1) * 2);
-  device_buffer<float> YY_prev(d_alloc, stream, (nnodes + 1) * 2);
-  random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream,
-                random_state);
-  ASSERT(
-    YY.data() != NULL && YY_prev.data() != NULL && rep_forces.data() != NULL,
-    "[ERROR] Possibly no more memory");
-=======
   MLCommon::device_buffer<float> old_forces(d_alloc, stream, n * 2);
   CUDA_CHECK(
     cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream));
@@ -195,7 +136,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   // TODO bug #2549: this should be conditional on bool initialize_embeddings.
   random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream,
                 random_state);
->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52
 
   // Set cache levels for faster algorithm execution
   //---------------------------------------------------
@@ -216,21 +156,13 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   float learning_rate = pre_learning_rate;
 
   for (int iter = 0; iter < max_iter; iter++) {
-<<<<<<< HEAD
-    CUDA_CHECK(cudaMemsetAsync(rep_forces.data(), 0,
-                               sizeof(float) * (nnodes + 1) * 2, stream));
-    CUDA_CHECK(
-      cudaMemsetAsync(attr_forces.data(), 0, sizeof(float) * n * 2, stream));
-
-    MLCommon::copy(YY_prev.data(), YY.data(), (nnodes + 1) * 2, stream);
-=======
     CUDA_CHECK(cudaMemsetAsync(static_cast<void *>(rep_forces.data()), 0,
                                rep_forces.size() * sizeof(*rep_forces.data()),
                                stream));
     CUDA_CHECK(cudaMemsetAsync(static_cast<void *>(attr_forces.data()), 0,
                                attr_forces.size() * sizeof(*attr_forces.data()),
                                stream));
->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52
+
     TSNE::Reset_Normalization<<<1, 1, 0, stream>>>(
       Z_norm.data(), radiusd_squared.data(), bottomd.data(), NNODES,
       radiusd.data());
@@ -331,18 +263,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
     CUDA_CHECK(cudaPeekAtLastError());
 
     END_TIMER(IntegrationKernel_time);
-
-    h_chk_finite = thrust::transform_reduce(
-      thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2,
-      FiniteTestUnary(), 0, thrust::plus<bool>());
-    if (h_chk_finite) {
-      CUML_LOG_WARN(
-        "Non-finite result detected during Barnes Hut iteration: %d, returning last "
-        "known good positions.",
-        iter);
-      MLCommon::copy(YY.data(), YY_prev.data(), (nnodes + 1) * 2, stream);
-      break;
-    }
   }
   PRINT_TIMES;
 
diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index 60749196e5..03355a5f87 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -41,14 +41,6 @@ namespace ML {
 namespace TSNE {
 
 
-/**
- * Unary op intended to be used as a Thrust transform, check if array elements non-finite.
- */
-struct FiniteTestUnary {
-  __host__ __device__ bool operator()(const float x) const { return !isfinite(x); }
-};
-
-
 /**
  * Intializes the states of objects. This speeds the overall kernel up.
  */
@@ -670,12 +662,19 @@ __global__ void attractive_kernel_bh(
   if (index >= NNZ) return;
   const int i = ROW[index];
   const int j = COL[index];
+  float PQ;
 
   // TODO: Calculate Kullback-Leibler divergence
   // TODO: Convert attractive forces to CSR format
-  const float PQ = __fdividef(
-    VAL[index],
-    norm_add1[i] + norm[j] - 2.0f * (Y1[i] * Y1[j] + Y2[i] * Y2[j]));  // P*Q
+  // Try single precision compute first
+  float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) + __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]);
+
+  if (denominator == 0) {
+    /* repeat with double precision */
+    double dbl_denominator = __fma_rn(-2.0f, Y1[i] * Y1[j], norm_add1[i]) + __fma_rn(-2.0f, Y2[i] * Y2[j], norm[j]);
+    denominator = (dbl_denominator != 0) ? static_cast<float>(dbl_denominator) : FLT_EPSILON;
+  }
+  PQ = __fdividef(VAL[index], denominator);
 
   // Apply forces
   atomicAdd(&attract1[i], PQ * (Y1[i] - Y1[j]));

From ea794b486ae03c75ea3d1b358df6defd8325bad0 Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Mon, 27 Jul 2020 16:34:38 -0600
Subject: [PATCH 08/14] Update attractive kernel to use a double fallback, or
 clamp to 1 if that fails

---
 cpp/src/tsne/barnes_hut.cuh |  4 ----
 cpp/src/tsne/bh_kernels.cuh | 22 +++++++++++++++-------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index 7c068f9437..f71bcb967a 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -56,7 +56,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
                 const int max_iter = 1000, const float min_grad_norm = 1e-7,
                 const float pre_momentum = 0.5, const float post_momentum = 0.8,
                 const long long random_state = -1) {
-  using MLCommon::device_buffer;
   auto d_alloc = handle.getDeviceAllocator();
   cudaStream_t stream = handle.getStream();
 
@@ -267,10 +266,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   PRINT_TIMES;
 
   MLCommon::copy(Y, YY.data(), n, stream);
-  CUDA_CHECK(cudaPeekAtLastError());
-
   MLCommon::copy(Y + n, YY.data() + nnodes + 1, n, stream);
-  CUDA_CHECK(cudaPeekAtLastError());
 }
 
 }  // namespace TSNE
diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index 03355a5f87..b79b34687e 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -35,7 +35,6 @@
 
 #include <common/cudart_utils.h>
 #include <float.h>
-#include <math.h>
 
 namespace ML {
 namespace TSNE {
@@ -662,19 +661,28 @@ __global__ void attractive_kernel_bh(
   if (index >= NNZ) return;
   const int i = ROW[index];
   const int j = COL[index];
-  float PQ;
 
   // TODO: Calculate Kullback-Leibler divergence
   // TODO: Convert attractive forces to CSR format
   // Try single precision compute first
   float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) + __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]);
 
-  if (denominator == 0) {
-    /* repeat with double precision */
-    double dbl_denominator = __fma_rn(-2.0f, Y1[i] * Y1[j], norm_add1[i]) + __fma_rn(-2.0f, Y2[i] * Y2[j], norm[j]);
-    denominator = (dbl_denominator != 0) ? static_cast<float>(dbl_denominator) : FLT_EPSILON;
+  if (__builtin_expect(denominator == 0, false)) {
+    double _Y1 = static_cast<double>(Y1[i] * Y1[j]);
+    double _Y2 = static_cast<double>(Y2[i] * Y2[j]);
+    double dbl_denominator = __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]);
+
+    if (__builtin_expect(dbl_denominator == 0, false)) {
+      printf("Detected zero in denominator with __fma_rn(-2.0f, %lf, %lf) + __fma_rn(-2.0f, %lf, %lf)\n",
+          _Y1, norm_add1[i], _Y2, norm[j]);
+
+      dbl_denominator = 1.0f;
+      printf("dbl_denominator: %lf\n", dbl_denominator);
+    }
+
+    denominator = dbl_denominator;
   }
-  PQ = __fdividef(VAL[index], denominator);
+  const float PQ = __fdividef(VAL[index], denominator);
 
   // Apply forces
   atomicAdd(&attract1[i], PQ * (Y1[i] - Y1[j]));

From 6cf29f9bf2cf43dcb9bfc698749c43030927aaf4 Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Mon, 27 Jul 2020 17:21:51 -0600
Subject: [PATCH 09/14] Disable debug prints, add error checking/logging +
 early return from barnes hut

---
 cpp/src/tsne/barnes_hut.cuh | 14 ++++++++++++--
 cpp/src/tsne/bh_kernels.cuh | 14 +++++++++-----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index f71bcb967a..f89d41df93 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -62,6 +62,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   // Get device properites
   //---------------------------------------------------
   const int blocks = MLCommon::getMultiProcessorCount();
+  int h_flag;
 
   int nnodes = n * 2;
   if (nnodes < 1024 * blocks) nnodes = 1024 * blocks;
@@ -75,11 +76,13 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   MLCommon::device_buffer<int> maxdepthd(d_alloc, stream, 1);
   MLCommon::device_buffer<int> bottomd(d_alloc, stream, 1);
   MLCommon::device_buffer<float> radiusd(d_alloc, stream, 1);
+  MLCommon::device_buffer<int> flag_unstable_computation(d_alloc, stream, 1);
 
   TSNE::InitializationKernel<<<1, 1, 0, stream>>>(/*errl.data(),*/
                                                   limiter.data(),
                                                   maxdepthd.data(),
-                                                  radiusd.data());
+                                                  radiusd.data(),
+                                                  flag_unstable_computation.data());
   CUDA_CHECK(cudaPeekAtLastError());
 
   const int FOUR_NNODES = 4 * nnodes;
@@ -248,10 +251,17 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
     TSNE::
       attractive_kernel_bh<<<MLCommon::ceildiv(NNZ, 1024), 1024, 0, stream>>>(
         VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(),
-        norm_add1.data(), attr_forces.data(), attr_forces.data() + n, NNZ);
+        norm_add1.data(), attr_forces.data(), attr_forces.data() + n, NNZ, flag_unstable_computation.data());
     CUDA_CHECK(cudaPeekAtLastError());
     END_TIMER(attractive_time);
 
+    MLCommon::copy(&h_flag, flag_unstable_computation.data(), 1, stream);
+    if (h_flag) {
+      CUML_LOG_ERROR("Detected zero divisor in attractive force kernel, returning early."
+                    " Results may not be accurate.");
+      break;
+    }
+
     START_TIMER;
     TSNE::IntegrationKernel<<<blocks * FACTOR6, THREADS6, 0, stream>>>(
       learning_rate, momentum, early_exaggeration, YY.data(),
diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index b79b34687e..42fe1a5a80 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -46,11 +46,13 @@ namespace TSNE {
 __global__ void InitializationKernel(/*int *restrict errd, */
                                      unsigned *restrict limiter,
                                      int *restrict maxdepthd,
-                                     float *restrict radiusd) {
+                                     float *restrict radiusd,
+                                     int *restrict flag_unstable_computation) {
   // errd[0] = 0;
   maxdepthd[0] = 1;
   limiter[0] = 0;
   radiusd[0] = 0.0f;
+  flag_unstable_computation[0] = 0;
 }
 
 /**
@@ -656,7 +658,8 @@ __global__ void attractive_kernel_bh(
   const float *restrict VAL, const int *restrict COL, const int *restrict ROW,
   const float *restrict Y1, const float *restrict Y2,
   const float *restrict norm, const float *restrict norm_add1,
-  float *restrict attract1, float *restrict attract2, const int NNZ) {
+  float *restrict attract1, float *restrict attract2, const int NNZ,
+  int *restrict flag_unstable_computation) {
   const int index = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (index >= NNZ) return;
   const int i = ROW[index];
@@ -673,11 +676,12 @@ __global__ void attractive_kernel_bh(
     double dbl_denominator = __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]);
 
     if (__builtin_expect(dbl_denominator == 0, false)) {
-      printf("Detected zero in denominator with __fma_rn(-2.0f, %lf, %lf) + __fma_rn(-2.0f, %lf, %lf)\n",
-          _Y1, norm_add1[i], _Y2, norm[j]);
+      //printf("Detected zero in attractive force kernel denominator with __fma_rn(-2.0f, %lf, %lf) + "
+      //       "__fma_rn(-2.0f, %lf, %lf)\nClamping denominator to 1.0: tSNE results may not be accurate.",
+      //    _Y1, norm_add1[i], _Y2, norm[j]);
 
       dbl_denominator = 1.0f;
-      printf("dbl_denominator: %lf\n", dbl_denominator);
+      flag_unstable_computation[0] = 1;
     }
 
     denominator = dbl_denominator;

From 8cedd0923c1b25736b971abff2b62f1b2609d061 Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Mon, 27 Jul 2020 19:19:29 -0600
Subject: [PATCH 10/14] Formatting updates

---
 cpp/src/tsne/barnes_hut.cuh | 11 +++++++----
 cpp/src/tsne/bh_kernels.cuh |  7 ++++---
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index ccfab7121a..ec53b92c03 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -82,7 +82,8 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
                                                   limiter.data(),
                                                   maxdepthd.data(),
                                                   radiusd.data(),
-                                                  flag_unstable_computation.data());
+                                                  flag_unstable_computation
+                                                    .data());
   CUDA_CHECK(cudaPeekAtLastError());
 
   const int FOUR_NNODES = 4 * nnodes;
@@ -250,14 +251,16 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
     TSNE::
       attractive_kernel_bh<<<MLCommon::ceildiv(NNZ, 1024), 1024, 0, stream>>>(
         VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(),
-        attr_forces.data(), attr_forces.data() + n, NNZ, flag_unstable_computation.data());
+        attr_forces.data(), attr_forces.data() + n, NNZ,
+        flag_unstable_computation.data());
     CUDA_CHECK(cudaPeekAtLastError());
     END_TIMER(attractive_time);
 
     MLCommon::copy(&h_flag, flag_unstable_computation.data(), 1, stream);
     if (h_flag) {
-      CUML_LOG_ERROR("Detected zero divisor in attractive force kernel, returning early."
-                    " Results may not be accurate.");
+      CUML_LOG_ERROR(
+        "Detected zero divisor in attractive force kernel, returning early."
+        " Results may not be accurate.");
       break;
     }
 
diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index 722b13b0e2..0302afc5a4 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -39,7 +39,6 @@
 namespace ML {
 namespace TSNE {
 
-
 /**
  * Intializes the states of objects. This speeds the overall kernel up.
  */
@@ -666,12 +665,14 @@ __global__ void attractive_kernel_bh(
   // TODO: Calculate Kullback-Leibler divergence
   // TODO: Convert attractive forces to CSR format
   // Try single precision compute first
-  float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) + __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]);
+  float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) +
+                      __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]);
 
   if (__builtin_expect(denominator == 0, false)) {
     double _Y1 = static_cast<double>(Y1[i] * Y1[j]);
     double _Y2 = static_cast<double>(Y2[i] * Y2[j]);
-    double dbl_denominator = __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]);
+    double dbl_denominator =
+      __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]);
 
     if (__builtin_expect(dbl_denominator == 0, false)) {
       //printf("Detected zero in attractive force kernel denominator with __fma_rn(-2.0f, %lf, %lf) + "

From 17f9529b73ee53cc24279bd3bb053723569e7b24 Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Mon, 27 Jul 2020 19:28:05 -0600
Subject: [PATCH 11/14] Merge fix

---
 cpp/src/tsne/bh_kernels.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index 0302afc5a4..4278a30b37 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -672,7 +672,7 @@ __global__ void attractive_kernel_bh(
     double _Y1 = static_cast<double>(Y1[i] * Y1[j]);
     double _Y2 = static_cast<double>(Y2[i] * Y2[j]);
     double dbl_denominator =
-      __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]);
+      __fma_rn(-2.0f, _Y1, norm[i] + 1.0f) + __fma_rn(-2.0f, _Y2, norm[j]);
 
     if (__builtin_expect(dbl_denominator == 0, false)) {
       //printf("Detected zero in attractive force kernel denominator with __fma_rn(-2.0f, %lf, %lf) + "

From 9fec2631f54186b6362df017dfc6e75d841efdec Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Mon, 27 Jul 2020 20:45:06 -0600
Subject: [PATCH 12/14] Fix typo

---
 cpp/src/tsne/bh_kernels.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index 4278a30b37..bc80238b2f 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -665,7 +665,7 @@ __global__ void attractive_kernel_bh(
   // TODO: Calculate Kullback-Leibler divergence
   // TODO: Convert attractive forces to CSR format
   // Try single precision compute first
-  float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) +
+  float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm[i] + 1.0f) +
                       __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]);
 
   if (__builtin_expect(denominator == 0, false)) {

From 9af3ca8ddfe2fdda5118d043c2272c185c664514 Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Tue, 28 Jul 2020 11:42:44 -0600
Subject: [PATCH 13/14] Improve error description, fix double-fallback multiply

---
 cpp/src/tsne/barnes_hut.cuh |  7 +++++--
 cpp/src/tsne/bh_kernels.cuh | 11 ++---------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index ec53b92c03..83efca82d3 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -259,8 +259,10 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
     MLCommon::copy(&h_flag, flag_unstable_computation.data(), 1, stream);
     if (h_flag) {
       CUML_LOG_ERROR(
-        "Detected zero divisor in attractive force kernel, returning early."
-        " Results may not be accurate.");
+        "Detected zero divisor in attractive force kernel after '%d' iterations;"
+        " returning early. Your final results may not be accurate. In some cases"
+        " this error can be resolved by increasing perplexity, and n_neighbors;"
+        " if the problem persists, please use 'method=exact'.", iter);
       break;
     }
 
@@ -277,6 +279,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
   }
   PRINT_TIMES;
 
+  // Copy final YY into true output Y
   MLCommon::copy(Y, YY.data(), n, stream);
   MLCommon::copy(Y + n, YY.data() + nnodes + 1, n, stream);
 }
diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh
index bc80238b2f..c4ca3b657d 100644
--- a/cpp/src/tsne/bh_kernels.cuh
+++ b/cpp/src/tsne/bh_kernels.cuh
@@ -255,10 +255,7 @@ __global__ __launch_bounds__(
 
             const int cell = atomicSub(bottomd, 1) - 1;
             if (cell == N) {
-              // atomicExch(errd, 1);
               atomicExch(bottomd, NNODES);
-              //printf("Cell Allocation Overflow, depth=%d, r=%f, rbound=%f, N=%d, NNODES=%d, bottomd=%d\n",
-              //    depth, r, radius, N, NNODES, prev_bottom);
             } else if (cell < N) {
               depth--;
               continue;
@@ -669,16 +666,12 @@ __global__ void attractive_kernel_bh(
                       __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]);
 
   if (__builtin_expect(denominator == 0, false)) {
-    double _Y1 = static_cast<double>(Y1[i] * Y1[j]);
-    double _Y2 = static_cast<double>(Y2[i] * Y2[j]);
+    double _Y1 = static_cast<double>(Y1[i]) * static_cast<double>(Y1[j]);
+    double _Y2 = static_cast<double>(Y2[i]) * static_cast<double>(Y2[j]);
     double dbl_denominator =
       __fma_rn(-2.0f, _Y1, norm[i] + 1.0f) + __fma_rn(-2.0f, _Y2, norm[j]);
 
     if (__builtin_expect(dbl_denominator == 0, false)) {
-      //printf("Detected zero in attractive force kernel denominator with __fma_rn(-2.0f, %lf, %lf) + "
-      //       "__fma_rn(-2.0f, %lf, %lf)\nClamping denominator to 1.0: tSNE results may not be accurate.",
-      //    _Y1, norm_add1[i], _Y2, norm[j]);
-
       dbl_denominator = 1.0f;
       flag_unstable_computation[0] = 1;
     }

From 911efccdec267af45f57767fcb00ea7de44f11d7 Mon Sep 17 00:00:00 2001
From: Devin Robison <drobison@nvidia.com>
Date: Tue, 28 Jul 2020 11:51:47 -0600
Subject: [PATCH 14/14] Style fix

---
 cpp/src/tsne/barnes_hut.cuh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh
index 83efca82d3..642339ccf4 100644
--- a/cpp/src/tsne/barnes_hut.cuh
+++ b/cpp/src/tsne/barnes_hut.cuh
@@ -259,10 +259,13 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ,
     MLCommon::copy(&h_flag, flag_unstable_computation.data(), 1, stream);
     if (h_flag) {
       CUML_LOG_ERROR(
-        "Detected zero divisor in attractive force kernel after '%d' iterations;"
-        " returning early. Your final results may not be accurate. In some cases"
+        "Detected zero divisor in attractive force kernel after '%d' "
+        "iterations;"
+        " returning early. Your final results may not be accurate. In some "
+        "cases"
         " this error can be resolved by increasing perplexity, and n_neighbors;"
-        " if the problem persists, please use 'method=exact'.", iter);
+        " if the problem persists, please use 'method=exact'.",
+        iter);
       break;
     }