From 59235bf537b2e477fd63c506aaf31a2b6402e08d Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Tue, 14 Jul 2020 15:45:26 -0600 Subject: [PATCH 01/14] WIP Test Commit --- cpp/src/tsne/barnes_hut.cuh | 170 ++++++++++++++------------------ cpp/src/tsne/bh_kernels.cuh | 61 +++++++++--- cpp/src/tsne/exact_kernels.cuh | 8 +- cpp/src_prims/selection/knn.cuh | 1 + 4 files changed, 125 insertions(+), 115 deletions(-) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index 56ea5b8a0a..c308da687b 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include "bh_kernels.cuh" #include "utils.cuh" @@ -55,6 +56,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, const int max_iter = 1000, const float min_grad_norm = 1e-7, const float pre_momentum = 0.5, const float post_momentum = 0.8, const long long random_state = -1) { + using MLCommon::device_buffer; auto d_alloc = handle.getDeviceAllocator(); cudaStream_t stream = handle.getStream(); @@ -71,71 +73,67 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, // Allocate more space //--------------------------------------------------- //int *errl = (int *)d_alloc->allocate(sizeof(int), stream); - unsigned *limiter = (unsigned *)d_alloc->allocate(sizeof(unsigned), stream); - int *maxdepthd = (int *)d_alloc->allocate(sizeof(int), stream); - int *bottomd = (int *)d_alloc->allocate(sizeof(int), stream); - float *radiusd = (float *)d_alloc->allocate(sizeof(float), stream); + device_buffer limiter(d_alloc, stream, 1); + device_buffer maxdepthd(d_alloc, stream, 1); + device_buffer bottomd(d_alloc, stream, 1); + device_buffer radiusd(d_alloc, stream, 1); + + TSNE::InitializationKernel<<<1, 1, 0, stream>>>(/*errl,*/ limiter.data(), maxdepthd.data(), + radiusd.data()); - TSNE::InitializationKernel<<<1, 1, 0, stream>>>(/*errl,*/ limiter, maxdepthd, - radiusd); CUDA_CHECK(cudaPeekAtLastError()); const int FOUR_NNODES = 4 * nnodes; const int FOUR_N = 4 * n; const float theta_squared = theta * theta; const int NNODES = nnodes; + bool h_chk_nan; // Actual mallocs - int *startl = (int *)d_alloc->allocate(sizeof(int) * (nnodes + 1), stream); - int *childl = - (int *)d_alloc->allocate(sizeof(int) * (nnodes + 1) * 4, stream); - float *massl = - (float *)d_alloc->allocate(sizeof(float) * (nnodes + 1), stream); - thrust::device_ptr begin_massl = thrust::device_pointer_cast(massl); + device_buffer startl(d_alloc, stream, nnodes +1); + device_buffer childl(d_alloc, stream, (nnodes + 1) * 4); + device_buffer massl(d_alloc, stream, nnodes + 1); + thrust::device_ptr begin_massl = thrust::device_pointer_cast(massl.data()); thrust::fill(thrust::cuda::par.on(stream), begin_massl, begin_massl + (nnodes + 1), 1.0f); - float *maxxl = - (float *)d_alloc->allocate(sizeof(float) * blocks * FACTOR1, stream); - float *maxyl = - (float *)d_alloc->allocate(sizeof(float) * blocks * FACTOR1, stream); - float *minxl = - (float *)d_alloc->allocate(sizeof(float) * blocks * FACTOR1, stream); - float *minyl = - (float *)d_alloc->allocate(sizeof(float) * blocks * FACTOR1, stream); + device_buffer maxxl(d_alloc, stream, blocks * FACTOR1); + device_buffer maxyl(d_alloc, stream, blocks * FACTOR1); + device_buffer minxl(d_alloc, stream, blocks * FACTOR1); + device_buffer minyl(d_alloc, stream, blocks * FACTOR1); // SummarizationKernel - int *countl = (int *)d_alloc->allocate(sizeof(int) * (nnodes + 1), stream); + device_buffer countl(d_alloc, stream, nnodes + 1); // SortKernel - int *sortl = (int *)d_alloc->allocate(sizeof(int) * (nnodes + 1), stream); + device_buffer sortl(d_alloc, stream, nnodes + 1); // RepulsionKernel - float *rep_forces = - (float *)d_alloc->allocate(sizeof(float) * (nnodes + 1) * 2, stream); - float *attr_forces = (float *)d_alloc->allocate( - sizeof(float) * n * 2, stream); // n*2 double for reduction sum + device_buffer rep_forces(d_alloc, stream, (nnodes + 1) * 2); + device_buffer attr_forces(d_alloc, stream, n * 2); - float *norm_add1 = (float *)d_alloc->allocate(sizeof(float) * n, stream); - float *norm = (float *)d_alloc->allocate(sizeof(float) * n, stream); - float *Z_norm = (float *)d_alloc->allocate(sizeof(float), stream); + device_buffer norm_add1(d_alloc, stream, n); + device_buffer norm(d_alloc, stream, n); + device_buffer Z_norm(d_alloc, stream, 1); - float *radiusd_squared = (float *)d_alloc->allocate(sizeof(float), stream); + device_buffer radiusd_squared(d_alloc, stream, 1); // Apply - float *gains_bh = (float *)d_alloc->allocate(sizeof(float) * n * 2, stream); + device_buffer gains_bh(d_alloc, stream, n * 2); thrust::device_ptr begin_gains_bh = - thrust::device_pointer_cast(gains_bh); + thrust::device_pointer_cast(gains_bh.data()); thrust::fill(thrust::cuda::par.on(stream), begin_gains_bh, begin_gains_bh + (n * 2), 1.0f); - float *old_forces = (float *)d_alloc->allocate(sizeof(float) * n * 2, stream); - CUDA_CHECK(cudaMemsetAsync(old_forces, 0, sizeof(float) * n * 2, stream)); + device_buffer old_forces(d_alloc, stream, n * 2); + CUDA_CHECK(cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream)); - float *YY = - (float *)d_alloc->allocate(sizeof(float) * (nnodes + 1) * 2, stream); - random_vector(YY, -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, random_state); - ASSERT(YY != NULL && rep_forces != NULL, "[ERROR] Possibly no more memory"); + device_buffer YY(d_alloc, stream, (nnodes + 1) * 2); + device_buffer YY_prev(d_alloc, stream, (nnodes + 1) * 2); + random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, random_state); + ASSERT(YY.data() != NULL + && YY_prev.data() != NULL + && rep_forces.data() != NULL, "[ERROR] Possibly no more memory"); // Set cache levels for faster algorithm execution //--------------------------------------------------- @@ -148,7 +146,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, cudaFuncSetCacheConfig(TSNE::RepulsionKernel, cudaFuncCachePreferL1); cudaFuncSetCacheConfig(TSNE::attractive_kernel_bh, cudaFuncCachePreferL1); cudaFuncSetCacheConfig(TSNE::IntegrationKernel, cudaFuncCachePreferL1); - // Do gradient updates //--------------------------------------------------- CUML_LOG_DEBUG("Start gradient updates!"); @@ -158,10 +155,13 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, for (int iter = 0; iter < max_iter; iter++) { CUDA_CHECK( - cudaMemsetAsync(rep_forces, 0, sizeof(float) * (nnodes + 1) * 2, stream)); - CUDA_CHECK(cudaMemsetAsync(attr_forces, 0, sizeof(float) * n * 2, stream)); - TSNE::Reset_Normalization<<<1, 1, 0, stream>>>(Z_norm, radiusd_squared, - bottomd, NNODES, radiusd); + cudaMemsetAsync(rep_forces.data(), 0, sizeof(float) * (nnodes + 1) * 2, stream)); + CUDA_CHECK(cudaMemsetAsync(attr_forces.data(), 0, sizeof(float) * n * 2, stream)); + + MLCommon::copy(YY_prev.data(), YY.data(), (nnodes + 1) * 2, stream); + TSNE::Reset_Normalization<<<1, 1, 0, stream>>>(Z_norm.data(), radiusd_squared.data(), + bottomd.data(), + NNODES, radiusd.data()); CUDA_CHECK(cudaPeekAtLastError()); if (iter == exaggeration_iter) { @@ -173,14 +173,14 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, START_TIMER; TSNE::BoundingBoxKernel<<>>( - startl, childl, massl, YY, YY + nnodes + 1, maxxl, maxyl, minxl, minyl, - FOUR_NNODES, NNODES, n, limiter, radiusd); + startl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, maxxl.data(), maxyl.data(), + minxl.data(), minyl.data(), FOUR_NNODES, NNODES, n, limiter.data(), radiusd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(BoundingBoxKernel_time); START_TIMER; - TSNE::ClearKernel1<<>>(childl, FOUR_NNODES, + TSNE::ClearKernel1<<>>(childl.data(), FOUR_NNODES, FOUR_N); CUDA_CHECK(cudaPeekAtLastError()); @@ -188,113 +188,89 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, START_TIMER; TSNE::TreeBuildingKernel<<>>( - /*errl,*/ childl, YY, YY + nnodes + 1, NNODES, n, maxdepthd, bottomd, - radiusd); + /*errl,*/ childl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n, maxdepthd.data(), bottomd.data(), + radiusd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(TreeBuildingKernel_time); START_TIMER; - TSNE::ClearKernel2<<>>(startl, massl, NNODES, - bottomd); + TSNE::ClearKernel2<<>>(startl.data(), massl.data(), NNODES, + bottomd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(ClearKernel2_time); START_TIMER; TSNE::SummarizationKernel<<>>( - countl, childl, massl, YY, YY + nnodes + 1, NNODES, n, bottomd); + countl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n, bottomd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(SummarizationKernel_time); START_TIMER; TSNE::SortKernel<<>>( - sortl, countl, startl, childl, NNODES, n, bottomd); + sortl.data(), countl.data(), startl.data(), childl.data(), NNODES, n, bottomd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(SortKernel_time); START_TIMER; TSNE::RepulsionKernel<<>>( - /*errl,*/ theta, epssq, sortl, childl, massl, YY, YY + nnodes + 1, - rep_forces, rep_forces + nnodes + 1, Z_norm, theta_squared, NNODES, - FOUR_NNODES, n, radiusd_squared, maxdepthd); + /*errl,*/ theta, epssq, sortl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, + rep_forces.data(), rep_forces.data() + nnodes + 1, Z_norm.data(), theta_squared, NNODES, + FOUR_NNODES, n, radiusd_squared.data(), maxdepthd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(RepulsionTime); START_TIMER; - TSNE::Find_Normalization<<<1, 1, 0, stream>>>(Z_norm, n); + TSNE::Find_Normalization<<<1, 1, 0, stream>>>(Z_norm.data(), n); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(Reduction_time); START_TIMER; TSNE::get_norm<<>>( - YY, YY + nnodes + 1, norm, norm_add1, n); + YY.data(), YY.data() + nnodes + 1, norm.data(), norm_add1.data(), n); CUDA_CHECK(cudaPeekAtLastError()); // TODO: Calculate Kullback-Leibler divergence // For general embedding dimensions TSNE:: attractive_kernel_bh<<>>( - VAL, COL, ROW, YY, YY + nnodes + 1, norm, norm_add1, attr_forces, - attr_forces + n, NNZ); + VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(), norm_add1.data(), attr_forces.data(), + attr_forces.data() + n, NNZ); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(attractive_time); START_TIMER; TSNE::IntegrationKernel<<>>( - learning_rate, momentum, early_exaggeration, YY, YY + nnodes + 1, - attr_forces, attr_forces + n, rep_forces, rep_forces + nnodes + 1, - gains_bh, gains_bh + n, old_forces, old_forces + n, Z_norm, n); + learning_rate, momentum, early_exaggeration, YY.data(), YY.data() + nnodes + 1, + attr_forces.data(), attr_forces.data() + n, rep_forces.data(), rep_forces.data() + nnodes + 1, + gains_bh.data(), gains_bh.data() + n, old_forces.data(), old_forces.data() + n, Z_norm.data(), n); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(IntegrationKernel_time); + + h_chk_nan = thrust::transform_reduce(thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2, + NaNTestKernel(), 0, thrust::plus()); + if (h_chk_nan) { + CUML_LOG_DEBUG("NaN result detected during Barnes Hut iteration: %d, returning last known good positions.", iter); + MLCommon::copy(YY.data(), YY_prev.data(), (nnodes + 1) * 2, stream); + break; + } } PRINT_TIMES; // Copy final YY into true output Y thrust::device_ptr Y_begin = thrust::device_pointer_cast(Y); - thrust::copy(thrust::cuda::par.on(stream), YY, YY + n, Y_begin); - CUDA_CHECK(cudaPeekAtLastError()); - thrust::copy(thrust::cuda::par.on(stream), YY + nnodes + 1, - YY + nnodes + 1 + n, Y_begin + n); + MLCommon::copy(Y, YY.data(), n, stream); CUDA_CHECK(cudaPeekAtLastError()); - // Deallocate everything - //d_alloc->deallocate(errl, sizeof(int), stream); - d_alloc->deallocate(limiter, sizeof(unsigned), stream); - d_alloc->deallocate(maxdepthd, sizeof(int), stream); - d_alloc->deallocate(bottomd, sizeof(int), stream); - d_alloc->deallocate(radiusd, sizeof(float), stream); - - d_alloc->deallocate(startl, sizeof(int) * (nnodes + 1), stream); - d_alloc->deallocate(childl, sizeof(int) * (nnodes + 1) * 4, stream); - d_alloc->deallocate(massl, sizeof(float) * (nnodes + 1), stream); - - d_alloc->deallocate(maxxl, sizeof(float) * blocks * FACTOR1, stream); - d_alloc->deallocate(maxyl, sizeof(float) * blocks * FACTOR1, stream); - d_alloc->deallocate(minxl, sizeof(float) * blocks * FACTOR1, stream); - d_alloc->deallocate(minyl, sizeof(float) * blocks * FACTOR1, stream); - - d_alloc->deallocate(countl, sizeof(int) * (nnodes + 1), stream); - d_alloc->deallocate(sortl, sizeof(int) * (nnodes + 1), stream); - - d_alloc->deallocate(rep_forces, sizeof(float) * (nnodes + 1) * 2, stream); - d_alloc->deallocate(attr_forces, sizeof(float) * n * 2, stream); - d_alloc->deallocate(norm, sizeof(float) * n, stream); - d_alloc->deallocate(norm_add1, sizeof(float) * n, stream); - - d_alloc->deallocate(Z_norm, sizeof(float), stream); - d_alloc->deallocate(radiusd_squared, sizeof(float), stream); - - d_alloc->deallocate(gains_bh, sizeof(float) * n * 2, stream); - d_alloc->deallocate(old_forces, sizeof(float) * n * 2, stream); - - d_alloc->deallocate(YY, sizeof(float) * (nnodes + 1) * 2, stream); + MLCommon::copy(Y + n, YY.data() + nnodes + 1, n, stream); + CUDA_CHECK(cudaPeekAtLastError()); } } // namespace TSNE diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index 27f27a5ca8..53ae0b0e84 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -40,6 +40,16 @@ namespace ML { namespace TSNE { +/** + * Unary op intended to be used as a Thrust transform, check if array elements are NaNs. + */ +struct NaNTestKernel{ + __host__ __device__ bool operator()(const float a) const + { + return isnan(a); + } +}; + /** * Intializes the states of objects. This speeds the overall kernel up. */ @@ -171,7 +181,8 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(int *restrict childd, } /** - * Build the actual KD Tree. + * Build the actual QuadTree. + * See: https://iss.oden.utexas.edu/Publications/Papers/burtscher11.pdf */ __global__ __launch_bounds__( THREADS2, FACTOR2) void TreeBuildingKernel(/* int *restrict errd, */ @@ -194,10 +205,11 @@ __global__ __launch_bounds__( int localmaxdepth = 1; int skip = 1; + const int inc = blockDim.x * gridDim.x; int i = threadIdx.x + blockIdx.x * blockDim.x; - // iterate over all bodies assigned to thread + // Iterate over all bodies assigned to thread while (i < N) { if (skip != 0) { // new body, so start traversing at root @@ -206,28 +218,34 @@ __global__ __launch_bounds__( depth = 1; r = radius * 0.5f; + /* Select child node 'j' + rootx < px rootx > px + * rooty < py 1 -> 3 0 -> 2 + * rooty > py 1 -> 1 0 -> 0 + */ x = rootx + ((rootx < (px = posxd[i])) ? (j = 1, r) : (j = 0, -r)); y = rooty + ((rooty < (py = posyd[i])) ? (j |= 2, r) : (-r)); } - // follow path to leaf cell + // Follow path to leaf cell while ((ch = childd[n * 4 + j]) >= N) { n = ch; depth++; r *= 0.5f; - // determine which child to follow x += ((x < px) ? (j = 1, r) : (j = 0, -r)); y += ((y < py) ? (j |= 2, r) : (-r)); } - if (ch != -2) { - // skip if child pointer is locked and try again later - locked = n * 4 + j; + // (ch)ild will be '-1' (nullptr), '-2' (locked), or an Integer corresponding to a body offset + // in the lower [0, N) blocks of childd + if (ch != -2) { // skip if child pointer was locked when we examined it, and try again later. + locked = n * 4 + j; // store the locked position in case we need to patch in a cell later. if (ch == -1) { + // Child is a nullptr ('-1'), so we write our body index to the leaf, and move on to the next body. if (atomicCAS(&childd[locked], -1, i) == -1) { if (depth > localmaxdepth) localmaxdepth = depth; @@ -235,23 +253,34 @@ __global__ __launch_bounds__( skip = 1; } } else { + // Child node isn't empty, so we store the current value of the child, lock the leaf, and patch in a new cell if (ch == atomicCAS(&childd[locked], ch, -2)) { - // try to lock patch = -1; while (ch >= 0) { depth++; const int cell = atomicSub(bottomd, 1) - 1; - if (cell <= N) { + if (cell == N) { // atomicExch(errd, 1); - atomicExch(bottomd, NNODES); + int prev_bottom = atomicExch(bottomd, NNODES); + //printf("Cell Allocation Overflow, depth=%d, r=%f, rbound=%f, N=%d, NNODES=%d, bottomd=%d\n", + // depth, r, radius, N, NNODES, prev_bottom); + } + else if (cell < N) { + depth--; + continue; } - if (patch != -1) childd[n * 4 + j] = cell; + if (patch != -1) { + childd[n * 4 + j] = cell; + } - if (cell > patch) patch = cell; + if (cell > patch) { + patch = cell; + } + // Insert migrated child node j = (x < posxd[ch]) ? 1 : 0; if (y < posyd[ch]) j |= 2; @@ -259,12 +288,16 @@ __global__ __launch_bounds__( n = cell; r *= 0.5f; + x += ((x < px) ? (j = 1, r) : (j = 0, -r)); y += ((y < py) ? (j |= 2, r) : (-r)); + // Check the value of the j'th child at offset 'n' N <= n < 2*N, in childd ch = childd[n * 4 + j]; - if (r <= 1e-10) break; + if (r <= 1e-10) { + break; + } } childd[n * 4 + j] = i; @@ -276,12 +309,12 @@ __global__ __launch_bounds__( } } } + __threadfence(); if (skip == 2) childd[locked] = patch; } - // record maximum tree depth // if (localmaxdepth >= THREADS5) // localmaxdepth = THREADS5 - 1; if (localmaxdepth > 32) localmaxdepth = 32; diff --git a/cpp/src/tsne/exact_kernels.cuh b/cpp/src/tsne/exact_kernels.cuh index b16ade068f..df0117f334 100644 --- a/cpp/src/tsne/exact_kernels.cuh +++ b/cpp/src/tsne/exact_kernels.cuh @@ -26,7 +26,7 @@ namespace ML { namespace TSNE { /****************************************/ -/* Finds the best guassian bandwith for +/* Finds the best Gaussian bandwidth for each row in the dataset */ __global__ void sigmas_kernel(const float *restrict distances, float *restrict P, const float perplexity, @@ -45,7 +45,7 @@ __global__ void sigmas_kernel(const float *restrict distances, for (int step = 0; step < epochs; step++) { float sum_Pi = FLT_EPSILON; - // Exponentiate to get guassian + // Exponentiate to get Gaussian for (int j = 0; j < k; j++) { P[ik + j] = __expf(-distances[ik + j] * beta); sum_Pi += P[ik + j]; @@ -84,7 +84,7 @@ __global__ void sigmas_kernel(const float *restrict distances, } /****************************************/ -/* Finds the best guassian bandwith for +/* Finds the best Gaussian bandwith for each row in the dataset */ __global__ void sigmas_kernel_2d(const float *restrict distances, float *restrict P, const float perplexity, @@ -101,7 +101,7 @@ __global__ void sigmas_kernel_2d(const float *restrict distances, register const int ik = i * 2; for (int step = 0; step < epochs; step++) { - // Exponentiate to get guassian + // Exponentiate to get Gaussian P[ik] = __expf(-distances[ik] * beta); P[ik + 1] = __expf(-distances[ik + 1] * beta); const float sum_Pi = FLT_EPSILON + P[ik] + P[ik + 1]; diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh index 841090d78c..9386b71f78 100644 --- a/cpp/src_prims/selection/knn.cuh +++ b/cpp/src_prims/selection/knn.cuh @@ -216,6 +216,7 @@ void brute_force_knn(std::vector &input, std::vector &sizes, ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size"); + faiss::MetricType m = build_faiss_metric(metric); std::vector *id_ranges; From 0d3884ee9abea1fd474948e66873e8cbc0ea208f Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Wed, 15 Jul 2020 13:51:31 -0600 Subject: [PATCH 02/14] Add clang formatting fixes --- CHANGELOG.md | 1 + cpp/src/tsne/barnes_hut.cuh | 78 +++++++++++++++++++-------------- cpp/src/tsne/bh_kernels.cuh | 33 ++++++-------- cpp/src_prims/selection/knn.cuh | 1 - 4 files changed, 60 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 901e75f46e..a81c93425a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -83,6 +83,7 @@ - PR #2535: Fix issue with incorrect docker image being used in local build script - PR #2542: Fix small memory leak in TSNE - PR #2552: Fixed the length argument of updateDevice calls in RF test +- PR #2358: Fix cell allocation code to avoid loops in quad-tree. Prevent NaNs causing infinite descent # cuML 0.14.0 (03 Jun 2020) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index c308da687b..c9e15feec4 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -78,8 +78,8 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, device_buffer bottomd(d_alloc, stream, 1); device_buffer radiusd(d_alloc, stream, 1); - TSNE::InitializationKernel<<<1, 1, 0, stream>>>(/*errl,*/ limiter.data(), maxdepthd.data(), - radiusd.data()); + TSNE::InitializationKernel<<<1, 1, 0, stream>>>( + /*errl,*/ limiter.data(), maxdepthd.data(), radiusd.data()); CUDA_CHECK(cudaPeekAtLastError()); @@ -90,10 +90,11 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, bool h_chk_nan; // Actual mallocs - device_buffer startl(d_alloc, stream, nnodes +1); + device_buffer startl(d_alloc, stream, nnodes + 1); device_buffer childl(d_alloc, stream, (nnodes + 1) * 4); device_buffer massl(d_alloc, stream, nnodes + 1); - thrust::device_ptr begin_massl = thrust::device_pointer_cast(massl.data()); + thrust::device_ptr begin_massl = + thrust::device_pointer_cast(massl.data()); thrust::fill(thrust::cuda::par.on(stream), begin_massl, begin_massl + (nnodes + 1), 1.0f); @@ -126,14 +127,16 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, begin_gains_bh + (n * 2), 1.0f); device_buffer old_forces(d_alloc, stream, n * 2); - CUDA_CHECK(cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream)); + CUDA_CHECK( + cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream)); device_buffer YY(d_alloc, stream, (nnodes + 1) * 2); device_buffer YY_prev(d_alloc, stream, (nnodes + 1) * 2); - random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, random_state); - ASSERT(YY.data() != NULL - && YY_prev.data() != NULL - && rep_forces.data() != NULL, "[ERROR] Possibly no more memory"); + random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, + random_state); + ASSERT( + YY.data() != NULL && YY_prev.data() != NULL && rep_forces.data() != NULL, + "[ERROR] Possibly no more memory"); // Set cache levels for faster algorithm execution //--------------------------------------------------- @@ -154,14 +157,15 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, float learning_rate = pre_learning_rate; for (int iter = 0; iter < max_iter; iter++) { + CUDA_CHECK(cudaMemsetAsync(rep_forces.data(), 0, + sizeof(float) * (nnodes + 1) * 2, stream)); CUDA_CHECK( - cudaMemsetAsync(rep_forces.data(), 0, sizeof(float) * (nnodes + 1) * 2, stream)); - CUDA_CHECK(cudaMemsetAsync(attr_forces.data(), 0, sizeof(float) * n * 2, stream)); + cudaMemsetAsync(attr_forces.data(), 0, sizeof(float) * n * 2, stream)); MLCommon::copy(YY_prev.data(), YY.data(), (nnodes + 1) * 2, stream); - TSNE::Reset_Normalization<<<1, 1, 0, stream>>>(Z_norm.data(), radiusd_squared.data(), - bottomd.data(), - NNODES, radiusd.data()); + TSNE::Reset_Normalization<<<1, 1, 0, stream>>>( + Z_norm.data(), radiusd_squared.data(), bottomd.data(), NNODES, + radiusd.data()); CUDA_CHECK(cudaPeekAtLastError()); if (iter == exaggeration_iter) { @@ -173,8 +177,9 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, START_TIMER; TSNE::BoundingBoxKernel<<>>( - startl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, maxxl.data(), maxyl.data(), - minxl.data(), minyl.data(), FOUR_NNODES, NNODES, n, limiter.data(), radiusd.data()); + startl.data(), childl.data(), massl.data(), YY.data(), + YY.data() + nnodes + 1, maxxl.data(), maxyl.data(), minxl.data(), + minyl.data(), FOUR_NNODES, NNODES, n, limiter.data(), radiusd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(BoundingBoxKernel_time); @@ -188,37 +193,40 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, START_TIMER; TSNE::TreeBuildingKernel<<>>( - /*errl,*/ childl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n, maxdepthd.data(), bottomd.data(), - radiusd.data()); + /*errl,*/ childl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n, + maxdepthd.data(), bottomd.data(), radiusd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(TreeBuildingKernel_time); START_TIMER; - TSNE::ClearKernel2<<>>(startl.data(), massl.data(), NNODES, - bottomd.data()); + TSNE::ClearKernel2<<>>( + startl.data(), massl.data(), NNODES, bottomd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(ClearKernel2_time); START_TIMER; TSNE::SummarizationKernel<<>>( - countl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, NNODES, n, bottomd.data()); + countl.data(), childl.data(), massl.data(), YY.data(), + YY.data() + nnodes + 1, NNODES, n, bottomd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(SummarizationKernel_time); START_TIMER; TSNE::SortKernel<<>>( - sortl.data(), countl.data(), startl.data(), childl.data(), NNODES, n, bottomd.data()); + sortl.data(), countl.data(), startl.data(), childl.data(), NNODES, n, + bottomd.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(SortKernel_time); START_TIMER; TSNE::RepulsionKernel<<>>( - /*errl,*/ theta, epssq, sortl.data(), childl.data(), massl.data(), YY.data(), YY.data() + nnodes + 1, - rep_forces.data(), rep_forces.data() + nnodes + 1, Z_norm.data(), theta_squared, NNODES, + /*errl,*/ theta, epssq, sortl.data(), childl.data(), massl.data(), + YY.data(), YY.data() + nnodes + 1, rep_forces.data(), + rep_forces.data() + nnodes + 1, Z_norm.data(), theta_squared, NNODES, FOUR_NNODES, n, radiusd_squared.data(), maxdepthd.data()); CUDA_CHECK(cudaPeekAtLastError()); @@ -239,24 +247,30 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, // For general embedding dimensions TSNE:: attractive_kernel_bh<<>>( - VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(), norm_add1.data(), attr_forces.data(), - attr_forces.data() + n, NNZ); + VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(), + norm_add1.data(), attr_forces.data(), attr_forces.data() + n, NNZ); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(attractive_time); START_TIMER; TSNE::IntegrationKernel<<>>( - learning_rate, momentum, early_exaggeration, YY.data(), YY.data() + nnodes + 1, - attr_forces.data(), attr_forces.data() + n, rep_forces.data(), rep_forces.data() + nnodes + 1, - gains_bh.data(), gains_bh.data() + n, old_forces.data(), old_forces.data() + n, Z_norm.data(), n); + learning_rate, momentum, early_exaggeration, YY.data(), + YY.data() + nnodes + 1, attr_forces.data(), attr_forces.data() + n, + rep_forces.data(), rep_forces.data() + nnodes + 1, gains_bh.data(), + gains_bh.data() + n, old_forces.data(), old_forces.data() + n, + Z_norm.data(), n); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(IntegrationKernel_time); - h_chk_nan = thrust::transform_reduce(thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2, - NaNTestKernel(), 0, thrust::plus()); + h_chk_nan = thrust::transform_reduce( + thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2, + NaNTestKernel(), 0, thrust::plus()); if (h_chk_nan) { - CUML_LOG_DEBUG("NaN result detected during Barnes Hut iteration: %d, returning last known good positions.", iter); + CUML_LOG_DEBUG( + "NaN result detected during Barnes Hut iteration: %d, returning last " + "known good positions.", + iter); MLCommon::copy(YY.data(), YY_prev.data(), (nnodes + 1) * 2, stream); break; } diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index 53ae0b0e84..de1cbc2920 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -43,11 +43,8 @@ namespace TSNE { /** * Unary op intended to be used as a Thrust transform, check if array elements are NaNs. */ -struct NaNTestKernel{ - __host__ __device__ bool operator()(const float a) const - { - return isnan(a); - } +struct NaNTestKernel { + __host__ __device__ bool operator()(const float a) const { return isnan(a); } }; /** @@ -209,7 +206,7 @@ __global__ __launch_bounds__( const int inc = blockDim.x * gridDim.x; int i = threadIdx.x + blockIdx.x * blockDim.x; - // Iterate over all bodies assigned to thread + // iterate over all bodies assigned to thread while (i < N) { if (skip != 0) { // new body, so start traversing at root @@ -228,7 +225,7 @@ __global__ __launch_bounds__( y = rooty + ((rooty < (py = posyd[i])) ? (j |= 2, r) : (-r)); } - // Follow path to leaf cell + // follow path to leaf cell while ((ch = childd[n * 4 + j]) >= N) { n = ch; depth++; @@ -241,8 +238,10 @@ __global__ __launch_bounds__( // (ch)ild will be '-1' (nullptr), '-2' (locked), or an Integer corresponding to a body offset // in the lower [0, N) blocks of childd - if (ch != -2) { // skip if child pointer was locked when we examined it, and try again later. - locked = n * 4 + j; // store the locked position in case we need to patch in a cell later. + if (ch != -2) { + // skip if child pointer was locked when we examined it, and try again later. + locked = n * 4 + j; + // store the locked position in case we need to patch in a cell later. if (ch == -1) { // Child is a nullptr ('-1'), so we write our body index to the leaf, and move on to the next body. @@ -263,22 +262,17 @@ __global__ __launch_bounds__( const int cell = atomicSub(bottomd, 1) - 1; if (cell == N) { // atomicExch(errd, 1); - int prev_bottom = atomicExch(bottomd, NNODES); + atomicExch(bottomd, NNODES); //printf("Cell Allocation Overflow, depth=%d, r=%f, rbound=%f, N=%d, NNODES=%d, bottomd=%d\n", // depth, r, radius, N, NNODES, prev_bottom); - } - else if (cell < N) { + } else if (cell < N) { depth--; continue; } - if (patch != -1) { - childd[n * 4 + j] = cell; - } + if (patch != -1) childd[n * 4 + j] = cell; - if (cell > patch) { - patch = cell; - } + if (cell > patch) patch = cell; // Insert migrated child node j = (x < posxd[ch]) ? 1 : 0; @@ -288,12 +282,10 @@ __global__ __launch_bounds__( n = cell; r *= 0.5f; - x += ((x < px) ? (j = 1, r) : (j = 0, -r)); y += ((y < py) ? (j |= 2, r) : (-r)); - // Check the value of the j'th child at offset 'n' N <= n < 2*N, in childd ch = childd[n * 4 + j]; if (r <= 1e-10) { break; @@ -315,6 +307,7 @@ __global__ __launch_bounds__( if (skip == 2) childd[locked] = patch; } + // record maximum tree depth // if (localmaxdepth >= THREADS5) // localmaxdepth = THREADS5 - 1; if (localmaxdepth > 32) localmaxdepth = 32; diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh index 552f8cc2ea..1a6dd50018 100644 --- a/cpp/src_prims/selection/knn.cuh +++ b/cpp/src_prims/selection/knn.cuh @@ -216,7 +216,6 @@ void brute_force_knn(std::vector &input, std::vector &sizes, ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size"); - faiss::MetricType m = build_faiss_metric(metric); std::vector *id_ranges; From 06e5c0fb775b57d1baeb74e77994d1117d68a2fb Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Wed, 15 Jul 2020 16:03:00 -0600 Subject: [PATCH 03/14] Update to check for any non-finite values --- cpp/src/tsne/barnes_hut.cuh | 12 ++++++------ cpp/src/tsne/bh_kernels.cuh | 6 ++++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index c9e15feec4..86e0ec2caa 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -87,7 +87,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, const int FOUR_N = 4 * n; const float theta_squared = theta * theta; const int NNODES = nnodes; - bool h_chk_nan; + bool h_chk_finite; // Actual mallocs device_buffer startl(d_alloc, stream, nnodes + 1); @@ -263,12 +263,12 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, END_TIMER(IntegrationKernel_time); - h_chk_nan = thrust::transform_reduce( + h_chk_finite = thrust::transform_reduce( thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2, - NaNTestKernel(), 0, thrust::plus()); - if (h_chk_nan) { - CUML_LOG_DEBUG( - "NaN result detected during Barnes Hut iteration: %d, returning last " + FiniteTestUnary(), 0, thrust::plus()); + if (h_chk_finite) { + CUML_LOG_WARN( + "Non-finite result detected during Barnes Hut iteration: %d, returning last " "known good positions.", iter); MLCommon::copy(YY.data(), YY_prev.data(), (nnodes + 1) * 2, stream); diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index de1cbc2920..3a811be7cd 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -40,13 +40,15 @@ namespace ML { namespace TSNE { + /** * Unary op intended to be used as a Thrust transform, check if array elements are NaNs. */ -struct NaNTestKernel { - __host__ __device__ bool operator()(const float a) const { return isnan(a); } +struct FiniteTestUnary { + __host__ __device__ bool operator()(const float x) const { return !isfinite(x); } }; + /** * Intializes the states of objects. This speeds the overall kernel up. */ From d78833d1a966933c9cb4c5548129bf87bd0238fa Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Wed, 15 Jul 2020 16:05:42 -0600 Subject: [PATCH 04/14] Description tweak --- cpp/src/tsne/bh_kernels.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index 3a811be7cd..60749196e5 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -42,7 +42,7 @@ namespace TSNE { /** - * Unary op intended to be used as a Thrust transform, check if array elements are NaNs. + * Unary op intended to be used as a Thrust transform, check if array elements non-finite. */ struct FiniteTestUnary { __host__ __device__ bool operator()(const float x) const { return !isfinite(x); } From a396a5273ee1e7d796c19a7f7909848603824e33 Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Wed, 15 Jul 2020 16:10:44 -0600 Subject: [PATCH 05/14] PR # update --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a81c93425a..356af27ced 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -83,7 +83,7 @@ - PR #2535: Fix issue with incorrect docker image being used in local build script - PR #2542: Fix small memory leak in TSNE - PR #2552: Fixed the length argument of updateDevice calls in RF test -- PR #2358: Fix cell allocation code to avoid loops in quad-tree. Prevent NaNs causing infinite descent +- PR #2565: Fix cell allocation code to avoid loops in quad-tree. Prevent NaNs causing infinite descent # cuML 0.14.0 (03 Jun 2020) From 6167b6c1b116bbc470503ab788ba81ee6e505124 Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Wed, 22 Jul 2020 15:23:44 -0600 Subject: [PATCH 06/14] remove unnecessary pointer cast --- cpp/src/tsne/barnes_hut.cuh | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index 86e0ec2caa..482c547a1a 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -277,9 +277,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, } PRINT_TIMES; - // Copy final YY into true output Y - thrust::device_ptr Y_begin = thrust::device_pointer_cast(Y); - MLCommon::copy(Y, YY.data(), n, stream); CUDA_CHECK(cudaPeekAtLastError()); From 9131614ffba3b2ed353d6eaa6ca71403632ce9b7 Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Thu, 23 Jul 2020 15:14:21 -0600 Subject: [PATCH 07/14] Add double precision fallback, and final failure work around --- cpp/src/tsne/barnes_hut.cuh | 86 ++----------------------------------- cpp/src/tsne/bh_kernels.cuh | 21 +++++---- 2 files changed, 13 insertions(+), 94 deletions(-) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index 69015c04b4..7c068f9437 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -71,18 +71,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, CUML_LOG_DEBUG("N_nodes = %d blocks = %d", nnodes, blocks); // Allocate more space - //--------------------------------------------------- -<<<<<<< HEAD - //int *errl = (int *)d_alloc->allocate(sizeof(int), stream); - device_buffer limiter(d_alloc, stream, 1); - device_buffer maxdepthd(d_alloc, stream, 1); - device_buffer bottomd(d_alloc, stream, 1); - device_buffer radiusd(d_alloc, stream, 1); - - TSNE::InitializationKernel<<<1, 1, 0, stream>>>( - /*errl,*/ limiter.data(), maxdepthd.data(), radiusd.data()); - -======= // MLCommon::device_buffer errl(d_alloc, stream, 1); MLCommon::device_buffer limiter(d_alloc, stream, 1); MLCommon::device_buffer maxdepthd(d_alloc, stream, 1); @@ -93,57 +81,23 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, limiter.data(), maxdepthd.data(), radiusd.data()); ->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52 CUDA_CHECK(cudaPeekAtLastError()); const int FOUR_NNODES = 4 * nnodes; const int FOUR_N = 4 * n; const float theta_squared = theta * theta; const int NNODES = nnodes; - bool h_chk_finite; - -<<<<<<< HEAD - // Actual mallocs - device_buffer startl(d_alloc, stream, nnodes + 1); - device_buffer childl(d_alloc, stream, (nnodes + 1) * 4); - device_buffer massl(d_alloc, stream, nnodes + 1); -======= + // Actual allocations MLCommon::device_buffer startl(d_alloc, stream, nnodes + 1); MLCommon::device_buffer childl(d_alloc, stream, (nnodes + 1) * 4); MLCommon::device_buffer massl(d_alloc, stream, nnodes + 1); ->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52 thrust::device_ptr begin_massl = thrust::device_pointer_cast(massl.data()); thrust::fill(thrust::cuda::par.on(stream), begin_massl, begin_massl + (nnodes + 1), 1.0f); -<<<<<<< HEAD - device_buffer maxxl(d_alloc, stream, blocks * FACTOR1); - device_buffer maxyl(d_alloc, stream, blocks * FACTOR1); - device_buffer minxl(d_alloc, stream, blocks * FACTOR1); - device_buffer minyl(d_alloc, stream, blocks * FACTOR1); - - // SummarizationKernel - device_buffer countl(d_alloc, stream, nnodes + 1); - - // SortKernel - device_buffer sortl(d_alloc, stream, nnodes + 1); - - // RepulsionKernel - device_buffer rep_forces(d_alloc, stream, (nnodes + 1) * 2); - device_buffer attr_forces(d_alloc, stream, n * 2); - - device_buffer norm_add1(d_alloc, stream, n); - device_buffer norm(d_alloc, stream, n); - device_buffer Z_norm(d_alloc, stream, 1); - - device_buffer radiusd_squared(d_alloc, stream, 1); - - // Apply - device_buffer gains_bh(d_alloc, stream, n * 2); -======= MLCommon::device_buffer maxxl(d_alloc, stream, blocks * FACTOR1); MLCommon::device_buffer maxyl(d_alloc, stream, blocks * FACTOR1); MLCommon::device_buffer minxl(d_alloc, stream, blocks * FACTOR1); @@ -168,25 +122,12 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, // Apply MLCommon::device_buffer gains_bh(d_alloc, stream, n * 2); ->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52 + thrust::device_ptr begin_gains_bh = thrust::device_pointer_cast(gains_bh.data()); thrust::fill(thrust::cuda::par.on(stream), begin_gains_bh, begin_gains_bh + (n * 2), 1.0f); -<<<<<<< HEAD - device_buffer old_forces(d_alloc, stream, n * 2); - CUDA_CHECK( - cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream)); - - device_buffer YY(d_alloc, stream, (nnodes + 1) * 2); - device_buffer YY_prev(d_alloc, stream, (nnodes + 1) * 2); - random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, - random_state); - ASSERT( - YY.data() != NULL && YY_prev.data() != NULL && rep_forces.data() != NULL, - "[ERROR] Possibly no more memory"); -======= MLCommon::device_buffer old_forces(d_alloc, stream, n * 2); CUDA_CHECK( cudaMemsetAsync(old_forces.data(), 0, sizeof(float) * n * 2, stream)); @@ -195,7 +136,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, // TODO bug #2549: this should be conditional on bool initialize_embeddings. random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, random_state); ->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52 // Set cache levels for faster algorithm execution //--------------------------------------------------- @@ -216,21 +156,13 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, float learning_rate = pre_learning_rate; for (int iter = 0; iter < max_iter; iter++) { -<<<<<<< HEAD - CUDA_CHECK(cudaMemsetAsync(rep_forces.data(), 0, - sizeof(float) * (nnodes + 1) * 2, stream)); - CUDA_CHECK( - cudaMemsetAsync(attr_forces.data(), 0, sizeof(float) * n * 2, stream)); - - MLCommon::copy(YY_prev.data(), YY.data(), (nnodes + 1) * 2, stream); -======= CUDA_CHECK(cudaMemsetAsync(static_cast(rep_forces.data()), 0, rep_forces.size() * sizeof(*rep_forces.data()), stream)); CUDA_CHECK(cudaMemsetAsync(static_cast(attr_forces.data()), 0, attr_forces.size() * sizeof(*attr_forces.data()), stream)); ->>>>>>> 3aacbc93e6367b1cac99e66d7604020985f1ff52 + TSNE::Reset_Normalization<<<1, 1, 0, stream>>>( Z_norm.data(), radiusd_squared.data(), bottomd.data(), NNODES, radiusd.data()); @@ -331,18 +263,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(IntegrationKernel_time); - - h_chk_finite = thrust::transform_reduce( - thrust::cuda::par.on(stream), YY.data(), YY.data() + (nnodes + 1) * 2, - FiniteTestUnary(), 0, thrust::plus()); - if (h_chk_finite) { - CUML_LOG_WARN( - "Non-finite result detected during Barnes Hut iteration: %d, returning last " - "known good positions.", - iter); - MLCommon::copy(YY.data(), YY_prev.data(), (nnodes + 1) * 2, stream); - break; - } } PRINT_TIMES; diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index 60749196e5..03355a5f87 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -41,14 +41,6 @@ namespace ML { namespace TSNE { -/** - * Unary op intended to be used as a Thrust transform, check if array elements non-finite. - */ -struct FiniteTestUnary { - __host__ __device__ bool operator()(const float x) const { return !isfinite(x); } -}; - - /** * Intializes the states of objects. This speeds the overall kernel up. */ @@ -670,12 +662,19 @@ __global__ void attractive_kernel_bh( if (index >= NNZ) return; const int i = ROW[index]; const int j = COL[index]; + float PQ; // TODO: Calculate Kullback-Leibler divergence // TODO: Convert attractive forces to CSR format - const float PQ = __fdividef( - VAL[index], - norm_add1[i] + norm[j] - 2.0f * (Y1[i] * Y1[j] + Y2[i] * Y2[j])); // P*Q + // Try single precision compute first + float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) + __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]); + + if (denominator == 0) { + /* repeat with double precision */ + double dbl_denominator = __fma_rn(-2.0f, Y1[i] * Y1[j], norm_add1[i]) + __fma_rn(-2.0f, Y2[i] * Y2[j], norm[j]); + denominator = (dbl_denominator != 0) ? static_cast(dbl_denominator) : FLT_EPSILON; + } + PQ = __fdividef(VAL[index], denominator); // Apply forces atomicAdd(&attract1[i], PQ * (Y1[i] - Y1[j])); From ea794b486ae03c75ea3d1b358df6defd8325bad0 Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Mon, 27 Jul 2020 16:34:38 -0600 Subject: [PATCH 08/14] Update attractive kernel to use a double fallback, or clamp to 1 if that fails --- cpp/src/tsne/barnes_hut.cuh | 4 ---- cpp/src/tsne/bh_kernels.cuh | 22 +++++++++++++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index 7c068f9437..f71bcb967a 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -56,7 +56,6 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, const int max_iter = 1000, const float min_grad_norm = 1e-7, const float pre_momentum = 0.5, const float post_momentum = 0.8, const long long random_state = -1) { - using MLCommon::device_buffer; auto d_alloc = handle.getDeviceAllocator(); cudaStream_t stream = handle.getStream(); @@ -267,10 +266,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, PRINT_TIMES; MLCommon::copy(Y, YY.data(), n, stream); - CUDA_CHECK(cudaPeekAtLastError()); - MLCommon::copy(Y + n, YY.data() + nnodes + 1, n, stream); - CUDA_CHECK(cudaPeekAtLastError()); } } // namespace TSNE diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index 03355a5f87..b79b34687e 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -35,7 +35,6 @@ #include #include -#include namespace ML { namespace TSNE { @@ -662,19 +661,28 @@ __global__ void attractive_kernel_bh( if (index >= NNZ) return; const int i = ROW[index]; const int j = COL[index]; - float PQ; // TODO: Calculate Kullback-Leibler divergence // TODO: Convert attractive forces to CSR format // Try single precision compute first float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) + __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]); - if (denominator == 0) { - /* repeat with double precision */ - double dbl_denominator = __fma_rn(-2.0f, Y1[i] * Y1[j], norm_add1[i]) + __fma_rn(-2.0f, Y2[i] * Y2[j], norm[j]); - denominator = (dbl_denominator != 0) ? static_cast(dbl_denominator) : FLT_EPSILON; + if (__builtin_expect(denominator == 0, false)) { + double _Y1 = static_cast(Y1[i] * Y1[j]); + double _Y2 = static_cast(Y2[i] * Y2[j]); + double dbl_denominator = __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]); + + if (__builtin_expect(dbl_denominator == 0, false)) { + printf("Detected zero in denominator with __fma_rn(-2.0f, %lf, %lf) + __fma_rn(-2.0f, %lf, %lf)\n", + _Y1, norm_add1[i], _Y2, norm[j]); + + dbl_denominator = 1.0f; + printf("dbl_denominator: %lf\n", dbl_denominator); + } + + denominator = dbl_denominator; } - PQ = __fdividef(VAL[index], denominator); + const float PQ = __fdividef(VAL[index], denominator); // Apply forces atomicAdd(&attract1[i], PQ * (Y1[i] - Y1[j])); From 6cf29f9bf2cf43dcb9bfc698749c43030927aaf4 Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Mon, 27 Jul 2020 17:21:51 -0600 Subject: [PATCH 09/14] Disable debug prints, add error checking/logging + early return from barnes hut --- cpp/src/tsne/barnes_hut.cuh | 14 ++++++++++++-- cpp/src/tsne/bh_kernels.cuh | 14 +++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index f71bcb967a..f89d41df93 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -62,6 +62,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, // Get device properites //--------------------------------------------------- const int blocks = MLCommon::getMultiProcessorCount(); + int h_flag; int nnodes = n * 2; if (nnodes < 1024 * blocks) nnodes = 1024 * blocks; @@ -75,11 +76,13 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, MLCommon::device_buffer maxdepthd(d_alloc, stream, 1); MLCommon::device_buffer bottomd(d_alloc, stream, 1); MLCommon::device_buffer radiusd(d_alloc, stream, 1); + MLCommon::device_buffer flag_unstable_computation(d_alloc, stream, 1); TSNE::InitializationKernel<<<1, 1, 0, stream>>>(/*errl.data(),*/ limiter.data(), maxdepthd.data(), - radiusd.data()); + radiusd.data(), + flag_unstable_computation.data()); CUDA_CHECK(cudaPeekAtLastError()); const int FOUR_NNODES = 4 * nnodes; @@ -248,10 +251,17 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, TSNE:: attractive_kernel_bh<<>>( VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(), - norm_add1.data(), attr_forces.data(), attr_forces.data() + n, NNZ); + norm_add1.data(), attr_forces.data(), attr_forces.data() + n, NNZ, flag_unstable_computation.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(attractive_time); + MLCommon::copy(&h_flag, flag_unstable_computation.data(), 1, stream); + if (h_flag) { + CUML_LOG_ERROR("Detected zero divisor in attractive force kernel, returning early." + " Results may not be accurate."); + break; + } + START_TIMER; TSNE::IntegrationKernel<<>>( learning_rate, momentum, early_exaggeration, YY.data(), diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index b79b34687e..42fe1a5a80 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -46,11 +46,13 @@ namespace TSNE { __global__ void InitializationKernel(/*int *restrict errd, */ unsigned *restrict limiter, int *restrict maxdepthd, - float *restrict radiusd) { + float *restrict radiusd, + int *restrict flag_unstable_computation) { // errd[0] = 0; maxdepthd[0] = 1; limiter[0] = 0; radiusd[0] = 0.0f; + flag_unstable_computation[0] = 0; } /** @@ -656,7 +658,8 @@ __global__ void attractive_kernel_bh( const float *restrict VAL, const int *restrict COL, const int *restrict ROW, const float *restrict Y1, const float *restrict Y2, const float *restrict norm, const float *restrict norm_add1, - float *restrict attract1, float *restrict attract2, const int NNZ) { + float *restrict attract1, float *restrict attract2, const int NNZ, + int *restrict flag_unstable_computation) { const int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= NNZ) return; const int i = ROW[index]; @@ -673,11 +676,12 @@ __global__ void attractive_kernel_bh( double dbl_denominator = __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]); if (__builtin_expect(dbl_denominator == 0, false)) { - printf("Detected zero in denominator with __fma_rn(-2.0f, %lf, %lf) + __fma_rn(-2.0f, %lf, %lf)\n", - _Y1, norm_add1[i], _Y2, norm[j]); + //printf("Detected zero in attractive force kernel denominator with __fma_rn(-2.0f, %lf, %lf) + " + // "__fma_rn(-2.0f, %lf, %lf)\nClamping denominator to 1.0: tSNE results may not be accurate.", + // _Y1, norm_add1[i], _Y2, norm[j]); dbl_denominator = 1.0f; - printf("dbl_denominator: %lf\n", dbl_denominator); + flag_unstable_computation[0] = 1; } denominator = dbl_denominator; From 8cedd0923c1b25736b971abff2b62f1b2609d061 Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Mon, 27 Jul 2020 19:19:29 -0600 Subject: [PATCH 10/14] Formatting updates --- cpp/src/tsne/barnes_hut.cuh | 11 +++++++---- cpp/src/tsne/bh_kernels.cuh | 7 ++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index ccfab7121a..ec53b92c03 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -82,7 +82,8 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, limiter.data(), maxdepthd.data(), radiusd.data(), - flag_unstable_computation.data()); + flag_unstable_computation + .data()); CUDA_CHECK(cudaPeekAtLastError()); const int FOUR_NNODES = 4 * nnodes; @@ -250,14 +251,16 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, TSNE:: attractive_kernel_bh<<>>( VAL, COL, ROW, YY.data(), YY.data() + nnodes + 1, norm.data(), - attr_forces.data(), attr_forces.data() + n, NNZ, flag_unstable_computation.data()); + attr_forces.data(), attr_forces.data() + n, NNZ, + flag_unstable_computation.data()); CUDA_CHECK(cudaPeekAtLastError()); END_TIMER(attractive_time); MLCommon::copy(&h_flag, flag_unstable_computation.data(), 1, stream); if (h_flag) { - CUML_LOG_ERROR("Detected zero divisor in attractive force kernel, returning early." - " Results may not be accurate."); + CUML_LOG_ERROR( + "Detected zero divisor in attractive force kernel, returning early." + " Results may not be accurate."); break; } diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index 722b13b0e2..0302afc5a4 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -39,7 +39,6 @@ namespace ML { namespace TSNE { - /** * Intializes the states of objects. This speeds the overall kernel up. */ @@ -666,12 +665,14 @@ __global__ void attractive_kernel_bh( // TODO: Calculate Kullback-Leibler divergence // TODO: Convert attractive forces to CSR format // Try single precision compute first - float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) + __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]); + float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) + + __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]); if (__builtin_expect(denominator == 0, false)) { double _Y1 = static_cast(Y1[i] * Y1[j]); double _Y2 = static_cast(Y2[i] * Y2[j]); - double dbl_denominator = __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]); + double dbl_denominator = + __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]); if (__builtin_expect(dbl_denominator == 0, false)) { //printf("Detected zero in attractive force kernel denominator with __fma_rn(-2.0f, %lf, %lf) + " From 17f9529b73ee53cc24279bd3bb053723569e7b24 Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Mon, 27 Jul 2020 19:28:05 -0600 Subject: [PATCH 11/14] Merge fix --- cpp/src/tsne/bh_kernels.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index 0302afc5a4..4278a30b37 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -672,7 +672,7 @@ __global__ void attractive_kernel_bh( double _Y1 = static_cast(Y1[i] * Y1[j]); double _Y2 = static_cast(Y2[i] * Y2[j]); double dbl_denominator = - __fma_rn(-2.0f, _Y1, norm_add1[i]) + __fma_rn(-2.0f, _Y2, norm[j]); + __fma_rn(-2.0f, _Y1, norm[i] + 1.0f) + __fma_rn(-2.0f, _Y2, norm[j]); if (__builtin_expect(dbl_denominator == 0, false)) { //printf("Detected zero in attractive force kernel denominator with __fma_rn(-2.0f, %lf, %lf) + " From 9fec2631f54186b6362df017dfc6e75d841efdec Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Mon, 27 Jul 2020 20:45:06 -0600 Subject: [PATCH 12/14] Fix typo --- cpp/src/tsne/bh_kernels.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index 4278a30b37..bc80238b2f 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -665,7 +665,7 @@ __global__ void attractive_kernel_bh( // TODO: Calculate Kullback-Leibler divergence // TODO: Convert attractive forces to CSR format // Try single precision compute first - float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm_add1[i]) + + float denominator = __fmaf_rn(-2.0f, (Y1[i] * Y1[j]), norm[i] + 1.0f) + __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]); if (__builtin_expect(denominator == 0, false)) { From 9af3ca8ddfe2fdda5118d043c2272c185c664514 Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Tue, 28 Jul 2020 11:42:44 -0600 Subject: [PATCH 13/14] Improve error description, fix double-fallback multiply --- cpp/src/tsne/barnes_hut.cuh | 7 +++++-- cpp/src/tsne/bh_kernels.cuh | 11 ++--------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index ec53b92c03..83efca82d3 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -259,8 +259,10 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, MLCommon::copy(&h_flag, flag_unstable_computation.data(), 1, stream); if (h_flag) { CUML_LOG_ERROR( - "Detected zero divisor in attractive force kernel, returning early." - " Results may not be accurate."); + "Detected zero divisor in attractive force kernel after '%d' iterations;" + " returning early. Your final results may not be accurate. In some cases" + " this error can be resolved by increasing perplexity, and n_neighbors;" + " if the problem persists, please use 'method=exact'.", iter); break; } @@ -277,6 +279,7 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, } PRINT_TIMES; + // Copy final YY into true output Y MLCommon::copy(Y, YY.data(), n, stream); MLCommon::copy(Y + n, YY.data() + nnodes + 1, n, stream); } diff --git a/cpp/src/tsne/bh_kernels.cuh b/cpp/src/tsne/bh_kernels.cuh index bc80238b2f..c4ca3b657d 100644 --- a/cpp/src/tsne/bh_kernels.cuh +++ b/cpp/src/tsne/bh_kernels.cuh @@ -255,10 +255,7 @@ __global__ __launch_bounds__( const int cell = atomicSub(bottomd, 1) - 1; if (cell == N) { - // atomicExch(errd, 1); atomicExch(bottomd, NNODES); - //printf("Cell Allocation Overflow, depth=%d, r=%f, rbound=%f, N=%d, NNODES=%d, bottomd=%d\n", - // depth, r, radius, N, NNODES, prev_bottom); } else if (cell < N) { depth--; continue; @@ -669,16 +666,12 @@ __global__ void attractive_kernel_bh( __fmaf_rn(-2.0f, (Y2[i] * Y2[j]), norm[j]); if (__builtin_expect(denominator == 0, false)) { - double _Y1 = static_cast(Y1[i] * Y1[j]); - double _Y2 = static_cast(Y2[i] * Y2[j]); + double _Y1 = static_cast(Y1[i]) * static_cast(Y1[j]); + double _Y2 = static_cast(Y2[i]) * static_cast(Y2[j]); double dbl_denominator = __fma_rn(-2.0f, _Y1, norm[i] + 1.0f) + __fma_rn(-2.0f, _Y2, norm[j]); if (__builtin_expect(dbl_denominator == 0, false)) { - //printf("Detected zero in attractive force kernel denominator with __fma_rn(-2.0f, %lf, %lf) + " - // "__fma_rn(-2.0f, %lf, %lf)\nClamping denominator to 1.0: tSNE results may not be accurate.", - // _Y1, norm_add1[i], _Y2, norm[j]); - dbl_denominator = 1.0f; flag_unstable_computation[0] = 1; } From 911efccdec267af45f57767fcb00ea7de44f11d7 Mon Sep 17 00:00:00 2001 From: Devin Robison Date: Tue, 28 Jul 2020 11:51:47 -0600 Subject: [PATCH 14/14] Style fix --- cpp/src/tsne/barnes_hut.cuh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/tsne/barnes_hut.cuh b/cpp/src/tsne/barnes_hut.cuh index 83efca82d3..642339ccf4 100644 --- a/cpp/src/tsne/barnes_hut.cuh +++ b/cpp/src/tsne/barnes_hut.cuh @@ -259,10 +259,13 @@ void Barnes_Hut(float *VAL, const int *COL, const int *ROW, const int NNZ, MLCommon::copy(&h_flag, flag_unstable_computation.data(), 1, stream); if (h_flag) { CUML_LOG_ERROR( - "Detected zero divisor in attractive force kernel after '%d' iterations;" - " returning early. Your final results may not be accurate. In some cases" + "Detected zero divisor in attractive force kernel after '%d' " + "iterations;" + " returning early. Your final results may not be accurate. In some " + "cases" " this error can be resolved by increasing perplexity, and n_neighbors;" - " if the problem persists, please use 'method=exact'.", iter); + " if the problem persists, please use 'method=exact'.", + iter); break; }