From bb4f2d22a2c64711a23df9b1592c3fb607fc0755 Mon Sep 17 00:00:00 2001
From: Konstantinos Parasyris
Date: Fri, 27 Oct 2023 18:00:12 -0700
Subject: [PATCH] Add tests for FAISS and release memory (#15)

* Add faiss-index test and fix faiss-gpu synchronization issue.

* Delete HDCache and free example memory

* Delete index only if loaded
---
 examples/main.cpp           |   9 +
 src/ml/hdcache.hpp          |  33 ++-
 src/wf/cuda/utilities.cuh   | 575 +++++++++++++++++++++---------------
 src/wf/resource_manager.hpp |   2 +-
 tests/CMakeLists.txt        |  13 +
 tests/faiss_debug.pt        | Bin 0 -> 16045 bytes
 tests/generate_faiss.py     |  73 +++++
 tests/test_hdcache.cpp      | 182 ++++++++++++
 8 files changed, 643 insertions(+), 244 deletions(-)
 create mode 100644 tests/faiss_debug.pt
 create mode 100644 tests/generate_faiss.py
 create mode 100644 tests/test_hdcache.cpp

diff --git a/examples/main.cpp b/examples/main.cpp
index e7ae3e87..4c6e740f 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -700,6 +700,15 @@ int main(int argc, char **argv)
     CALIPER(CALI_MARK_END("Cycle");)
     MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
   }
+
+  delete[] workflow;
+
+  // TODO: Add smart-pointers
+  for (int mat_idx = 0; mat_idx < num_mats; ++mat_idx) {
+    delete eoses[mat_idx];
+    eoses[mat_idx] = nullptr;
+  }
+
   CALIPER(CALI_MARK_END("TimeStepLoop"););
   MPI_CALL(MPI_Finalize());
   return 0;
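The hunk above frees the per-material EOS objects with explicit deletes and flags a TODO for smart pointers. A minimal sketch of what that TODO could become, assuming the surrounding code owns the objects in a plain array today (only `eoses`, `num_mats`, and the EOS type come from the hunk; the rest is hypothetical):

#include <memory>
#include <vector>

// Owning container: the unique_ptr destructors run when `eoses` goes out
// of scope at the end of main, making the explicit delete loop redundant.
std::vector<std::unique_ptr<EOS>> eoses;
for (int mat_idx = 0; mat_idx < num_mats; ++mat_idx)
  eoses.push_back(std::make_unique<EOS>(/* per-material setup */));
// Pass eoses[mat_idx].get() wherever a raw EOS* is still expected.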
diff --git a/src/ml/hdcache.hpp b/src/ml/hdcache.hpp
index a610fcff..cfaa9548 100644
--- a/src/ml/hdcache.hpp
+++ b/src/ml/hdcache.hpp
@@ -248,7 +248,15 @@ class HDCache
     return new_cache;
   }
 
-  ~HDCache() { DBG(Surrogate, "Destroying UQ-cache") }
+  ~HDCache()
+  {
+    DBG(UQModule, "Deleting UQ-Module");
+    if (m_index) {
+      DBG(UQModule, "Deleting HD-Cache");
+      m_index->reset();
+      delete m_index;
+    }
+  }
 
   //! ------------------------------------------------------------------------
   //! simple queries
@@ -400,6 +408,7 @@ class HDCache
     } else {
       _evaluate(ndata, data, is_acceptable);
     }
+    DBG(UQModule, "Done with evaluation of uq");
   }
 
   //! train on data that comes separate features (a vector of pointers)
@@ -431,6 +440,7 @@ class HDCache
       _evaluate(ndata, lin_data, is_acceptable);
       ams::ResourceManager::deallocate(lin_data, defaultRes);
     }
+    DBG(UQModule, "Done with evaluation of uq");
   }
 
 private:
@@ -529,28 +539,35 @@ class HDCache
     for (int start = 0; start < ndata; start += MAGIC_NUMBER) {
       unsigned int nElems =
           ((ndata - start) < MAGIC_NUMBER) ? ndata - start : MAGIC_NUMBER;
+      DBG(UQModule, "Running for %d elements %d %d", nElems, start, m_dim);
       m_index->search(nElems,
-                      &data[start],
+                      &data[start * m_dim],
                       knbrs,
                       &kdists[start * knbrs],
                       &kidxs[start * knbrs]);
     }
 
+#ifdef __ENABLE_CUDA__
+    faiss::gpu::synchronizeAllDevices();
+#endif
     // compute means
     if (defaultRes == AMSResourceType::HOST) {
-      TypeValue total_dist = 0;
       for (size_t i = 0; i < ndata; ++i) {
         CFATAL(UQModule,
                m_policy == AMSUQPolicy::DeltaUQ,
                "DeltaUQ is not supported yet");
         if (m_policy == AMSUQPolicy::FAISSMean) {
-          total_dist =
-              std::accumulate(kdists + i * knbrs, kdists + (i + 1) * knbrs, 0.);
-          is_acceptable[i] = (ook * total_dist) < acceptable_error;
+          TypeValue mean_dist = std::accumulate(kdists + i * knbrs,
+                                                kdists + (i + 1) * knbrs,
+                                                0.) *
+                                ook;
+          is_acceptable[i] = mean_dist < acceptable_error;
         } else if (m_policy == AMSUQPolicy::FAISSMax) {
          // Take the furthest cluster as the distance metric
-          total_dist = kdists[i * knbrs + knbrs - 1];
-          is_acceptable[i] = (total_dist) < acceptable_error;
+          TypeValue max_dist =
+              *std::max_element(&kdists[i * knbrs],
+                                &kdists[(i + 1) * knbrs]);
+          is_acceptable[i] = (max_dist) < acceptable_error;
        }
       }
     } else {
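For context, both FAISS policies reduce the knbrs nearest-neighbor distances of a query point to one score before comparing against acceptable_error: FAISSMean multiplies the summed distances by ook (one-over-k) to get the average, while FAISSMax keeps only the largest distance, so it accepts strictly fewer points. A small standalone sketch of that reduction (hypothetical helper, mirroring the _evaluate logic above):

#include <algorithm>
#include <numeric>

// kdists holds k distances per point, row-major, as filled by m_index->search.
// out[i] == true means point i is close enough to the index to trust the model.
void knn_acceptance(const double *kdists, size_t n, int k,
                    double acceptable_error, bool use_max, bool *out)
{
  for (size_t i = 0; i < n; ++i) {
    const double *row = &kdists[i * k];
    double d = use_max ? *std::max_element(row, row + k)         // FAISSMax
                       : std::accumulate(row, row + k, 0.) / k;  // FAISSMean
    out[i] = d < acceptable_error;
  }
}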
diff --git a/src/wf/cuda/utilities.cuh b/src/wf/cuda/utilities.cuh
index 76818857..7f10f26c 100644
--- a/src/wf/cuda/utilities.cuh
+++ b/src/wf/cuda/utilities.cuh
@@ -23,13 +23,9 @@
 const int warpSize = 32;
 const unsigned int fullMask = 0xffffffff;
 
-__host__ int divup(int x, int y) {
-    return (x + y - 1) / y;
-}
+__host__ int divup(int x, int y) { return (x + y - 1) / y; }
 
-__device__ __inline__ int pow2i(int e) {
-    return 1 << e;
-}
+__device__ __inline__ int pow2i(int e) { return 1 << e; }
 
 // Define this to turn on error checking
 #define CUDA_ERROR_CHECK
@@ -37,207 +33,247 @@ __device__ __inline__ int pow2i(int e) {
 #define CUDASAFECALL(err) __cudaSafeCall(err, __FILE__, __LINE__)
 #define CUDACHECKERROR() __cudaCheckError(__FILE__, __LINE__)
 
-inline void __cudaSafeCall(cudaError err, const char* file, const int line) {
+inline void __cudaSafeCall(cudaError err, const char* file, const int line)
+{
 #ifdef CUDA_ERROR_CHECK
-    if (cudaSuccess != err) {
-        fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n", file, line,
-                cudaGetErrorString(err));
-
-        fprintf(stdout, "cudaSafeCall() failed at %s:%i : %s\n", file, line,
-                cudaGetErrorString(err));
-        exit(-1);
-    }
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "cudaSafeCall() failed at %s:%i : %s\n",
+            file,
+            line,
+            cudaGetErrorString(err));
+
+    fprintf(stdout,
+            "cudaSafeCall() failed at %s:%i : %s\n",
+            file,
+            line,
+            cudaGetErrorString(err));
+    exit(-1);
+  }
 #endif
-    return;
+  return;
 }
 
 struct is_true {
-    __host__ __device__ bool operator()(const int x) { return x; }
+  __host__ __device__ bool operator()(const int x) { return x; }
 };
 
 struct is_false {
-    __host__ __device__ bool operator()(const int x) { return !x; }
+  __host__ __device__ bool operator()(const int x) { return !x; }
 };
 
-inline void __cudaCheckError(const char* file, const int line) {
+inline void __cudaCheckError(const char* file, const int line)
+{
 #ifdef CUDA_ERROR_CHECK
-    cudaError err = cudaGetLastError();
-    if (cudaSuccess != err) {
-        fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line,
-                cudaGetErrorString(err));
-        exit(-1);
-    }
-
-    // More careful checking. However, this will affect performance.
-    // Comment away if needed.
-    err = cudaDeviceSynchronize();
-    if (cudaSuccess != err) {
-        fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, line,
-                cudaGetErrorString(err));
-        exit(-1);
-    }
+  cudaError err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "cudaCheckError() failed at %s:%i : %s\n",
+            file,
+            line,
+            cudaGetErrorString(err));
+    exit(-1);
+  }
+
+  // More careful checking. However, this will affect performance.
+  // Comment away if needed.
+  err = cudaDeviceSynchronize();
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "cudaCheckError() with sync failed at %s:%i : %s\n",
+            file,
+            line,
+            cudaGetErrorString(err));
+    exit(-1);
+  }
 #endif
-    return;
+  return;
 }
 
-__global__ void srand_dev(curandState* states, const int total_threads) {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    if (id < total_threads) {
-        int seed = id; // different seed per thread
-        curand_init(seed, id, 0, &states[id]);
-    }
+__global__ void srand_dev(curandState* states, const int total_threads)
+{
+  int id = threadIdx.x + blockDim.x * blockIdx.x;
+  if (id < total_threads) {
+    int seed = id;  // different seed per thread
+    curand_init(seed, id, 0, &states[id]);
+  }
 }
 
-__global__ void initIndices(int* ind, int length) {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    if (id < length)
-        ind[id] = id;
+__global__ void initIndices(int* ind, int length)
+{
+  int id = threadIdx.x + blockDim.x * blockIdx.x;
+  if (id < length) ind[id] = id;
 }
 
 template <typename T>
-__global__ void fillRandom(bool* predicate, const int total_threads, curandState* states,
-                           const size_t length, T threshold) {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    if (id < total_threads) {
-        for (int i = id; i < length; i += total_threads) {
-            float x = curand_uniform(&states[id]);
-            predicate[i] = (x <= threshold);
-        }
-    }
+__global__ void fillRandom(bool* predicate,
+                           const int total_threads,
+                           curandState* states,
+                           const size_t length,
+                           T threshold)
+{
+  int id = threadIdx.x + blockDim.x * blockIdx.x;
+  if (id < total_threads) {
+    for (int i = id; i < length; i += total_threads) {
+      float x = curand_uniform(&states[id]);
+      predicate[i] = (x <= threshold);
    }
+  }
 }
 
 template <typename T>
-__global__ void computeBlockCounts(bool cond, T* d_input, int length, int* d_BlockCounts) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx < length) {
-        int pred = ( d_input[idx] == cond );
-        int BC = __syncthreads_count(pred);
-
-        if (threadIdx.x == 0) {
-            d_BlockCounts[blockIdx.x] =
-                BC; // BC will contain the number of valid elements in all threads of this thread block
-        }
-    }
+__global__ void computeBlockCounts(bool cond,
+                                   T* d_input,
+                                   int length,
+                                   int* d_BlockCounts)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < length) {
+    int pred = (d_input[idx] == cond);
+    int BC = __syncthreads_count(pred);
+
+    if (threadIdx.x == 0) {
+      d_BlockCounts[blockIdx.x] =
+          BC;  // BC will contain the number of valid elements in all threads of this thread block
+    }
+  }
 }
 
 template <typename T>
-__global__ void assignK(T** sparse, T** dense, int* indices, size_t length, int dims,
-                        bool isReverse) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx < length) {
-        int index = indices[idx];
-        if (!isReverse) {
-            for (int i = 0; i < dims; i++) {
-                dense[i][idx] = sparse[i][index];
-            }
-        } else {
-            for (int i = 0; i < dims; i++) {
-                sparse[i][index] = dense[i][idx];
-            }
-        }
-    }
+__global__ void assignK(T** sparse,
+                        T** dense,
+                        int* indices,
+                        size_t length,
+                        int dims,
+                        bool isReverse)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < length) {
+    int index = indices[idx];
+    if (!isReverse) {
+      for (int i = 0; i < dims; i++) {
+        dense[i][idx] = sparse[i][index];
+      }
+    } else {
+      for (int i = 0; i < dims; i++) {
+        sparse[i][index] = dense[i][idx];
+      }
+    }
+  }
 }
 
 template <typename T>
-__global__ void compactK(bool cond, T** d_input, T** d_output,
-                         const bool* predicates, const size_t length, int dims,
-                         int* d_BlocksOffset, bool reverse) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    extern __shared__ int warpTotals[];
-    if (idx < length) {
-        int pred = (predicates[idx] == cond);
-        int w_i = threadIdx.x / warpSize; //warp index
-        int w_l = idx % warpSize; //thread index within a warp
-
-        // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp
-        int t_m = fullMask >> (warpSize - w_l); //thread mask
-#if (CUDART_VERSION < 9000)
-        int b = __ballot(pred) & t_m; //ballot result = number whose ith bit
-                                      //is one if the ith's thread pred is true
-                                      //masked up to the current index in warp
-#else
-        int b = __ballot_sync(fullMask, pred) & t_m;
-#endif
-        int t_u = __popc(
-            b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
-
-        // last thread in warp computes total valid counts for the warp
-        if (w_l == warpSize - 1) {
-            warpTotals[w_i] = t_u + pred;
-        }
-
-        // need all warps in thread block to fill in warpTotals before proceeding
-        __syncthreads();
-
-        // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block
-        int numWarps = blockDim.x / warpSize;
-        unsigned int numWarpsMask = fullMask >> (warpSize - numWarps);
-        if (w_i == 0 && w_l < numWarps) {
-            int w_i_u = 0;
-            for ( int j = 0; j <= 5; j++) {
-#if (CUDART_VERSION < 9000)
-                int b_j = __ballot(warpTotals[w_l] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets
-#else
-                int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j));
-#endif
-                w_i_u += (__popc(b_j & t_m)) << j;
-            }
-            warpTotals[w_l] = w_i_u;
-        }
-
-        // need all warps in thread block to wait until prefix sum is calculated in warpTotals
-        __syncthreads();
-
-        // if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid
-        if (pred) {
-            if (!reverse) {
-                for (int i = 0; i < dims; i++)
-                    d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] =
-                        d_input[i][idx];
-            } else {
-                for (int i = 0; i < dims; i++)
-                    d_input[i][idx] =
-                        d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]];
-            }
-        }
-    }
-}
+__global__ void compactK(bool cond,
+                         T** d_input,
+                         T** d_output,
+                         const bool* predicates,
+                         const size_t length,
+                         int dims,
+                         int* d_BlocksOffset,
+                         bool reverse)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  extern __shared__ int warpTotals[];
+  if (idx < length) {
+    int pred = (predicates[idx] == cond);
+    int w_i = threadIdx.x / warpSize;  //warp index
+    int w_l = idx % warpSize;          //thread index within a warp
+
+    // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp
+    int t_m = fullMask >> (warpSize - w_l);  //thread mask
+#if (CUDART_VERSION < 9000)
+    int b = __ballot(pred) & t_m;  //ballot result = number whose ith bit
+                                   //is one if the ith thread's pred is true,
+                                   //masked up to the current index in warp
+#else
+    int b = __ballot_sync(fullMask, pred) & t_m;
+#endif
+    int t_u = __popc(
+        b);  // popc counts the one bits, i.e. the number predicated true BEFORE MY INDEX
+
+    // last thread in warp computes total valid counts for the warp
+    if (w_l == warpSize - 1) {
+      warpTotals[w_i] = t_u + pred;
+    }
+
+    // need all warps in thread block to fill in warpTotals before proceeding
+    __syncthreads();
+
+    // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block
+    int numWarps = blockDim.x / warpSize;
+    unsigned int numWarpsMask = fullMask >> (warpSize - numWarps);
+    if (w_i == 0 && w_l < numWarps) {
+      int w_i_u = 0;
+      for (int j = 0; j <= 5; j++) {
+#if (CUDART_VERSION < 9000)
+        int b_j = __ballot(
+            warpTotals[w_l] &
+            pow2i(j));  //# of the ones in the j'th digit of the warp offsets
+#else
+        int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j));
+#endif
+        w_i_u += (__popc(b_j & t_m)) << j;
+      }
+      warpTotals[w_l] = w_i_u;
+    }
+
+    // need all warps in thread block to wait until prefix sum is calculated in warpTotals
+    __syncthreads();
+
+    // if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid
+    if (pred) {
+      if (!reverse) {
+        for (int i = 0; i < dims; i++)
+          d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] =
+              d_input[i][idx];
+      } else {
+        for (int i = 0; i < dims; i++)
+          d_input[i][idx] =
+              d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]];
+      }
+    }
+  }
+}
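To make the warp-level arithmetic in compactK concrete: __ballot_sync gives every thread a 32-bit mask of which lanes hold valid elements, the per-lane mask t_m keeps only lanes below the current one, and __popc of the intersection is that lane's exclusive output offset. A worked example for one warp (values chosen purely for illustration):

// Suppose lanes 0..7 of a warp have pred = 1,0,1,1,0,0,1,0 (all higher lanes 0).
// __ballot_sync(fullMask, pred) = 0b01001101 = 0x4D      (bits 0, 2, 3, 6 set)
// For lane 6: t_m = fullMask >> (warpSize - 6) = 0b111111 (lanes 0..5 only)
// b = 0x4D & 0b111111 = 0b001101  ->  t_u = __popc(b) = 3
// So lane 6 writes its element to slot 3 within this warp's output range;
// the last lane then stores t_u + pred (= 4 valid lanes) into warpTotals.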
 
 template <typename TypeOutValue, typename TypeInValue>
-void __global__ linearizeK(TypeOutValue *output, const TypeInValue * const *inputs, size_t dims, size_t elements){
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if ( idx >= elements )
-        return;
-    for (int i = 0; i < dims; i++ ){
-        output[ idx * dims + i] = static_cast<TypeOutValue>(inputs[i][idx]);
-    }
+void __global__ linearizeK(TypeOutValue* output,
+                           const TypeInValue* const* inputs,
+                           size_t dims,
+                           size_t elements)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= elements) return;
+  for (int i = 0; i < dims; i++) {
+    output[idx * dims + i] = static_cast<TypeOutValue>(inputs[i][idx]);
+  }
 }
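linearizeK is the device-side layout change the FAISS index needs: AMS stores each feature as its own contiguous array (struct-of-arrays), while faiss::Index::search expects a single row-major buffer with dims values per point. A tiny illustration of the transform (2 features, 3 elements):

// inputs[0] = {x0, x1, x2}    inputs[1] = {y0, y1, y2}
// After linearizeK(output, inputs, /*dims=*/2, /*elements=*/3):
// output = {x0, y0, x1, y1, x2, y2}
// i.e. point i occupies output[i * dims .. i * dims + dims - 1]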
 
-void __global__ compute_predicate( float *data, bool *predicate, size_t nData, const size_t kneigh, float threshold){
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if ( idx >= nData )
-        return;
-
-    int index = idx * kneigh;
-    float acc = 0.0f;
-    for (int i = 0; i < kneigh; i++)
-        acc += data[index + i];
-    acc /= static_cast<float>(kneigh);
-
-    bool pred = acc < threshold ? true : false;
-
-    predicate[idx] = pred;
-}
+void __global__ compute_predicate(float* data,
+                                  bool* predicate,
+                                  size_t nData,
+                                  const size_t kneigh,
+                                  float threshold)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= nData) return;
+
+  int index = idx * kneigh;
+  float acc = 0.0f;
+  for (int i = 0; i < kneigh; i++) {
+    acc += data[index + i];
+  }
+
+  acc /= static_cast<float>(kneigh);
+
+  bool pred = acc < threshold ? true : false;
+
+  predicate[idx] = pred;
+}
 
 template <typename T>
@@ -248,107 +284,176 @@ int compact(bool cond,
             const size_t length,
             int dims,
             int blockSize,
-            bool isReverse = false) {
-    int numBlocks = divup(length, blockSize);
-    int* d_BlocksCount = ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::DEVICE);
-    int* d_BlocksOffset = ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::DEVICE);
-    // determine number of elements in the compacted list
-    int* h_BlocksCount = ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::HOST);
-    int* h_BlocksOffset = ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::HOST);
-
-    T** d_dense = ams::ResourceManager::allocate<T*>(dims, AMSResourceType::DEVICE);
-    T** d_sparse = ams::ResourceManager::allocate<T*>(dims, AMSResourceType::DEVICE);
-
-    ams::ResourceManager::registerExternal(dense, sizeof(T*) * dims, AMSResourceType::HOST);
-    ams::ResourceManager::registerExternal(sparse, sizeof(T*) * dims, AMSResourceType::HOST);
-    ams::ResourceManager::copy(dense, d_dense);
-    ams::ResourceManager::copy(const_cast<T**>(sparse), d_sparse);
-    thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
-    thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);
-
-    //phase 1: count number of valid elements in each thread block
-    computeBlockCounts<<<numBlocks, blockSize>>>(cond, dPredicate, length, d_BlocksCount);
-
-    //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid
-    thrust::exclusive_scan(thrust::device, d_BlocksCount, d_BlocksCount + numBlocks, d_BlocksOffset);
-
-    //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements
-    compactK<<<numBlocks, blockSize, sizeof(int) * (blockSize / warpSize)>>>( cond,
-        d_sparse, d_dense, dPredicate, length, dims, d_BlocksOffset, isReverse);
-
-    ams::ResourceManager::copy(d_BlocksCount, h_BlocksCount);
-    ams::ResourceManager::copy(d_BlocksOffset, h_BlocksOffset);
-    int compact_length = h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1];
-
-    ams::ResourceManager::deallocate(d_BlocksCount, AMSResourceType::DEVICE);
-    ams::ResourceManager::deallocate(d_BlocksOffset, AMSResourceType::DEVICE);
-
-    ams::ResourceManager::deallocate(h_BlocksCount, AMSResourceType::HOST);
-    ams::ResourceManager::deallocate(h_BlocksOffset, AMSResourceType::HOST);
-
-    ams::ResourceManager::deallocate(d_dense, AMSResourceType::DEVICE);
-    ams::ResourceManager::deallocate(d_sparse, AMSResourceType::DEVICE);
-
-    ams::ResourceManager::deregisterExternal(dense);
-    ams::ResourceManager::deregisterExternal(sparse);
-    cudaDeviceSynchronize();
-
-    return compact_length;
-}
+            bool isReverse = false)
+{
+  int numBlocks = divup(length, blockSize);
+  int* d_BlocksCount =
+      ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::DEVICE);
+  int* d_BlocksOffset =
+      ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::DEVICE);
+  // determine number of elements in the compacted list
+  int* h_BlocksCount =
+      ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::HOST);
+  int* h_BlocksOffset =
+      ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::HOST);
+
+  T** d_dense =
+      ams::ResourceManager::allocate<T*>(dims, AMSResourceType::DEVICE);
+  T** d_sparse =
+      ams::ResourceManager::allocate<T*>(dims, AMSResourceType::DEVICE);
+
+  ams::ResourceManager::registerExternal(dense,
+                                         sizeof(T*) * dims,
+                                         AMSResourceType::HOST);
+  ams::ResourceManager::registerExternal(sparse,
+                                         sizeof(T*) * dims,
+                                         AMSResourceType::HOST);
+  ams::ResourceManager::copy(dense, d_dense);
+  ams::ResourceManager::copy(const_cast<T**>(sparse), d_sparse);
+  thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
+  thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);
+
+  //phase 1: count number of valid elements in each thread block
+  computeBlockCounts<<<numBlocks, blockSize>>>(cond,
+                                               dPredicate,
+                                               length,
+                                               d_BlocksCount);
+
+  //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid
+  thrust::exclusive_scan(thrust::device,
+                         d_BlocksCount,
+                         d_BlocksCount + numBlocks,
+                         d_BlocksOffset);
+
+  //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements
+  compactK<<<numBlocks, blockSize, sizeof(int) * (blockSize / warpSize)>>>(
+      cond,
+      d_sparse,
+      d_dense,
+      dPredicate,
+      length,
+      dims,
+      d_BlocksOffset,
+      isReverse);
+
+  ams::ResourceManager::copy(d_BlocksCount, h_BlocksCount);
+  ams::ResourceManager::copy(d_BlocksOffset, h_BlocksOffset);
+  int compact_length =
+      h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1];
+
+  ams::ResourceManager::deallocate(d_BlocksCount, AMSResourceType::DEVICE);
+  ams::ResourceManager::deallocate(d_BlocksOffset, AMSResourceType::DEVICE);
+
+  ams::ResourceManager::deallocate(h_BlocksCount, AMSResourceType::HOST);
+  ams::ResourceManager::deallocate(h_BlocksOffset, AMSResourceType::HOST);
+
+  ams::ResourceManager::deallocate(d_dense, AMSResourceType::DEVICE);
+  ams::ResourceManager::deallocate(d_sparse, AMSResourceType::DEVICE);
+
+  ams::ResourceManager::deregisterExternal(dense);
+  ams::ResourceManager::deregisterExternal(sparse);
+  cudaDeviceSynchronize();
+
+  return compact_length;
+}
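The host wrapper above is the classic three-phase stream compaction: per-block counts, an exclusive scan over those counts to get each block's base output offset, then a scatter using the intra-block offsets computed in compactK. The scan step on its own, as a minimal standalone thrust sketch with illustrative values:

#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <vector>

// Per-block valid-element counts -> starting offset of each block's output.
// counts  = {3, 0, 5, 2}
// offsets = {0, 3, 3, 8}; compacted length = offsets[3] + counts[3] = 10
std::vector<int> h_counts{3, 0, 5, 2};
thrust::device_vector<int> counts(h_counts.begin(), h_counts.end());
thrust::device_vector<int> offsets(counts.size());
thrust::exclusive_scan(counts.begin(), counts.end(), offsets.begin());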
 
 template <typename T>
-int compact(bool cond, T** sparse, T** dense, int* indices, const size_t length, int dims, int blockSize,
-            const bool* dPredicate, bool isReverse = false) {
-    int numBlocks = divup(length, blockSize);
-    size_t sparseElements = length;
-
-    if (!isReverse) {
-        initIndices<<<numBlocks, blockSize>>>(indices, length);
-        if ( cond ){
-            auto last = thrust::copy_if(thrust::device, indices, indices + sparseElements, dPredicate,
-                                        indices, is_true());
-            sparseElements = last - indices;
-        }
-        else{
-            auto last = thrust::copy_if(thrust::device, indices, indices + sparseElements, dPredicate,
-                                        indices, is_false());
-            sparseElements = last - indices;
-        }
-    }
-
-    assignK<<<numBlocks, blockSize>>>(sparse, dense, indices, sparseElements, dims, isReverse);
-    cudaDeviceSynchronize();
-
-    return sparseElements;
-}
+int compact(bool cond,
+            T** sparse,
+            T** dense,
+            int* indices,
+            const size_t length,
+            int dims,
+            int blockSize,
+            const bool* dPredicate,
+            bool isReverse = false)
+{
+  int numBlocks = divup(length, blockSize);
+  size_t sparseElements = length;
+
+  if (!isReverse) {
+    initIndices<<<numBlocks, blockSize>>>(indices, length);
+    if (cond) {
+      auto last = thrust::copy_if(thrust::device,
+                                  indices,
+                                  indices + sparseElements,
+                                  dPredicate,
+                                  indices,
+                                  is_true());
+      sparseElements = last - indices;
+    } else {
+      auto last = thrust::copy_if(thrust::device,
+                                  indices,
+                                  indices + sparseElements,
+                                  dPredicate,
+                                  indices,
+                                  is_false());
+      sparseElements = last - indices;
+    }
+  }
+
+  assignK<<<numBlocks, blockSize>>>(
+      sparse, dense, indices, sparseElements, dims, isReverse);
+  cudaDeviceSynchronize();
+
+  return sparseElements;
+}
 
 template <typename TypeOutValue, typename TypeInValue>
-void device_linearize(TypeOutValue *output, const TypeInValue * const *inputs, size_t dims, size_t elements){
-    // TODO: Fix "magic number".
-    const int NT = 256;
-    // TODO: We should add a max number of blocks typically this should be around 3K.
-    int NB = (elements + NT - 1 ) / NT;
-    linearizeK<<<NB, NT>>>(output, inputs, dims, elements);
-    cudaDeviceSynchronize();
-}
+void device_linearize(TypeOutValue* output,
+                      const TypeInValue* const* inputs,
+                      size_t dims,
+                      size_t elements)
+{
+  // TODO: Fix "magic number".
+  const int NT = 256;
+  // TODO: We should add a max number of blocks; typically this should be around 3K.
+  int NB = (elements + NT - 1) / NT;
+  DBG(Device,
+      "Linearize using %d blocks %d threads to transpose a %ld x %ld matrix",
+      NB,
+      NT,
+      dims,
+      elements);
+  linearizeK<<<NB, NT>>>(output, inputs, dims, elements);
+  cudaDeviceSynchronize();
+}
 
 template <typename T>
-void cuda_rand_init(bool* predicate, const size_t length, T threshold) {
-    static curandState* dev_random = NULL;
-    const int TS = 4096;
-    const int BS = 128;
-    int numBlocks = divup(TS, BS);
-    if (!dev_random) {
-        dev_random = ams::ResourceManager::allocate<curandState>(4096);
-        srand_dev<<<numBlocks, BS>>>(dev_random, TS);
-    }
-
-    fillRandom<<<numBlocks, BS>>>(predicate, TS, dev_random, length, threshold);
-    cudaDeviceSynchronize();
-}
+void cuda_rand_init(bool* predicate, const size_t length, T threshold)
+{
+  static curandState* dev_random = NULL;
+  const int TS = 4096;
+  const int BS = 128;
+  int numBlocks = divup(TS, BS);
+  if (!dev_random) {
+    dev_random = ams::ResourceManager::allocate<curandState>(4096);
+    srand_dev<<<numBlocks, BS>>>(dev_random, TS);
+  }
+
+  DBG(Device,
+      "Random Fill using %d blocks %d threads to randomly initialize %ld "
+      "elements",
+      numBlocks,
+      BS,
+      length);
+  fillRandom<<<numBlocks, BS>>>(predicate, TS, dev_random, length, threshold);
+  cudaDeviceSynchronize();
+}
 
-void device_compute_predicate( float *data, bool *predicate, size_t nData, const size_t kneigh, float threshold){
+void device_compute_predicate(float* data,
+                              bool* predicate,
+                              size_t nData,
+                              const size_t kneigh,
+                              float threshold)
+{
   const int NT = 256;
-  int NB = (nData + NT - 1 ) / NT;
+  int NB = (nData + NT - 1) / NT;
+  DBG(Device,
+      "Compute predicate for %ld elements with threshold %f",
+      nData,
+      threshold);
   compute_predicate<<<NB, NT>>>(data, predicate, nData, kneigh, threshold);
   cudaDeviceSynchronize();
 }
diff --git a/src/wf/resource_manager.hpp b/src/wf/resource_manager.hpp
index 3d99ddc4..b5db514e 100644
--- a/src/wf/resource_manager.hpp
+++ b/src/wf/resource_manager.hpp
@@ -141,7 +141,7 @@ PERFFASPECT()
   static TypeInValue* allocate(size_t nvalues, AMSResourceType dev = default_resource)
   {
     static auto& rm = umpire::ResourceManager::getInstance();
-    DBG(ResourceManager, "Requesting to allocate %ld values using allocator :%s %d", nvalues, getAllocatorName(dev));
+    DBG(ResourceManager, "Requesting to allocate %ld values using allocator :%s", nvalues, getAllocatorName(dev));
     auto alloc = rm.getAllocator(allocator_ids[dev]);
     TypeInValue *ret = static_cast<TypeInValue*>(alloc.allocate(nvalues * sizeof(TypeInValue)));
     CFATAL(ResourceManager, ret == nullptr,
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 268ba56d..b79a7890 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -24,5 +24,18 @@ endfunction()
 ADDTEST(ams_allocator ams_allocate.cpp AMSAllocate)
 ADDTEST(ams_packing cpu_packing_test.cpp AMSPack)
+if (WITH_TORCH)
 ADDTEST(ams_inference_double torch_model.cpp AMSInferDouble ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "double")
 ADDTEST(ams_inference_single torch_model.cpp AMSInferSingle ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "single")
+endif()
+
+if(WITH_FAISS)
+ADDTEST(ams_hdcache_mean_double test_hdcache.cpp AMSHDCacheMeanPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 0 10 4.0 4 5)
+# The max case fails on DEVICE. We should be aware about this when adding support for CI for GPUs
+ADDTEST(ams_hdcache_max_double test_hdcache.cpp AMSHDCacheMaxPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 1 10 4.0 4 5)
+
+ADDTEST(ams_hdcache_mean_single test_hdcache.cpp AMSHDCacheMeanPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 0 10 4.0 4 5)
+# The max case fails on DEVICE. We should be aware about this when adding support for CI for GPUs
+ADDTEST(ams_hdcache_max_single test_hdcache.cpp AMSHDCacheMaxPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 1 10 4.0 4 5)
+endif()
+
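For readers tracing the numbers in the ADDTEST lines into the test source: assuming ADDTEST forwards its trailing arguments unchanged after the device flag, the mean-policy double case boils down to a call like the following (the argument mapping is inferred from test_hdcache.cpp's argv parsing below; the AMSUQPolicy enumerator name is an assumption based on the "0:Mean" usage string):

// argv: use_device=0|1, faiss_debug.pt, "double", policy=0 (Mean),
//       nClusters=10, threshold=4.0, nDims=4, nElements=5
auto cache = HDCache<double>::getInstance(
    "faiss_debug.pt", use_device, /*uq_policy=*/AMSUQPolicy::FAISSMean,
    /*knbrs=*/10, /*threshold=*/4.0f);
bool ok = do_faiss<double>(cache, resource, /*nClusters=*/10,
                           /*nDims=*/4, /*nElements=*/5, /*threshold=*/4.0f);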
diff --git a/tests/faiss_debug.pt b/tests/faiss_debug.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7ea0c6eb5809890f632b40bd05c1374800999032
GIT binary patch
literal 16045
[16,045 bytes of base85-encoded binary FAISS index data omitted]
z>SLSC8aekd9jq{1u>qXdJmg|4InH=ufdnk5bKM5l7b~_Ew<6eRfvEy(UCufEWW|dH zWcs8PE2-b08&=FLW=C6kH1?4Ni^xm+u6B(3+k%Zh?P#^k2FooQ9+Kq^1}h2&+i|Lf z1s5OL@xGu1?xA*s_p_t)-*z0L=P6|Q)kiBT_o7~UJL*)k;dO5dPFJ+y>ktdlqHI`p zmAo{v!al=>Ax;~5@xD@wyI_DFD|g$lEr$)iap&|MY~$x7RvhD8%$RA%f$|nCA7h8T zsTIqr+fZ{ac`Ir|Z_ZG7ODlGqv7w0Fj;LWa9GuPBE@4HKjr%IuiUB1#V?qXb?FRSE zT(XdoVMQV4>q+1L)2ouvj%zKf$gTTJQ?ziq1Og?1AmkXRn_He*p z$FUz4Y^Xv$e%KI2%@+M`#olIG4ec-^shYs|->BNGsPK?nx@Xr+o&W(4V)OH7cZ{)XEc3^B12X38n zAS>C4UMC$$eC@>G0uFS1;lQ@pPUM~JK$4DedEeM<4peR7#H!z&C~?7w?sJ?-w(-8D z4lKRwM5P2LhHrF2-`a_|4i2oi=0Nf`Ck}pa;^PPhh8K6@#(!sGm@mVLTw5I|V{~9b zZ3o(%cH&2>1Fo{XZ>|$XMmg~5kONmgvDUvGh{pA!Hz!&}vL-uY4tJuD z!-4p(4kSEvAp2e?ww!k0HqYEW?!@LO2R0seAb)!&x?N{q>{q*-k2oh%%}!WZ?{2RX z=Bcc?z5}aYIS}`U11ZxT_)*k>kM*6XGtmh{B6Dwa;L2eq8cucKSp_GGuX3PQBL`X( za^l0EyqAxs3a{UEpx$ODH18a!VRIm~rUTCWjDLu^=+D2Ood|j3#Kj{{^!Ue#4C;}? z>cq=&PLvI2j8YEVT;@Rc{!T3CwbTvtZM734#EFh&oR~7ifjeED_{Ou1=}ASO6ZH}t zF!K7(Z+s4N;5nRFc*nutg`N0JZCbLXuY1S?`xwAFzk8gx$Lr6ZI#GI;1Cwt%@ot*~ zj~I8{PA4|jbK=4lCtjRkpV=LlvCn~;tfPIJ6Q>?IaBYbbIrB1qz=0>XsdrB&CXtIT zkLgK>1AR(3(SdyrTkk}~btk^xa$uZ=x)0;C+X;`6y|Vw}eI3v(aN_UY%s<+RtJJdO zb|>z{I8lvf+OUtp?CYP(4xD`9z+f^xirUmY!JG~!ma+FsKOCUKXw3Wv20F2lo=x55 zgu_hF7;`IGI#YoxUEoZd=e>Fd%nzL~UU5R@T$H%%z}W*lcZyo9bzqdAy|trutg|BP zXgk)4ueY42JI;Y69UQ1qzzHEM(TtWR*DIL|sbr%lMkY`$*HSv*1ibFt5-DijRW@6 z|FP#lPI|SFHLs_CEq2n!s!lXvPF*u6rVl4`_Z%=yalpno@-${$Z+L#S1E6*lc&1e* z&vzng=}uJc>%>jgdVQD^GvAOa6M4VyK+_LS+^4@s$;WZl`^e?Qw6adr$mPTd>eO|% z17+iRjoOD)BYTG&Sj2b>5}k;D>qLVq+zYM9I%lm)A18htb0Q@dXOvuJ{q2OmG+8?4 zK(QRGzqu28we)106KQ80=*!q;$l>%a)VQt#uV-@4u#cDjou_8;)cc;!fkuB(7siex zFO64{&sZni++$Ce-@`rfcSR?ZE#!5&6Z-}@kXy^WM4cv*ze1Oovm=>r=|Cs)9)6O~ z9i4c6l^jsl9gR6d?Dae6`yusSyMRpYaG*7NtU>KseeY^0!Kgbq) zFFTFBQsy4!rB`z=e98<)*BCrivDiRQ@cY@^ef+k5&*F zt?+YL1w&>jaOP5x(nrA=zF+dJf?m@U=rdKs`xNY7ref}172XsDh3yK4CaCz*LBUm? zcTH9Cv5|s>wG?QFDk$q$G1H@oq49H{8^G`4k>8&UBScwDk9FQX!2S`wG%2ny;1PK zf{O76m~*Lu8|CTM9R-7qs(2Z!;M`pmKXWi2@2SY%Kdn>Pmx^=c6a;rxaF4l1-&f%8 zrXqa;^;xOlL@@=!R??@#Dr%%Dc*@>pWvG~)PldEq(28|@<-N~;QJ?xMjxSPipsRwQ zKgb6)uaK@_5xvl`_BsX?xtK4$ql%?9RYX&>A=EU&s$y|N73H@pC^(5d{i~pBCj}Mx ze$%lEVnXTpPJYe0_nK6k{a3}N`U<);hjYG)sdLB~bG4v<-yiZU*+8rcXc+U0ii0{8 zf1g#+C`3i8)hfP~Q86r8!GS^wqRp&_JZ*SHU0N#W_)5i1);019eSDx`Gy5CmRgu8h ziyDy6-^p`#=48xnlgUaC1^Wi72&R{rWdAYQc|;vg(C0!ksLe%%-%C_HnMIDbDQHN} zj$9tM(DQHArV)}E2 zhEel41ubWjt2}(IRWPAAz1ghb7=3-7SH=;Tn~ zqh4$N)2^w4-qikeGcs&bF{Pn`k)>74yGn2BGA?^w!`Z1yt}@wIL>UEt^&~4Z6;yvt z-^!^tzmTkFuuk4zfcspEaJ-Gt}9^ zGsj_sf1(P-uc8y@sd-5S57^g-0t)&cQqha`@7u=x(1i28m4c@EROIn0D9*mOk5cf&uOPjUg2!7Ftp31xCzqW(3VxffpdaUG^a=%y z*~`5nDlU+_Go0-)x-9MyxTjXeCF(}SMRJs9@HgWi95__t3FQgXPVALK@hD{h<$^`K~X z5B7HQp#3lpz9qUbFVcfZKH?|35xd2M{L?)M-Q>ZS93H$`Ff26yS zx!Vo&^kCK{5B{F%#*BYhb1~j)^T1WagM`K&EG*;U-xEEs@QkUR2gN3PFus`wL)i1_ zLms#pciAvE|E|Ytb~nbZV~nyM)IQ7{9X&uZH$Dya;EL9bHyzyQzsZgA_1%~m#2ml5 z(TVjX@x6h4-FVQ*jh>G^xbe)x&&wFUqZ88a=8b!Lfy!d#vXIJ(d?cFi%Phm zYwE`4<8G9?<-r2#Q+mG}+Gsbb|K@@B9}hOK^5Fa%dK&P+|Ca|wAMdB%x%;`X>AnYH zr`(vr+@(%>F!;I~ds)u|_B^Sk2N8GOIA`+U>?JpH($ikuJlG{&JZ{wL#eC%JQ2~19bwge2 z#<&Y^lwtjcs(VnsAi3G-=EH+-Lp=E37_xYgd_HsIEnlDR>Or*!9vswr@Zy{Yg)=?) z_1c3K^~l3_5C7)tM)NOj+)Vc%UoQGR)`KS#$N+QR*z87jYFe3E9xv*~t1vhJJ%t+= z_mYzbZZxdwLI0;7{`&kdPc2giU(=g-AJO>u3^l%kMnTZgSV_VG}Ddu zaHDNkHyY<~BkwnI)WVHh^s)^#ze4_tvyNY62WL5lxjp#vjRzM>y782LUHh+Q3${&@3T$)7{v^Ij??? 
z^>T(Tu5=?Mi8ES*dC!pfyB;j5;KnYq8ymY&)5DzIYUDSXeUZ;+gFU!3kiD#-@9o_< zla1%te|>6LhB^HmJeYXOgS$L4XS^G8TyFl}N8MTTU)MYs7)F0*xv|^eLGxv-qn`&E z;(xQn+5gIXZ>ZfEp7BoM?&`?i&++_6 zH;jke=(vV^jvDR13^$l;e++!bW25q)Z1gW7Yqz5bWlFmF%p zldC1@`*{zJUL;oyxF^>coI-fg|eOvzaV4K^6 zHM`uH{(*DH*&WU~*?pR`801EwIUby1%@g`iOZpMl--9EZ-!cQ;sN0!Lk;_v1IiJ)y zJ;99tS^XHGW_{f_JC^hP!-ED(neVL|%h_u;>gKKDK@4kN&bW;lc~GAB)V6ZBRdVCt zYVL5(R;4D)(~x_5IOl-h2c~f+ePT@scRXm)&I89aH%^k*r*qvn#M~QyxX~qrn&$N3 z%Pk)gGrU;d$cLkLFYfrgs504yfj7Na?eU@P4Mi}}#* zs1Lp8_%QD`FBb3eVJ2&-@X&{l#y*VuLj8D8lhHl|FZbd55ibgs@Z$JbYA8PB)A-P* z7jw_?Vp(A?Ob5MqNlnMa`S4?|50!ad_rQnaH@x^X&xgm%S-6!Kp;r4IA^GpE^y;Rn3vznJIP*NDbmxT*2u z!altF?nA8zFE%Y@uRQ;wiVtradC~E_4|gw8V=_`?m=|jbc`=fG==b{&I)&#>v-g)i z{JGVOdSO0PcOM@q?8QEf7fZMMFxTqE2IkAY!Ha%p zd^kzp_tjtz6m#hj|arq{on*8$Psc=|%bxYER}4z4xI}Hy?uTc~P1(RGEIXN$?_> zkEV7XD%K&_WOnQwAJ(#le^ydg_S<$aIXmS=tp{E-eCWkr)U4iKFLKfoGr4y+^g(X= zkbKgMPGqQ<#f#bGs12EJIL(U_%xNjX86ck@b9#}F^R=EmzpcjEJxYDp->AAijBoBm zB{Ek0D(|P}UiPK>b{FbQX#oZVm9FKgc+20FaL)IGS#cOKuXC!OqY@OWY z#gi-_wp8|E*9tF&_u)S3=*89hlJA&oWMMru;|$NP$h~ow zwf^*C%pfm5bLKwwpqIAQOoM*r$D-%ZZh;@4z5)r + +#include +#include +#include +#include +#include +#include +#include +#include + +template +std::vector generate_vectors(const int num_clusters, + int elements, + int dims) +{ + std::vector v_data; + // This are fixed to mimic the way the faiss was generated + // The code below generates data values that are either within + // the distance of the faiss index or just outside of it. + const T distance = 10.0; + const T offset = 5.0; + for (int i = 0; i < dims; i++) { + T *data = ams::ResourceManager::allocate(num_clusters * elements, + AMSResourceType::HOST); + for (int j = 0; j < elements; j++) { + // Generate a value for every cluster center + for (int k = 0; k < num_clusters; k++) { + T tmp = ((T)rand()) / INT_MAX; + tmp += (k + 1) * num_clusters; + if ((j % 2) == 0) { + tmp += offset; + } + data[j * num_clusters + k] = tmp; + } + } + v_data.push_back(data); + } + return std::move(v_data); +} + +template +void print_vectors(std::vector &vec, int num_elements, int num_clusters) +{ + for (int i = 0; i < num_elements; i++) { + for (int c = 0; c < num_clusters; c++) { + for (auto v : vec) { + std::cout << v[i * num_clusters + c] << ":"; + } + std::cout << "\n"; + } + } +} + + +bool validate(const int num_clusters, const int elements, bool *predicates) +{ + bool res = true; + for (int j = 0; j < elements; j++) { + // Generate a value for every cluster center + for (int k = 0; k < num_clusters; k++) { + if (j % 2 == 0 && predicates[j * num_clusters + k] == true) { + res = false; + } else if (j % 2 == 1 && predicates[j * num_clusters + k] == false) { + res = false; + } + } + } + return res; +} + +template +bool do_faiss(std::shared_ptr> &index, + AMSResourceType resource, + int nClusters, + int nDims, + int nElements, + float threshold) +{ + + std::vector orig_data = + generate_vectors(nClusters, nElements, nDims); + std::vector data = orig_data; + + bool *predicates = + ams::ResourceManager::allocate(nClusters * nElements, resource); + + if (resource == AMSResourceType::DEVICE) { + for (int i = 0; i < orig_data.size(); i++) { + T *d_data = + ams::ResourceManager::allocate(nClusters * nElements, resource); + ams::ResourceManager::copy(const_cast(orig_data[i]), + d_data, + nClusters * nElements * sizeof(T)); + data[i] = d_data; + } + } + + + index->evaluate(nClusters * nElements, data, predicates); + + bool *h_predicates = predicates; + + if (resource == AMSResourceType::DEVICE) { + h_predicates = 
+        ams::ResourceManager::allocate<bool>(nClusters * nElements,
+                                             AMSResourceType::HOST);
+    ams::ResourceManager::copy(predicates, h_predicates, nClusters * nElements);
+    for (auto d : data) {
+      ams::ResourceManager::deallocate(const_cast<T *>(d),
+                                       AMSResourceType::DEVICE);
+    }
+    ams::ResourceManager::deallocate(predicates, AMSResourceType::DEVICE);
+  }
+
+
+  for (auto h_d : orig_data)
+    ams::ResourceManager::deallocate(const_cast<T *>(h_d),
+                                     AMSResourceType::HOST);
+
+  bool res = validate(nClusters, nElements, h_predicates);
+
+  ams::ResourceManager::deallocate(h_predicates, AMSResourceType::HOST);
+  return res;
+}
+
+
+int main(int argc, char *argv[])
+{
+  using namespace ams;
+
+  if (argc < 9) {
+    std::cerr << "Wrong CLI\n";
+    std::cerr << argv[0]
+              << " 'use device' 'path to faiss' 'data type (double|float)' "
+                 "'UQPolicy (0:Mean, 1:Max)' 'Num Clusters' 'Threshold' "
+                 "'number of dimensions' 'num elements'";
+    abort();
+  }
+  auto &rm = umpire::ResourceManager::getInstance();
+  int use_device = std::atoi(argv[1]);
+  char *faiss_path = argv[2];
+  char *data_type = argv[3];
+  AMSUQPolicy uq_policy = static_cast<AMSUQPolicy>(std::atoi(argv[4]));
+  int nClusters = std::atoi(argv[5]);
+  float threshold = std::atof(argv[6]);
+  int nDims = std::atoi(argv[7]);
+  int nElements = std::atoi(argv[8]);
+
+  AMSSetupAllocator(AMSResourceType::HOST);
+  AMSResourceType resource = AMSResourceType::HOST;
+  if (use_device == 1) {
+    AMSSetupAllocator(AMSResourceType::DEVICE);
+    AMSSetDefaultAllocator(AMSResourceType::DEVICE);
+    resource = AMSResourceType::DEVICE;
+  }
+
+  if (std::strcmp("double", data_type) == 0) {
+    std::shared_ptr<HDCache<double>> cache = HDCache<double>::getInstance(
+        faiss_path, use_device, uq_policy, 10, threshold);
+    bool result =
+        do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
+    cache.reset();
+    return !result;
+  } else if (std::strcmp("single", data_type) == 0) {
+    std::shared_ptr<HDCache<float>> cache = HDCache<float>::getInstance(
+        faiss_path, use_device, uq_policy, 10, threshold);
+    bool result =
+        do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
+    cache.reset();
+    return !result;
+  }
+
+
+  return 0;
+}
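How the self-check works, in numbers: generate_vectors centers cluster k near (k + 1) * num_clusters plus a uniform value in [0, 1), and pushes every even-indexed element a further offset = 5.0 away, past the 4.0 acceptance threshold used by the CMake tests. validate then demands exactly that alternation in the predicate vector (assuming the centroids baked into faiss_debug.pt match this generator, as the accompanying generate_faiss.py intends). A condensed restatement of the expected invariant:

#include <cassert>

// Expected predicate pattern checked by validate():
//   j even -> generated out of range -> predicates[j * nClusters + k] == false
//   j odd  -> generated in range     -> predicates[j * nClusters + k] == true
for (int j = 0; j < nElements; ++j)
  for (int k = 0; k < nClusters; ++k)
    assert(predicates[j * nClusters + k] == (j % 2 == 1));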