From bb4f2d22a2c64711a23df9b1592c3fb607fc0755 Mon Sep 17 00:00:00 2001
From: Konstantinos Parasyris
Date: Fri, 27 Oct 2023 18:00:12 -0700
Subject: [PATCH] Add tests for FAISS and release memory (#15)

* Add faiss-index test and fix faiss-gpu synchronization issue.

* Delete HDCache and free example memory

* Delete index only if loaded
---
 examples/main.cpp           |   9 +
 src/ml/hdcache.hpp          |  33 ++-
 src/wf/cuda/utilities.cuh   | 575 +++++++++++++++++++++---------------
 src/wf/resource_manager.hpp |   2 +-
 tests/CMakeLists.txt        |  13 +
 tests/faiss_debug.pt        | Bin 0 -> 16045 bytes
 tests/generate_faiss.py     |  73 +++++
 tests/test_hdcache.cpp      | 182 ++++++++++++
 8 files changed, 643 insertions(+), 244 deletions(-)
 create mode 100644 tests/faiss_debug.pt
 create mode 100644 tests/generate_faiss.py
 create mode 100644 tests/test_hdcache.cpp

diff --git a/examples/main.cpp b/examples/main.cpp
index e7ae3e87..4c6e740f 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -700,6 +700,15 @@ int main(int argc, char **argv)
     CALIPER(CALI_MARK_END("Cycle");)
     MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
   }
+
+  delete[] workflow;
+
+  // TODO: Add smart-pointers
+  for (int mat_idx = 0; mat_idx < num_mats; ++mat_idx) {
+    delete eoses[mat_idx];
+    eoses[mat_idx] = nullptr;
+  }
+
   CALIPER(CALI_MARK_END("TimeStepLoop"););
   MPI_CALL(MPI_Finalize());
   return 0;
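The hunk above frees the per-material EOS objects with explicit deletes and flags a TODO for smart pointers. A minimal sketch of what that TODO could become, assuming the surrounding code owns the objects in a plain array today (only `eoses`, `num_mats`, and the EOS type come from the hunk; the rest is hypothetical):

#include <memory>
#include <vector>

// Owning container: the unique_ptr destructors run when `eoses` goes out
// of scope at the end of main, making the explicit delete loop redundant.
std::vector<std::unique_ptr<EOS>> eoses;
for (int mat_idx = 0; mat_idx < num_mats; ++mat_idx)
  eoses.push_back(std::make_unique<EOS>(/* per-material setup */));
// Pass eoses[mat_idx].get() wherever a raw EOS* is still expected.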
diff --git a/src/ml/hdcache.hpp b/src/ml/hdcache.hpp
index a610fcff..cfaa9548 100644
--- a/src/ml/hdcache.hpp
+++ b/src/ml/hdcache.hpp
@@ -248,7 +248,15 @@ class HDCache
     return new_cache;
   }
 
-  ~HDCache() { DBG(Surrogate, "Destroying UQ-cache") }
+  ~HDCache()
+  {
+    DBG(UQModule, "Deleting UQ-Module");
+    if (m_index) {
+      DBG(UQModule, "Deleting HD-Cache");
+      m_index->reset();
+      delete m_index;
+    }
+  }
 
   //! ------------------------------------------------------------------------
   //! simple queries
@@ -400,6 +408,7 @@ class HDCache
     } else {
       _evaluate(ndata, data, is_acceptable);
     }
+    DBG(UQModule, "Done with evaluation of uq");
   }
 
   //! train on data that comes separate features (a vector of pointers)
@@ -431,6 +440,7 @@ class HDCache
       _evaluate(ndata, lin_data, is_acceptable);
       ams::ResourceManager::deallocate(lin_data, defaultRes);
     }
+    DBG(UQModule, "Done with evaluation of uq");
   }
 
 private:
@@ -529,28 +539,35 @@ class HDCache
     for (int start = 0; start < ndata; start += MAGIC_NUMBER) {
       unsigned int nElems =
           ((ndata - start) < MAGIC_NUMBER) ? ndata - start : MAGIC_NUMBER;
+      DBG(UQModule, "Running for %d elements %d %d", nElems, start, m_dim);
       m_index->search(nElems,
-                      &data[start],
+                      &data[start * m_dim],
                       knbrs,
                       &kdists[start * knbrs],
                       &kidxs[start * knbrs]);
     }
 
+#ifdef __ENABLE_CUDA__
+    faiss::gpu::synchronizeAllDevices();
+#endif
     // compute means
     if (defaultRes == AMSResourceType::HOST) {
-      TypeValue total_dist = 0;
       for (size_t i = 0; i < ndata; ++i) {
         CFATAL(UQModule,
                m_policy == AMSUQPolicy::DeltaUQ,
                "DeltaUQ is not supported yet");
         if (m_policy == AMSUQPolicy::FAISSMean) {
-          total_dist =
-              std::accumulate(kdists + i * knbrs, kdists + (i + 1) * knbrs, 0.);
-          is_acceptable[i] = (ook * total_dist) < acceptable_error;
+          TypeValue mean_dist = std::accumulate(kdists + i * knbrs,
+                                                kdists + (i + 1) * knbrs,
+                                                0.) *
+                                ook;
+          is_acceptable[i] = mean_dist < acceptable_error;
         } else if (m_policy == AMSUQPolicy::FAISSMax) {
          // Take the furthest cluster as the distance metric
-          total_dist = kdists[i * knbrs + knbrs - 1];
-          is_acceptable[i] = (total_dist) < acceptable_error;
+          TypeValue max_dist =
+              *std::max_element(&kdists[i * knbrs],
+                                &kdists[(i + 1) * knbrs]);
+          is_acceptable[i] = (max_dist) < acceptable_error;
        }
       }
     } else {
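For context, both FAISS policies reduce the knbrs nearest-neighbor distances of a query point to one score before comparing against acceptable_error: FAISSMean multiplies the summed distances by ook (one-over-k) to get the average, while FAISSMax keeps only the largest distance, so it accepts strictly fewer points. A small standalone sketch of that reduction (hypothetical helper, mirroring the _evaluate logic above):

#include <algorithm>
#include <numeric>

// kdists holds k distances per point, row-major, as filled by m_index->search.
// out[i] == true means point i is close enough to the index to trust the model.
void knn_acceptance(const double *kdists, size_t n, int k,
                    double acceptable_error, bool use_max, bool *out)
{
  for (size_t i = 0; i < n; ++i) {
    const double *row = &kdists[i * k];
    double d = use_max ? *std::max_element(row, row + k)         // FAISSMax
                       : std::accumulate(row, row + k, 0.) / k;  // FAISSMean
    out[i] = d < acceptable_error;
  }
}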
diff --git a/src/wf/cuda/utilities.cuh b/src/wf/cuda/utilities.cuh
index 76818857..7f10f26c 100644
--- a/src/wf/cuda/utilities.cuh
+++ b/src/wf/cuda/utilities.cuh
@@ -23,13 +23,9 @@
 const int warpSize = 32;
 const unsigned int fullMask = 0xffffffff;
 
-__host__ int divup(int x, int y) {
-    return (x + y - 1) / y;
-}
+__host__ int divup(int x, int y) { return (x + y - 1) / y; }
 
-__device__ __inline__ int pow2i(int e) {
-    return 1 << e;
-}
+__device__ __inline__ int pow2i(int e) { return 1 << e; }
 
 // Define this to turn on error checking
 #define CUDA_ERROR_CHECK
@@ -37,207 +33,247 @@ __device__ __inline__ int pow2i(int e) {
 #define CUDASAFECALL(err) __cudaSafeCall(err, __FILE__, __LINE__)
 #define CUDACHECKERROR() __cudaCheckError(__FILE__, __LINE__)
 
-inline void __cudaSafeCall(cudaError err, const char* file, const int line) {
+inline void __cudaSafeCall(cudaError err, const char* file, const int line)
+{
 #ifdef CUDA_ERROR_CHECK
-    if (cudaSuccess != err) {
-        fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n", file, line,
-                cudaGetErrorString(err));
-
-        fprintf(stdout, "cudaSafeCall() failed at %s:%i : %s\n", file, line,
-                cudaGetErrorString(err));
-        exit(-1);
-    }
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "cudaSafeCall() failed at %s:%i : %s\n",
+            file,
+            line,
+            cudaGetErrorString(err));
+
+    fprintf(stdout,
+            "cudaSafeCall() failed at %s:%i : %s\n",
+            file,
+            line,
+            cudaGetErrorString(err));
+    exit(-1);
+  }
 #endif
-    return;
+  return;
 }
 
 struct is_true {
-    __host__ __device__ bool operator()(const int x) { return x; }
+  __host__ __device__ bool operator()(const int x) { return x; }
 };
 
 struct is_false {
-    __host__ __device__ bool operator()(const int x) { return !x; }
+  __host__ __device__ bool operator()(const int x) { return !x; }
 };
 
-inline void __cudaCheckError(const char* file, const int line) {
+inline void __cudaCheckError(const char* file, const int line)
+{
 #ifdef CUDA_ERROR_CHECK
-    cudaError err = cudaGetLastError();
-    if (cudaSuccess != err) {
-        fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line,
-                cudaGetErrorString(err));
-        exit(-1);
-    }
-
-    // More careful checking. However, this will affect performance.
-    // Comment away if needed.
-    err = cudaDeviceSynchronize();
-    if (cudaSuccess != err) {
-        fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, line,
-                cudaGetErrorString(err));
-        exit(-1);
-    }
+  cudaError err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "cudaCheckError() failed at %s:%i : %s\n",
+            file,
+            line,
+            cudaGetErrorString(err));
+    exit(-1);
+  }
+
+  // More careful checking. However, this will affect performance.
+  // Comment away if needed.
+  err = cudaDeviceSynchronize();
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "cudaCheckError() with sync failed at %s:%i : %s\n",
+            file,
+            line,
+            cudaGetErrorString(err));
+    exit(-1);
+  }
 #endif
-    return;
+  return;
 }
 
-__global__ void srand_dev(curandState* states, const int total_threads) {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    if (id < total_threads) {
-        int seed = id; // different seed per thread
-        curand_init(seed, id, 0, &states[id]);
-    }
+__global__ void srand_dev(curandState* states, const int total_threads)
+{
+  int id = threadIdx.x + blockDim.x * blockIdx.x;
+  if (id < total_threads) {
+    int seed = id;  // different seed per thread
+    curand_init(seed, id, 0, &states[id]);
+  }
 }
 
-__global__ void initIndices(int* ind, int length) {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    if (id < length)
-        ind[id] = id;
+__global__ void initIndices(int* ind, int length)
+{
+  int id = threadIdx.x + blockDim.x * blockIdx.x;
+  if (id < length) ind[id] = id;
 }
 
 template <typename T>
-__global__ void fillRandom(bool* predicate, const int total_threads, curandState* states,
-                           const size_t length, T threshold) {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    if (id < total_threads) {
-        for (int i = id; i < length; i += total_threads) {
-            float x = curand_uniform(&states[id]);
-            predicate[i] = (x <= threshold);
-        }
-    }
+__global__ void fillRandom(bool* predicate,
+                           const int total_threads,
+                           curandState* states,
+                           const size_t length,
+                           T threshold)
+{
+  int id = threadIdx.x + blockDim.x * blockIdx.x;
+  if (id < total_threads) {
+    for (int i = id; i < length; i += total_threads) {
+      float x = curand_uniform(&states[id]);
+      predicate[i] = (x <= threshold);
    }
+  }
 }
 
 template <typename T>
-__global__ void computeBlockCounts(bool cond, T* d_input, int length, int* d_BlockCounts) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx < length) {
-        int pred = ( d_input[idx] == cond );
-        int BC = __syncthreads_count(pred);
-
-        if (threadIdx.x == 0) {
-            d_BlockCounts[blockIdx.x] =
-                BC; // BC will contain the number of valid elements in all threads of this thread block
-        }
-    }
+__global__ void computeBlockCounts(bool cond,
+                                   T* d_input,
+                                   int length,
+                                   int* d_BlockCounts)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < length) {
+    int pred = (d_input[idx] == cond);
+    int BC = __syncthreads_count(pred);
+
+    if (threadIdx.x == 0) {
+      d_BlockCounts[blockIdx.x] =
+          BC;  // BC will contain the number of valid elements in all threads of this thread block
+    }
+  }
 }
 
 template <typename T>
-__global__ void assignK(T** sparse, T** dense, int* indices, size_t length, int dims,
-                        bool isReverse) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx < length) {
-        int index = indices[idx];
-        if (!isReverse) {
-            for (int i = 0; i < dims; i++) {
-                dense[i][idx] = sparse[i][index];
-            }
-        } else {
-            for (int i = 0; i < dims; i++) {
-                sparse[i][index] = dense[i][idx];
-            }
-        }
-    }
+__global__ void assignK(T** sparse,
+                        T** dense,
+                        int* indices,
+                        size_t length,
+                        int dims,
+                        bool isReverse)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < length) {
+    int index = indices[idx];
+    if (!isReverse) {
+      for (int i = 0; i < dims; i++) {
+        dense[i][idx] = sparse[i][index];
+      }
+    } else {
+      for (int i = 0; i < dims; i++) {
+        sparse[i][index] = dense[i][idx];
+      }
+    }
+  }
 }
 
 template <typename T>
-__global__ void compactK(bool cond, T** d_input, T** d_output,
-                         const bool* predicates, const size_t length, int dims,
-                         int* d_BlocksOffset, bool reverse) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    extern __shared__ int warpTotals[];
-    if (idx < length) {
-        int pred = (predicates[idx] == cond);
-        int w_i = threadIdx.x / warpSize; //warp index
-        int w_l = idx % warpSize; //thread index within a warp
-
-        // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp
-        int t_m = fullMask >> (warpSize - w_l); //thread mask
-#if (CUDART_VERSION < 9000)
-        int b = __ballot(pred) & t_m; //ballot result = number whose ith bit
-                                      //is one if the ith's thread pred is true
-                                      //masked up to the current index in warp
-#else
-        int b = __ballot_sync(fullMask, pred) & t_m;
-#endif
-        int t_u = __popc(
-            b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
-
-        // last thread in warp computes total valid counts for the warp
-        if (w_l == warpSize - 1) {
-            warpTotals[w_i] = t_u + pred;
-        }
-
-        // need all warps in thread block to fill in warpTotals before proceeding
-        __syncthreads();
-
-        // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block
-        int numWarps = blockDim.x / warpSize;
-        unsigned int numWarpsMask = fullMask >> (warpSize - numWarps);
-        if (w_i == 0 && w_l < numWarps) {
-            int w_i_u = 0;
-            for ( int j = 0; j <= 5; j++) {
-#if (CUDART_VERSION < 9000)
-                int b_j = __ballot(warpTotals[w_l] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets
-#else
-                int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j));
-#endif
-                w_i_u += (__popc(b_j & t_m)) << j;
-            }
-            warpTotals[w_l] = w_i_u;
-        }
-
-        // need all warps in thread block to wait until prefix sum is calculated in warpTotals
-        __syncthreads();
-
-        // if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid
-        if (pred) {
-            if (!reverse) {
-                for (int i = 0; i < dims; i++)
-                    d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] =
-                        d_input[i][idx];
-            } else {
-                for (int i = 0; i < dims; i++)
-                    d_input[i][idx] =
-                        d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]];
-            }
-        }
-    }
-}
+__global__ void compactK(bool cond,
+                         T** d_input,
+                         T** d_output,
+                         const bool* predicates,
+                         const size_t length,
+                         int dims,
+                         int* d_BlocksOffset,
+                         bool reverse)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  extern __shared__ int warpTotals[];
+  if (idx < length) {
+    int pred = (predicates[idx] == cond);
+    int w_i = threadIdx.x / warpSize;  //warp index
+    int w_l = idx % warpSize;          //thread index within a warp
+
+    // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp
+    int t_m = fullMask >> (warpSize - w_l);  //thread mask
+#if (CUDART_VERSION < 9000)
+    int b = __ballot(pred) & t_m;  //ballot result = number whose ith bit
+                                   //is one if the ith thread's pred is true,
+                                   //masked up to the current index in warp
+#else
+    int b = __ballot_sync(fullMask, pred) & t_m;
+#endif
+    int t_u = __popc(
+        b);  // popc counts the one bits, i.e. the number predicated true BEFORE MY INDEX
+
+    // last thread in warp computes total valid counts for the warp
+    if (w_l == warpSize - 1) {
+      warpTotals[w_i] = t_u + pred;
+    }
+
+    // need all warps in thread block to fill in warpTotals before proceeding
+    __syncthreads();
+
+    // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block
+    int numWarps = blockDim.x / warpSize;
+    unsigned int numWarpsMask = fullMask >> (warpSize - numWarps);
+    if (w_i == 0 && w_l < numWarps) {
+      int w_i_u = 0;
+      for (int j = 0; j <= 5; j++) {
+#if (CUDART_VERSION < 9000)
+        int b_j = __ballot(
+            warpTotals[w_l] &
+            pow2i(j));  //# of the ones in the j'th digit of the warp offsets
+#else
+        int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j));
+#endif
+        w_i_u += (__popc(b_j & t_m)) << j;
+      }
+      warpTotals[w_l] = w_i_u;
+    }
+
+    // need all warps in thread block to wait until prefix sum is calculated in warpTotals
+    __syncthreads();
+
+    // if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid
+    if (pred) {
+      if (!reverse) {
+        for (int i = 0; i < dims; i++)
+          d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] =
+              d_input[i][idx];
+      } else {
+        for (int i = 0; i < dims; i++)
+          d_input[i][idx] =
+              d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]];
+      }
+    }
+  }
+}
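To make the warp-level arithmetic in compactK concrete: __ballot_sync gives every thread a 32-bit mask of which lanes hold valid elements, the per-lane mask t_m keeps only lanes below the current one, and __popc of the intersection is that lane's exclusive output offset. A worked example for one warp (values chosen purely for illustration):

// Suppose lanes 0..7 of a warp have pred = 1,0,1,1,0,0,1,0 (all higher lanes 0).
// __ballot_sync(fullMask, pred) = 0b01001101 = 0x4D      (bits 0, 2, 3, 6 set)
// For lane 6: t_m = fullMask >> (warpSize - 6) = 0b111111 (lanes 0..5 only)
// b = 0x4D & 0b111111 = 0b001101  ->  t_u = __popc(b) = 3
// So lane 6 writes its element to slot 3 within this warp's output range;
// the last lane then stores t_u + pred (= 4 valid lanes) into warpTotals.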
 
 template <typename TypeOutValue, typename TypeInValue>
-void __global__ linearizeK(TypeOutValue *output, const TypeInValue * const *inputs, size_t dims, size_t elements){
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if ( idx >= elements )
-        return;
-    for (int i = 0; i < dims; i++ ){
-        output[ idx * dims + i] = static_cast<TypeOutValue>(inputs[i][idx]);
-    }
+void __global__ linearizeK(TypeOutValue* output,
+                           const TypeInValue* const* inputs,
+                           size_t dims,
+                           size_t elements)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= elements) return;
+  for (int i = 0; i < dims; i++) {
+    output[idx * dims + i] = static_cast<TypeOutValue>(inputs[i][idx]);
+  }
 }
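linearizeK is the device-side layout change the FAISS index needs: AMS stores each feature as its own contiguous array (struct-of-arrays), while faiss::Index::search expects a single row-major buffer with dims values per point. A tiny illustration of the transform (2 features, 3 elements):

// inputs[0] = {x0, x1, x2}    inputs[1] = {y0, y1, y2}
// After linearizeK(output, inputs, /*dims=*/2, /*elements=*/3):
// output = {x0, y0, x1, y1, x2, y2}
// i.e. point i occupies output[i * dims .. i * dims + dims - 1]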
 
-void __global__ compute_predicate( float *data, bool *predicate, size_t nData, const size_t kneigh, float threshold){
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if ( idx >= nData )
-        return;
-
-    int index = idx * kneigh;
-    float acc = 0.0f;
-    for (int i = 0; i < kneigh; i++)
-        acc += data[index + i];
-    acc /= static_cast<float>(kneigh);
-
-    bool pred = acc < threshold ? true : false;
-
-    predicate[idx] = pred;
-}
+void __global__ compute_predicate(float* data,
+                                  bool* predicate,
+                                  size_t nData,
+                                  const size_t kneigh,
+                                  float threshold)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= nData) return;
+
+  int index = idx * kneigh;
+  float acc = 0.0f;
+  for (int i = 0; i < kneigh; i++) {
+    acc += data[index + i];
+  }
+
+  acc /= static_cast<float>(kneigh);
+
+  bool pred = acc < threshold ? true : false;
+
+  predicate[idx] = pred;
+}
 
 template <typename T>
@@ -248,107 +284,176 @@ int compact(bool cond,
             const size_t length,
             int dims,
             int blockSize,
-            bool isReverse = false) {
-    int numBlocks = divup(length, blockSize);
-    int* d_BlocksCount = ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::DEVICE);
-    int* d_BlocksOffset = ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::DEVICE);
-    // determine number of elements in the compacted list
-    int* h_BlocksCount = ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::HOST);
-    int* h_BlocksOffset = ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::HOST);
-
-    T** d_dense = ams::ResourceManager::allocate<T*>(dims, AMSResourceType::DEVICE);
-    T** d_sparse = ams::ResourceManager::allocate<T*>(dims, AMSResourceType::DEVICE);
-
-    ams::ResourceManager::registerExternal(dense, sizeof(T*) * dims, AMSResourceType::HOST);
-    ams::ResourceManager::registerExternal(sparse, sizeof(T*) * dims, AMSResourceType::HOST);
-    ams::ResourceManager::copy(dense, d_dense);
-    ams::ResourceManager::copy(const_cast<T**>(sparse), d_sparse);
-    thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
-    thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);
-
-    //phase 1: count number of valid elements in each thread block
-    computeBlockCounts<<<numBlocks, blockSize>>>(cond, dPredicate, length, d_BlocksCount);
-
-    //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid
-    thrust::exclusive_scan(thrust::device, d_BlocksCount, d_BlocksCount + numBlocks, d_BlocksOffset);
-
-    //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements
-    compactK<<<numBlocks, blockSize, sizeof(int) * (blockSize / warpSize)>>>( cond,
-        d_sparse, d_dense, dPredicate, length, dims, d_BlocksOffset, isReverse);
-
-    ams::ResourceManager::copy(d_BlocksCount, h_BlocksCount);
-    ams::ResourceManager::copy(d_BlocksOffset, h_BlocksOffset);
-    int compact_length = h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1];
-
-    ams::ResourceManager::deallocate(d_BlocksCount, AMSResourceType::DEVICE);
-    ams::ResourceManager::deallocate(d_BlocksOffset, AMSResourceType::DEVICE);
-
-    ams::ResourceManager::deallocate(h_BlocksCount, AMSResourceType::HOST);
-    ams::ResourceManager::deallocate(h_BlocksOffset, AMSResourceType::HOST);
-
-    ams::ResourceManager::deallocate(d_dense, AMSResourceType::DEVICE);
-    ams::ResourceManager::deallocate(d_sparse, AMSResourceType::DEVICE);
-
-    ams::ResourceManager::deregisterExternal(dense);
-    ams::ResourceManager::deregisterExternal(sparse);
-    cudaDeviceSynchronize();
-
-    return compact_length;
-}
+            bool isReverse = false)
+{
+  int numBlocks = divup(length, blockSize);
+  int* d_BlocksCount =
+      ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::DEVICE);
+  int* d_BlocksOffset =
+      ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::DEVICE);
+  // determine number of elements in the compacted list
+  int* h_BlocksCount =
+      ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::HOST);
+  int* h_BlocksOffset =
+      ams::ResourceManager::allocate<int>(numBlocks, AMSResourceType::HOST);
+
+  T** d_dense =
+      ams::ResourceManager::allocate<T*>(dims, AMSResourceType::DEVICE);
+  T** d_sparse =
+      ams::ResourceManager::allocate<T*>(dims, AMSResourceType::DEVICE);
+
+  ams::ResourceManager::registerExternal(dense,
+                                         sizeof(T*) * dims,
+                                         AMSResourceType::HOST);
+  ams::ResourceManager::registerExternal(sparse,
+                                         sizeof(T*) * dims,
+                                         AMSResourceType::HOST);
+  ams::ResourceManager::copy(dense, d_dense);
+  ams::ResourceManager::copy(const_cast<T**>(sparse), d_sparse);
+  thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
+  thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);
+
+  //phase 1: count number of valid elements in each thread block
+  computeBlockCounts<<<numBlocks, blockSize>>>(cond,
+                                               dPredicate,
+                                               length,
+                                               d_BlocksCount);
+
+  //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid
+  thrust::exclusive_scan(thrust::device,
+                         d_BlocksCount,
+                         d_BlocksCount + numBlocks,
+                         d_BlocksOffset);
+
+  //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements
+  compactK<<<numBlocks, blockSize, sizeof(int) * (blockSize / warpSize)>>>(
+      cond,
+      d_sparse,
+      d_dense,
+      dPredicate,
+      length,
+      dims,
+      d_BlocksOffset,
+      isReverse);
+
+  ams::ResourceManager::copy(d_BlocksCount, h_BlocksCount);
+  ams::ResourceManager::copy(d_BlocksOffset, h_BlocksOffset);
+  int compact_length =
+      h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1];
+
+  ams::ResourceManager::deallocate(d_BlocksCount, AMSResourceType::DEVICE);
+  ams::ResourceManager::deallocate(d_BlocksOffset, AMSResourceType::DEVICE);
+
+  ams::ResourceManager::deallocate(h_BlocksCount, AMSResourceType::HOST);
+  ams::ResourceManager::deallocate(h_BlocksOffset, AMSResourceType::HOST);
+
+  ams::ResourceManager::deallocate(d_dense, AMSResourceType::DEVICE);
+  ams::ResourceManager::deallocate(d_sparse, AMSResourceType::DEVICE);
+
+  ams::ResourceManager::deregisterExternal(dense);
+  ams::ResourceManager::deregisterExternal(sparse);
+  cudaDeviceSynchronize();
+
+  return compact_length;
+}
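The host wrapper above is the classic three-phase stream compaction: per-block counts, an exclusive scan over those counts to get each block's base output offset, then a scatter using the intra-block offsets computed in compactK. The scan step on its own, as a minimal standalone thrust sketch with illustrative values:

#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <vector>

// Per-block valid-element counts -> starting offset of each block's output.
// counts  = {3, 0, 5, 2}
// offsets = {0, 3, 3, 8}; compacted length = offsets[3] + counts[3] = 10
std::vector<int> h_counts{3, 0, 5, 2};
thrust::device_vector<int> counts(h_counts.begin(), h_counts.end());
thrust::device_vector<int> offsets(counts.size());
thrust::exclusive_scan(counts.begin(), counts.end(), offsets.begin());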
 
 template <typename T>
-int compact(bool cond, T** sparse, T** dense, int* indices, const size_t length, int dims, int blockSize,
-            const bool* dPredicate, bool isReverse = false) {
-    int numBlocks = divup(length, blockSize);
-    size_t sparseElements = length;
-
-    if (!isReverse) {
-        initIndices<<<numBlocks, blockSize>>>(indices, length);
-        if ( cond ){
-            auto last = thrust::copy_if(thrust::device, indices, indices + sparseElements, dPredicate,
-                                        indices, is_true());
-            sparseElements = last - indices;
-        }
-        else{
-            auto last = thrust::copy_if(thrust::device, indices, indices + sparseElements, dPredicate,
-                                        indices, is_false());
-            sparseElements = last - indices;
-        }
-    }
-
-    assignK<<<numBlocks, blockSize>>>(sparse, dense, indices, sparseElements, dims, isReverse);
-    cudaDeviceSynchronize();
-
-    return sparseElements;
-}
+int compact(bool cond,
+            T** sparse,
+            T** dense,
+            int* indices,
+            const size_t length,
+            int dims,
+            int blockSize,
+            const bool* dPredicate,
+            bool isReverse = false)
+{
+  int numBlocks = divup(length, blockSize);
+  size_t sparseElements = length;
+
+  if (!isReverse) {
+    initIndices<<<numBlocks, blockSize>>>(indices, length);
+    if (cond) {
+      auto last = thrust::copy_if(thrust::device,
+                                  indices,
+                                  indices + sparseElements,
+                                  dPredicate,
+                                  indices,
+                                  is_true());
+      sparseElements = last - indices;
+    } else {
+      auto last = thrust::copy_if(thrust::device,
+                                  indices,
+                                  indices + sparseElements,
+                                  dPredicate,
+                                  indices,
+                                  is_false());
+      sparseElements = last - indices;
+    }
+  }
+
+  assignK<<<numBlocks, blockSize>>>(
+      sparse, dense, indices, sparseElements, dims, isReverse);
+  cudaDeviceSynchronize();
+
+  return sparseElements;
+}
 
 template <typename TypeOutValue, typename TypeInValue>
-void device_linearize(TypeOutValue *output, const TypeInValue * const *inputs, size_t dims, size_t elements){
-    // TODO: Fix "magic number".
-    const int NT = 256;
-    // TODO: We should add a max number of blocks typically this should be around 3K.
-    int NB = (elements + NT - 1 ) / NT;
-    linearizeK<<<NB, NT>>>(output, inputs, dims, elements);
-    cudaDeviceSynchronize();
-}
+void device_linearize(TypeOutValue* output,
+                      const TypeInValue* const* inputs,
+                      size_t dims,
+                      size_t elements)
+{
+  // TODO: Fix "magic number".
+  const int NT = 256;
+  // TODO: We should add a max number of blocks; typically this should be around 3K.
+  int NB = (elements + NT - 1) / NT;
+  DBG(Device,
+      "Linearize using %d blocks %d threads to transpose a %ld x %ld matrix",
+      NB,
+      NT,
+      dims,
+      elements);
+  linearizeK<<<NB, NT>>>(output, inputs, dims, elements);
+  cudaDeviceSynchronize();
+}
 
 template <typename T>
-void cuda_rand_init(bool* predicate, const size_t length, T threshold) {
-    static curandState* dev_random = NULL;
-    const int TS = 4096;
-    const int BS = 128;
-    int numBlocks = divup(TS, BS);
-    if (!dev_random) {
-        dev_random = ams::ResourceManager::allocate<curandState>(4096);
-        srand_dev<<<numBlocks, BS>>>(dev_random, TS);
-    }
-
-    fillRandom<<<numBlocks, BS>>>(predicate, TS, dev_random, length, threshold);
-    cudaDeviceSynchronize();
-}
+void cuda_rand_init(bool* predicate, const size_t length, T threshold)
+{
+  static curandState* dev_random = NULL;
+  const int TS = 4096;
+  const int BS = 128;
+  int numBlocks = divup(TS, BS);
+  if (!dev_random) {
+    dev_random = ams::ResourceManager::allocate<curandState>(4096);
+    srand_dev<<<numBlocks, BS>>>(dev_random, TS);
+  }
+
+  DBG(Device,
+      "Random Fill using %d blocks %d threads to randomly initialize %ld "
+      "elements",
+      numBlocks,
+      BS,
+      length);
+  fillRandom<<<numBlocks, BS>>>(predicate, TS, dev_random, length, threshold);
+  cudaDeviceSynchronize();
+}
 
-void device_compute_predicate( float *data, bool *predicate, size_t nData, const size_t kneigh, float threshold){
+void device_compute_predicate(float* data,
+                              bool* predicate,
+                              size_t nData,
+                              const size_t kneigh,
+                              float threshold)
+{
   const int NT = 256;
-  int NB = (nData + NT - 1 ) / NT;
+  int NB = (nData + NT - 1) / NT;
+  DBG(Device,
+      "Compute predicate for %ld elements with threshold %f",
+      nData,
+      threshold);
   compute_predicate<<<NB, NT>>>(data, predicate, nData, kneigh, threshold);
   cudaDeviceSynchronize();
 }
diff --git a/src/wf/resource_manager.hpp b/src/wf/resource_manager.hpp
index 3d99ddc4..b5db514e 100644
--- a/src/wf/resource_manager.hpp
+++ b/src/wf/resource_manager.hpp
@@ -141,7 +141,7 @@ PERFFASPECT()
   static TypeInValue* allocate(size_t nvalues, AMSResourceType dev = default_resource)
   {
     static auto& rm = umpire::ResourceManager::getInstance();
-    DBG(ResourceManager, "Requesting to allocate %ld values using allocator :%s %d", nvalues, getAllocatorName(dev));
+    DBG(ResourceManager, "Requesting to allocate %ld values using allocator :%s", nvalues, getAllocatorName(dev));
     auto alloc = rm.getAllocator(allocator_ids[dev]);
     TypeInValue *ret = static_cast<TypeInValue*>(alloc.allocate(nvalues * sizeof(TypeInValue)));
     CFATAL(ResourceManager, ret == nullptr,
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 268ba56d..b79a7890 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -24,5 +24,18 @@ endfunction()
 ADDTEST(ams_allocator ams_allocate.cpp AMSAllocate)
 ADDTEST(ams_packing cpu_packing_test.cpp AMSPack)
+if (WITH_TORCH)
 ADDTEST(ams_inference_double torch_model.cpp AMSInferDouble ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "double")
 ADDTEST(ams_inference_single torch_model.cpp AMSInferSingle ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "single")
+endif()
+
+if(WITH_FAISS)
+ADDTEST(ams_hdcache_mean_double test_hdcache.cpp AMSHDCacheMeanPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 0 10 4.0 4 5)
+# The max case fails on DEVICE. We should be aware about this when adding support for CI for GPUs
+ADDTEST(ams_hdcache_max_double test_hdcache.cpp AMSHDCacheMaxPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 1 10 4.0 4 5)
+
+ADDTEST(ams_hdcache_mean_single test_hdcache.cpp AMSHDCacheMeanPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 0 10 4.0 4 5)
+# The max case fails on DEVICE. We should be aware about this when adding support for CI for GPUs
+ADDTEST(ams_hdcache_max_single test_hdcache.cpp AMSHDCacheMaxPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 1 10 4.0 4 5)
+endif()
+
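For readers tracing the numbers in the ADDTEST lines into the test source: assuming ADDTEST forwards its trailing arguments unchanged after the device flag, the mean-policy double case boils down to a call like the following (the argument mapping is inferred from test_hdcache.cpp's argv parsing below; the AMSUQPolicy enumerator name is an assumption based on the "0:Mean" usage string):

// argv: use_device=0|1, faiss_debug.pt, "double", policy=0 (Mean),
//       nClusters=10, threshold=4.0, nDims=4, nElements=5
auto cache = HDCache<double>::getInstance(
    "faiss_debug.pt", use_device, /*uq_policy=*/AMSUQPolicy::FAISSMean,
    /*knbrs=*/10, /*threshold=*/4.0f);
bool ok = do_faiss<double>(cache, resource, /*nClusters=*/10,
                           /*nDims=*/4, /*nElements=*/5, /*threshold=*/4.0f);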
diff --git a/tests/faiss_debug.pt b/tests/faiss_debug.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7ea0c6eb5809890f632b40bd05c1374800999032
GIT binary patch
literal 16045
[16,045 bytes of base85-encoded binary FAISS index data omitted]
z>SLSC8aekd9jq{1u>qXdJmg|4InH=ufdnk5bKM5l7b~_Ew<6eRfvEy(UCufEWW|dH zWcs8PE2-b08&=FLW=C6kH1?4Ni^xm+u6B(3+k%Zh?P#^k2FooQ9+Kq^1}h2&+i|Lf z1s5OL@xGu1?xA*s_p_t)-*z0L=P6|Q)kiBT_o7~UJL*)k;dO5dPFJ+y>ktdlqHI`p zmAo{v!al=>Ax;~5@xD@wyI_DFD|g$lEr$)iap&|MY~$x7RvhD8%$RA%f$|nCA7h8T zsTIqr+fZ{ac`Ir|Z_ZG7ODlGqv7w0Fj;LWa9GuPBE@4HKjr%IuiUB1#V?qXb?FRSE zT(XdoVMQV4>q+1L)2ouvj%zKf$gTTJQ?ziq1Og?1AmkXRn_He*p z$FUz4Y^Xv$e%KI2%@+M`#olIG4ec-^shYs|->BNGsPK?nx@Xr+o&W(4V)OH7cZ{)XEc3^B12X38n zAS>C4UMC$$eC@>G0uFS1;lQ@pPUM~JK$4DedEeM<4peR7#H!z&C~?7w?sJ?-w(-8D z4lKRwM5P2LhHrF2-`a_|4i2oi=0Nf`Ck}pa;^PPhh8K6@#(!sGm@mVLTw5I|V{~9b zZ3o(%cH&2>1Fo{XZ>|$XMmg~5kONmgvDUvGh{pA!Hz!&}vL-uY4tJuD z!-4p(4kSEvAp2e?ww!k0HqYEW?!@LO2R0seAb)!&x?N{q>{q*-k2oh%%}!WZ?{2RX z=Bcc?z5}aYIS}`U11ZxT_)*k>kM*6XGtmh{B6Dwa;L2eq8cucKSp_GGuX3PQBL`X( za^l0EyqAxs3a{UEpx$ODH18a!VRIm~rUTCWjDLu^=+D2Ood|j3#Kj{{^!Ue#4C;}? z>cq=&PLvI2j8YEVT;@Rc{!T3CwbTvtZM734#EFh&oR~7ifjeED_{Ou1=}ASO6ZH}t zF!K7(Z+s4N;5nRFc*nutg`N0JZCbLXuY1S?`xwAFzk8gx$Lr6ZI#GI;1Cwt%@ot*~ zj~I8{PA4|jbK=4lCtjRkpV=LlvCn~;tfPIJ6Q>?IaBYbbIrB1qz=0>XsdrB&CXtIT zkLgK>1AR(3(SdyrTkk}~btk^xa$uZ=x)0;C+X;`6y|Vw}eI3v(aN_UY%s<+RtJJdO zb|>z{I8lvf+OUtp?CYP(4xD`9z+f^xirUmY!JG~!ma+FsKOCUKXw3Wv20F2lo=x55 zgu_hF7;`IGI#YoxUEoZd=e>Fd%nzL~UU5R@T$H%%z}W*lcZyo9bzqdAy|trutg|BP zXgk)4ueY42JI;Y69UQ1qzzHEM(TtWR*DIL|sbr%lMkY`$*HSv*1ibFt5-DijRW@6 z|FP#lPI|SFHLs_CEq2n!s!lXvPF*u6rVl4`_Z%=yalpno@-${$Z+L#S1E6*lc&1e* z&vzng=}uJc>%>jgdVQD^GvAOa6M4VyK+_LS+^4@s$;WZl`^e?Qw6adr$mPTd>eO|% z17+iRjoOD)BYTG&Sj2b>5}k;D>qLVq+zYM9I%lm)A18htb0Q@dXOvuJ{q2OmG+8?4 zK(QRGzqu28we)106KQ80=*!q;$l>%a)VQt#uV-@4u#cDjou_8;)cc;!fkuB(7siex zFO64{&sZni++$Ce-@`rfcSR?ZE#!5&6Z-}@kXy^WM4cv*ze1Oovm=>r=|Cs)9)6O~ z9i4c6l^jsl9gR6d?Dae6`yusSyMRpYaG*7NtU>KseeY^0!Kgbq) zFFTFBQsy4!rB`z=e98<)*BCrivDiRQ@cY@^ef+k5&*F zt?+YL1w&>jaOP5x(nrA=zF+dJf?m@U=rdKs`xNY7ref}172XsDh3yK4CaCz*LBUm? zcTH9Cv5|s>wG?QFDk$q$G1H@oq49H{8^G`4k>8&UBScwDk9FQX!2S`wG%2ny;1PK zf{O76m~*Lu8|CTM9R-7qs(2Z!;M`pmKXWi2@2SY%Kdn>Pmx^=c6a;rxaF4l1-&f%8 zrXqa;^;xOlL@@=!R??@#Dr%%Dc*@>pWvG~)PldEq(28|@<-N~;QJ?xMjxSPipsRwQ zKgb6)uaK@_5xvl`_BsX?xtK4$ql%?9RYX&>A=EU&s$y|N73H@pC^(5d{i~pBCj}Mx ze$%lEVnXTpPJYe0_nK6k{a3}N`U<);hjYG)sdLB~bG4v<-yiZU*+8rcXc+U0ii0{8 zf1g#+C`3i8)hfP~Q86r8!GS^wqRp&_JZ*SHU0N#W_)5i1);019eSDx`Gy5CmRgu8h ziyDy6-^p`#=48xnlgUaC1^Wi72&R{rWdAYQc|;vg(C0!ksLe%%-%C_HnMIDbDQHN} zj$9tM(DQHArV)}E2 zhEel41ubWjt2}(IRWPAAz1ghb7=3-7SH=;Tn~ zqh4$N)2^w4-qikeGcs&bF{Pn`k)>74yGn2BGA?^w!`Z1yt}@wIL>UEt^&~4Z6;yvt z-^!^tzmTkFuuk4zfcspEaJ-Gt}9^ zGsj_sf1(P-uc8y@sd-5S57^g-0t)&cQqha`@7u=x(1i28m4c@EROIn0D9*mOk5cf&uOPjUg2!7Ftp31xCzqW(3VxffpdaUG^a=%y z*~`5nDlU+_Go0-)x-9MyxTjXeCF(}SMRJs9@HgWi95__t3FQgXPVALK@hD{h<$^`K~X z5B7HQp#3lpz9qUbFVcfZKH?|35xd2M{L?)M-Q>ZS93H$`Ff26yS zx!Vo&^kCK{5B{F%#*BYhb1~j)^T1WagM`K&EG*;U-xEEs@QkUR2gN3PFus`wL)i1_ zLms#pciAvE|E|Ytb~nbZV~nyM)IQ7{9X&uZH$Dya;EL9bHyzyQzsZgA_1%~m#2ml5 z(TVjX@x6h4-FVQ*jh>G^xbe)x&&wFUqZ88a=8b!Lfy!d#vXIJ(d?cFi%Phm zYwE`4<8G9?<-r2#Q+mG}+Gsbb|K@@B9}hOK^5Fa%dK&P+|Ca|wAMdB%x%;`X>AnYH zr`(vr+@(%>F!;I~ds)u|_B^Sk2N8GOIA`+U>?JpH($ikuJlG{&JZ{wL#eC%JQ2~19bwge2 z#<&Y^lwtjcs(VnsAi3G-=EH+-Lp=E37_xYgd_HsIEnlDR>Or*!9vswr@Zy{Yg)=?) z_1c3K^~l3_5C7)tM)NOj+)Vc%UoQGR)`KS#$N+QR*z87jYFe3E9xv*~t1vhJJ%t+= z_mYzbZZxdwLI0;7{`&kdPc2giU(=g-AJO>u3^l%kMnTZgSV_VG}Ddu zaHDNkHyY<~BkwnI)WVHh^s)^#ze4_tvyNY62WL5lxjp#vjRzM>y782LUHh+Q3${&@3T$)7{v^Ij??? 
z^>T(Tu5=?Mi8ES*dC!pfyB;j5;KnYq8ymY&)5DzIYUDSXeUZ;+gFU!3kiD#-@9o_< zla1%te|>6LhB^HmJeYXOgS$L4XS^G8TyFl}N8MTTU)MYs7)F0*xv|^eLGxv-qn`&E z;(xQn+5gIXZ>ZfEp7BoM?&`?i&++_6 zH;jke=(vV^jvDR13^$l;e++!bW25q)Z1gW7Yqz5bWlFmF%p zldC1@`*{zJUL;oyxF^>coI-fg|eOvzaV4K^6 zHM`uH{(*DH*&WU~*?pR`801EwIUby1%@g`iOZpMl--9EZ-!cQ;sN0!Lk;_v1IiJ)y zJ;99tS^XHGW_{f_JC^hP!-ED(neVL|%h_u;>gKKDK@4kN&bW;lc~GAB)V6ZBRdVCt zYVL5(R;4D)(~x_5IOl-h2c~f+ePT@scRXm)&I89aH%^k*r*qvn#M~QyxX~qrn&$N3 z%Pk)gGrU;d$cLkLFYfrgs504yfj7Na?eU@P4Mi}}#* zs1Lp8_%QD`FBb3eVJ2&-@X&{l#y*VuLj8D8lhHl|FZbd55ibgs@Z$JbYA8PB)A-P* z7jw_?Vp(A?Ob5MqNlnMa`S4?|50!ad_rQnaH@x^X&xgm%S-6!Kp;r4IA^GpE^y;Rn3vznJIP*NDbmxT*2u z!altF?nA8zFE%Y@uRQ;wiVtradC~E_4|gw8V=_`?m=|jbc`=fG==b{&I)&#>v-g)i z{JGVOdSO0PcOM@q?8QEf7fZMMFxTqE2IkAY!Ha%p zd^kzp_tjtz6m#hj|arq{on*8$Psc=|%bxYER}4z4xI}Hy?uTc~P1(RGEIXN$?_> zkEV7XD%K&_WOnQwAJ(#le^ydg_S<$aIXmS=tp{E-eCWkr)U4iKFLKfoGr4y+^g(X= zkbKgMPGqQ<#f#bGs12EJIL(U_%xNjX86ck@b9#}F^R=EmzpcjEJxYDp->AAijBoBm zB{Ek0D(|P}UiPK>b{FbQX#oZVm9FKgc+20FaL)IGS#cOKuXC!OqY@OWY z#gi-_wp8|E*9tF&_u)S3=*89hlJA&oWMMru;|$NP$h~ow zwf^*C%pfm5bLKwwpqIAQOoM*r$D-%ZZh;@4z5)r + +#include +#include +#include +#include +#include +#include +#include +#include + +template +std::vector generate_vectors(const int num_clusters, + int elements, + int dims) +{ + std::vector v_data; + // This are fixed to mimic the way the faiss was generated + // The code below generates data values that are either within + // the distance of the faiss index or just outside of it. + const T distance = 10.0; + const T offset = 5.0; + for (int i = 0; i < dims; i++) { + T *data = ams::ResourceManager::allocate(num_clusters * elements, + AMSResourceType::HOST); + for (int j = 0; j < elements; j++) { + // Generate a value for every cluster center + for (int k = 0; k < num_clusters; k++) { + T tmp = ((T)rand()) / INT_MAX; + tmp += (k + 1) * num_clusters; + if ((j % 2) == 0) { + tmp += offset; + } + data[j * num_clusters + k] = tmp; + } + } + v_data.push_back(data); + } + return std::move(v_data); +} + +template +void print_vectors(std::vector &vec, int num_elements, int num_clusters) +{ + for (int i = 0; i < num_elements; i++) { + for (int c = 0; c < num_clusters; c++) { + for (auto v : vec) { + std::cout << v[i * num_clusters + c] << ":"; + } + std::cout << "\n"; + } + } +} + + +bool validate(const int num_clusters, const int elements, bool *predicates) +{ + bool res = true; + for (int j = 0; j < elements; j++) { + // Generate a value for every cluster center + for (int k = 0; k < num_clusters; k++) { + if (j % 2 == 0 && predicates[j * num_clusters + k] == true) { + res = false; + } else if (j % 2 == 1 && predicates[j * num_clusters + k] == false) { + res = false; + } + } + } + return res; +} + +template +bool do_faiss(std::shared_ptr> &index, + AMSResourceType resource, + int nClusters, + int nDims, + int nElements, + float threshold) +{ + + std::vector orig_data = + generate_vectors(nClusters, nElements, nDims); + std::vector data = orig_data; + + bool *predicates = + ams::ResourceManager::allocate(nClusters * nElements, resource); + + if (resource == AMSResourceType::DEVICE) { + for (int i = 0; i < orig_data.size(); i++) { + T *d_data = + ams::ResourceManager::allocate(nClusters * nElements, resource); + ams::ResourceManager::copy(const_cast(orig_data[i]), + d_data, + nClusters * nElements * sizeof(T)); + data[i] = d_data; + } + } + + + index->evaluate(nClusters * nElements, data, predicates); + + bool *h_predicates = predicates; + + if (resource == AMSResourceType::DEVICE) { + h_predicates = 
+        ams::ResourceManager::allocate<bool>(nClusters * nElements,
+                                             AMSResourceType::HOST);
+    ams::ResourceManager::copy(predicates, h_predicates, nClusters * nElements);
+    for (auto d : data) {
+      ams::ResourceManager::deallocate(const_cast<T *>(d),
+                                       AMSResourceType::DEVICE);
+    }
+    ams::ResourceManager::deallocate(predicates, AMSResourceType::DEVICE);
+  }
+
+
+  for (auto h_d : orig_data)
+    ams::ResourceManager::deallocate(const_cast<T *>(h_d),
+                                     AMSResourceType::HOST);
+
+  bool res = validate(nClusters, nElements, h_predicates);
+
+  ams::ResourceManager::deallocate(h_predicates, AMSResourceType::HOST);
+  return res;
+}
+
+
+int main(int argc, char *argv[])
+{
+  using namespace ams;
+
+  if (argc < 9) {
+    std::cerr << "Wrong CLI\n";
+    std::cerr << argv[0]
+              << " 'use device' 'path to faiss' 'data type (double|float)' "
+                 "'UQPolicy (0:Mean, 1:Max)' 'Num Clusters' 'Threshold' "
+                 "'number of dimensions' 'num elements'";
+    abort();
+  }
+  auto &rm = umpire::ResourceManager::getInstance();
+  int use_device = std::atoi(argv[1]);
+  char *faiss_path = argv[2];
+  char *data_type = argv[3];
+  AMSUQPolicy uq_policy = static_cast<AMSUQPolicy>(std::atoi(argv[4]));
+  int nClusters = std::atoi(argv[5]);
+  float threshold = std::atof(argv[6]);
+  int nDims = std::atoi(argv[7]);
+  int nElements = std::atoi(argv[8]);
+
+  AMSSetupAllocator(AMSResourceType::HOST);
+  AMSResourceType resource = AMSResourceType::HOST;
+  if (use_device == 1) {
+    AMSSetupAllocator(AMSResourceType::DEVICE);
+    AMSSetDefaultAllocator(AMSResourceType::DEVICE);
+    resource = AMSResourceType::DEVICE;
+  }
+
+  if (std::strcmp("double", data_type) == 0) {
+    std::shared_ptr<HDCache<double>> cache = HDCache<double>::getInstance(
+        faiss_path, use_device, uq_policy, 10, threshold);
+    bool result =
+        do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
+    cache.reset();
+    return !result;
+  } else if (std::strcmp("single", data_type) == 0) {
+    std::shared_ptr<HDCache<float>> cache = HDCache<float>::getInstance(
+        faiss_path, use_device, uq_policy, 10, threshold);
+    bool result =
+        do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
+    cache.reset();
+    return !result;
+  }
+
+
+  return 0;
+}
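How the self-check works, in numbers: generate_vectors centers cluster k near (k + 1) * num_clusters plus a uniform value in [0, 1), and pushes every even-indexed element a further offset = 5.0 away, past the 4.0 acceptance threshold used by the CMake tests. validate then demands exactly that alternation in the predicate vector (assuming the centroids baked into faiss_debug.pt match this generator, as the accompanying generate_faiss.py intends). A condensed restatement of the expected invariant:

#include <cassert>

// Expected predicate pattern checked by validate():
//   j even -> generated out of range -> predicates[j * nClusters + k] == false
//   j odd  -> generated in range     -> predicates[j * nClusters + k] == true
for (int j = 0; j < nElements; ++j)
  for (int k = 0; k < nClusters; ++k)
    assert(predicates[j * nClusters + k] == (j % 2 == 1));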