Search disk index fix (#20)

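* Change the default cache behavior in search_disk_index (the cache list is now generated from sample queries instead of BFS levels); change the scale factor used in the float-to-int8 conversion (256 -> 254) * Added the int8_to_float_scale utility file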
microsoft · Sep 19, 2021 · 10c1b3a · 10c1b3a
1 parent 3e7d511
commit 10c1b3a
Show file tree

Hide file tree

Showing 4 changed files with 75 additions and 4 deletions.
diff --git a/tests/search_disk_index.cpp b/tests/search_disk_index.cpp
@@ -149,9 +149,9 @@ int search_disk_index(int argc, char** argv) {
   std::vector<uint32_t> node_list;
   diskann::cout << "Caching " << num_nodes_to_cache
                 << " BFS nodes around medoid(s)" << std::endl;
-  _pFlashIndex->cache_bfs_levels(num_nodes_to_cache, node_list);
-  //  _pFlashIndex->generate_cache_list_from_sample_queries(
-  //      warmup_query_file, 15, 6, num_nodes_to_cache, num_threads, node_list);
+  //_pFlashIndex->cache_bfs_levels(num_nodes_to_cache, node_list);
+  _pFlashIndex->generate_cache_list_from_sample_queries(
+       warmup_query_file, 15, 6, num_nodes_to_cache, num_threads, node_list);
   _pFlashIndex->load_cache_list(node_list);
   node_list.clear();
   node_list.shrink_to_fit();

diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt
@@ -47,6 +47,15 @@ else()
 	target_link_libraries(int8_to_float ${PROJECT_NAME})
 endif()
 
+# int8_to_float_scale: executable built from int8_to_float_scale.cpp.
+# On MSVC it is linked for x64 against diskann_dll.lib taken from the debug or
+# release library output directory (matching the build configuration);
+# on other toolchains it links against ${PROJECT_NAME}.
+add_executable(int8_to_float_scale int8_to_float_scale.cpp)
+if(MSVC)
+	target_link_options(int8_to_float_scale PRIVATE /MACHINE:x64)
+	target_link_libraries(int8_to_float_scale debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib)
+	target_link_libraries(int8_to_float_scale optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
+else()
+	target_link_libraries(int8_to_float_scale ${PROJECT_NAME})
+endif()
+
 add_executable(uint8_to_float uint8_to_float.cpp)
 if(MSVC)
 	target_link_options(uint8_to_float PRIVATE /MACHINE:x64)

diff --git a/tests/utils/float_bin_to_int8.cpp b/tests/utils/float_bin_to_int8.cpp
@@ -12,7 +12,7 @@ void block_convert(std::ofstream& writer, int8_t* write_buf,
   for (_u64 i = 0; i < npts; i++) {
     for (_u64 d = 0; d < ndims; d++) {
       write_buf[d + i * ndims] =
-          (int8_t)((read_buf[d + i * ndims] - bias) * (256.0 / scale));
+          (int8_t)((read_buf[d + i * ndims] - bias) * (254.0 / scale));
     }
   }
   writer.write((char*) write_buf, npts * ndims);

diff --git a/tests/utils/int8_to_float_scale.cpp b/tests/utils/int8_to_float_scale.cpp
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include "utils.h"
+
+// Reads npts * ndims int8 values from `reader` into `read_buf`, converts each
+// one to float as (value - bias) * scale into the matching slot of
+// `write_buf`, and then writes the npts * ndims floats to `writer`.
+// Both buffers must be able to hold at least npts * ndims elements.
+void block_convert(std::ofstream& writer, float* write_buf,
+                   std::ifstream& reader, int8_t* read_buf, _u64 npts,
+                   _u64 ndims, float bias, float scale) {
+  // The row-major index d + i * ndims visits every slot in [0, npts * ndims)
+  // exactly once and in increasing order, so a single flat loop is equivalent.
+  const _u64 num_values = npts * ndims;
+
+  reader.read(reinterpret_cast<char*>(read_buf), num_values * sizeof(int8_t));
+
+  for (_u64 idx = 0; idx < num_values; ++idx) {
+    const float shifted = static_cast<float>(read_buf[idx]) - bias;
+    write_buf[idx] = shifted * scale;
+  }
+
+  writer.write(reinterpret_cast<char*>(write_buf), num_values * sizeof(float));
+}
+
+// Converts an int8 .bin file into a float .bin file.
+//
+// Command line: input-int8.bin  output-float.bin  bias  scale
+//
+// The input begins with two 32-bit values (number of points, number of
+// dimensions) followed by the int8 payload. The same two header values are
+// copied to the output, followed by the converted floats. Points are processed
+// in blocks of at most 131072, and each value is transformed by
+// block_convert() using the supplied bias and scale.
+//
+// Returns 0 on success and -1 on a usage error or an I/O failure.
+int main(int argc, char** argv) {
+  if (argc != 5) {
+    std::cout << "Usage: " << argv[0] << "  input-int8.bin  output-float.bin  bias  scale"
+              << std::endl;
+    return -1;
+  }
+
+  std::ifstream reader(argv[1], std::ios::binary);
+  if (!reader.is_open()) {
+    std::cerr << "Error: could not open input file " << argv[1] << std::endl;
+    return -1;
+  }
+
+  // Header: point count and dimension count, each stored as a 32-bit value.
+  _u32 npts_u32 = 0;
+  _u32 ndims_u32 = 0;
+  reader.read(reinterpret_cast<char*>(&npts_u32), sizeof(npts_u32));
+  reader.read(reinterpret_cast<char*>(&ndims_u32), sizeof(ndims_u32));
+  if (!reader) {
+    // Without a valid header the sizes below would be garbage.
+    std::cerr << "Error: could not read header from " << argv[1] << std::endl;
+    return -1;
+  }
+  const _u64 npts = npts_u32;
+  const _u64 ndims = ndims_u32;
+  std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
+            << std::endl;
+
+  const _u64 blk_size = 131072;
+  const _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
+
+  std::ofstream writer(argv[2], std::ios::binary);
+  if (!writer.is_open()) {
+    std::cerr << "Error: could not open output file " << argv[2] << std::endl;
+    return -1;
+  }
+
+  // No block ever holds more than min(npts, blk_size) points, so the buffers
+  // do not need to be sized for a full block when the dataset is small.
+  const _u64           buf_pts = std::min(npts, blk_size);
+  std::vector<int8_t>  read_buf(buf_pts * ndims);
+  std::vector<float>   write_buf(buf_pts * ndims);
+  const float          bias = static_cast<float>(std::atof(argv[3]));
+  const float          scale = static_cast<float>(std::atof(argv[4]));
+
+  writer.write(reinterpret_cast<const char*>(&npts_u32), sizeof(npts_u32));
+  writer.write(reinterpret_cast<const char*>(&ndims_u32), sizeof(ndims_u32));
+
+  for (_u64 i = 0; i < nblks; i++) {
+    // The last block may be partial.
+    const _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
+    block_convert(writer, write_buf.data(), reader, read_buf.data(), cblk_size,
+                  ndims, bias, scale);
+    if (!reader) {
+      // A short read means the payload is smaller than the header promised.
+      std::cerr << "Error: input file ended before block #" << i
+                << " could be read completely" << std::endl;
+      return -1;
+    }
+    if (!writer) {
+      std::cerr << "Error: failed to write block #" << i << " to " << argv[2]
+                << std::endl;
+      return -1;
+    }
+    std::cout << "Block #" << i << " written" << std::endl;
+  }
+
+  // Close explicitly so that a failure while flushing the output is reported.
+  writer.close();
+  reader.close();
+  if (!writer) {
+    std::cerr << "Error: failed to finish writing " << argv[2] << std::endl;
+    return -1;
+  }
+  return 0;
+}