microsoft · harsha-simhadri · Jun 30, 2021 · Jul 1, 2021 · Jul 1, 2021 · Jul 5, 2021
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -18,7 +18,7 @@ else()
 endif()
 
 project(diskann)
-include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include/dll)
+include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include/tsl/include ${PROJECT_SOURCE_DIR}/include/dll ${PROJECT_SOURCE_DIR}/include/ols)
 
 #OpenMP
 find_package(OpenMP)
@@ -92,7 +92,7 @@ if(MSVC)
 	set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_SOURCE_DIR}/x64/Release)
 else()
 	set(ENV{TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD} 500000000000)
-    #	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG -O0 -fsanitize=address -fsanitize=leak -fsanitize=undefined")
+    #set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG -O0 -fsanitize=address -fsanitize=leak -fsanitize=undefined")
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG -Wall -Wextra")
 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -DNDEBUG -march=native -mtune=native -ftree-vectorize")
 	add_compile_options(-march=native -Wall -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fopenmp -fopenmp-simd -funroll-loops -Wfatal-errors -DUSE_ACCELERATED_PQ -DUSE_AVX2)

diff --git a/CompilerOptions.cmake b/CompilerOptions.cmake
@@ -0,0 +1,21 @@
+if(MSVC)
+	# Retarget all linker-flag families from x86 to x64. CMake regexes have no
+	# ignore-case flag, so each letter is matched via a two-character class.
+	foreach(_linkerFlagsVar
+			CMAKE_EXE_LINKER_FLAGS_INIT CMAKE_MODULE_LINKER_FLAGS_INIT
+			CMAKE_SHARED_LINKER_FLAGS_INIT CMAKE_STATIC_LINKER_FLAGS_INIT)
+		string(REGEX REPLACE "/[Mm][Aa][Cc][Hh][Ii][Nn][Ee]:[Xx]86" "/MACHINE:X64" ${_linkerFlagsVar} "${${_linkerFlagsVar}}")
+	endforeach()
+	unset(_linkerFlagsVar)
+	# Swap "Debug" for "Release" in the initial build type.
+	string(REGEX REPLACE "Debug" "Release" CMAKE_BUILD_TYPE_INIT "${CMAKE_BUILD_TYPE_INIT}")
+endif()
+
+# Print every CMake variable currently defined, as name=value, sorted by name.
+get_cmake_property(_definedVars VARIABLES)
+list(SORT _definedVars)
+list(REMOVE_DUPLICATES _definedVars)
+foreach(_definedVar IN LISTS _definedVars)
+	message(STATUS "${_definedVar}=${${_definedVar}}")
+endforeach()
+
diff --git a/LICENSE b/LICENSE
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
     DiskANN
 
     MIT License

diff --git a/README.md b/README.md
@@ -1,4 +1,164 @@
 # DiskANN
+## Linux build:
+
+Install the following packages through apt-get, and Intel MKL either by downloading the installer or using [apt](https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo) (we tested with build 2019.4-070).
+```
+sudo apt install cmake g++ libaio-dev libgoogle-perftools-dev clang-format-4.0
+```
+
+Build
+```
+mkdir build && cd build && cmake .. && make -j 
+```
+
+## Windows build:
+
+The Windows version has been tested with the enterprise editions of Visual Studio 2017 and Visual Studio 2019
+
+**Prerequisites:**
+
+* Install CMAKE (v3.15.2 or later) from https://cmake.org
+* Install MKL from https://software.intel.com/en-us/mkl
+* Download boost (v1.71.0, later versions are not tested) from boost.org 
+
+* Environment variables: 
+    * Set a new System environment variable, called INTEL_ROOT to the "windows" folder under your MKL installation
+	   (For instance, if your install folder is "C:\Program Files (x86)\IntelSWtools", set INTEL_ROOT to "C:\Program Files (x86)\IntelSWtools\compilers_and_libraries\windows")
+    * Set BOOST_ROOT to your boost download folder
+
+**Build steps:**
+-	Open a new developer command prompt
+-	Create a "build" directory under diskann
+-	Change to the "build" directory and run  
+```
+cmake -B. -A x64 ..
+```
+**Note: Since VS comes with its own (older) version of cmake, you have to specify the full path to cmake to ensure that the right version is used.**
+-	This will create a "diskann" solution file.
+-	Open the "diskann" solution and build the "diskpriority_io" and "nsg_dll" projects in order. 
+- 	Then build all the other binaries using the ALL_BUILD project that is part of the solution
+- 	Generated binaries are stored in the diskann/x64/Debug or diskann/x64/Release directories.
+
+To build from command line, change to the "build" directory and use msbuild to first build the "diskpriority_io" and "nsg_dll" projects. And then build the entire solution, as shown below.
+```
+msbuild src\dll\diskpriority_io.vcxproj
+msbuild src\dll\nsg_dll.vcxproj
+msbuild diskann.sln
+```
+Check msbuild docs for additional options including choosing between debug and release builds.
+
+
+## Usage:
+
+We now detail the main binaries using which one can build and search indices which reside in memory as well as SSD-resident indices.
+
+**Usage for SSD-based indices**
+===============================
+
+To generate an SSD-friendly index, use the `tests/build_disk_index` program. 
+----------------------------------------------------------------------------
+
+```
+./tests/build_disk_index  [data_type<float/int8/uint8>]  [data_file.bin]  [index_prefix_path]  [R]  [L]  [B]  [M]  [T]. 
+```
+
+The arguments are as follows:
+
+(i) data_type:  The datatype is the type of dataset you wish to build an index. We support byte indices (signed int8 or unsigned uint8) or float indices. 
+
+(ii) data_file: The input data over which to build an index, in .bin format. The first 4 bytes represent the number of points as an integer. The next 4 bytes represent the dimension of the data as an integer. The following n*d*sizeof(T) bytes contain the data, one data point at a time. sizeof(T) is 1 for byte indices, and 4 for float indices. This will be read by the program as int8_t for signed indices, uint8_t for unsigned indices or float for float indices.
+
+(iii) index_prefix_path: the index will generate a few files, all beginning with the specified prefix path. For example, if you provide ~/index_test as the prefix path, build  generates files such as ~/index_test_pq_pivots.bin, ~/index_test_pq_compressed.bin, ~/index_test_disk.index, etc. There may be between 8 and 10 files generated with this prefix depending on how we construct the index.
+
+(iv) R: the degree of our graph index, typically between 60 and 150. Again, larger values will result in bigger indices (with longer indexing times), but better search quality. Try to ensure that the L value is at least the R value unless you need to build indices really quickly, but can somewhat compromise on quality. 
+
+(v) L: the size of the search list we maintain during index building. Typical values are between 75 and 200. Larger values will take more time to build but result in indices that provide higher recall for the same search parameters.
+
+(vi) B: bound on the memory footprint of the index at search time. Once built, the index will use up only the specified RAM limit, the rest will reside on disk. This will dictate how aggressively we compress the data vectors to store in memory. Larger will yield better performance at search time.
+
+(vii) M: Limit on the memory allowed for building the index. If you specify a value less than what is required to build the index in one pass, the index is built using a divide and conquer approach so that sub-graphs will fit in the RAM budget. The sub-graphs are stitched together to build the overall index. This approach can be up to 1.5 times slower than building the index in one shot. Try to allocate as much memory as possible for index build as your RAM allows.
+
+(viii) T: number of threads used by the index build process. Since the code is highly parallel, the  indexing time improves almost linearly with the number of threads (subject to the cores available on the machine).
+
+To search the SSD-index, use the `tests/search_disk_index` program. 
+----------------------------------------------------------------------------
+
+```
+./tests/search_disk_index  [index_type<float/int8/uint8>]  [index_prefix_path]  [num_nodes_to_cache]  [num_threads]  [beamwidth (use 0 to optimize internally)]  [query_file.bin]  [truthset.bin (use "null" for none)]  [K]  [result_output_prefix]  [L1]  [L2] etc.
+```
+
+The arguments are as follows:
+
+(i) data type: same as (i) above in building index.
+
+(ii) index_prefix_path: same as (iii) above in building index.
+
+(iii) num_nodes_to_cache: our program stores the entire graph on disk. For faster search performance, we provide the support to cache a few nodes (which are closest to the starting point) in memory. 
+
+(iv) num_threads: search using specified number of threads in parallel, one thread per query. More will result in more IOs, so find the balance depending on the bandwidth of the SSD.
+
+(v) beamwidth: maximum number of IO requests each query will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query, but might result in slightly higher number of IO requests to SSD per query. Specifying 0 will optimize the beamwidth depending on the number of threads performing search.
+
+(vi) query_file.bin: search on these queries, same format as data file (ii) above. The query file must be the same type as specified in (i).
+
+(vii) truthset.bin file. Must be in the following format, or specify "null": n, the number of queries (4 bytes) followed by d, the number of ground truth elements per query (4 bytes), followed by n*d entries per query representing the d closest IDs per query in integer format,  followed by n*d entries representing the corresponding distances (float). Total file size is 8 + 4*n*d + 4*n*d. The groundtruth file, if not available, can be calculated using our program, tests/utils/compute_groundtruth. If you just want to measure the latency numbers of search and output the nearest neighbors without calculating recall, enter "null".
+
+(viii) K: measure recall@k, meaning the accuracy of retrieving top-k nearest neighbors.
+
+(ix) result output prefix: search results will be stored in files with specified prefix, in bin format.
+
+(x, xi, ...) various search_list sizes to perform search with. Larger will result in slower latencies, but higher accuracies. Must be at least the recall@ value in (viii).
+
+
+**Usage for in-memory indices**
+================================
+
+To generate index, use the `tests/build_memory_index` program. 
+--------------------------------------------------------------
+
+```
+./tests/build_memory_index  [data_type<int8/uint8/float>]  [data_file.bin]  [output_index_file]  [R]  [L]  [alpha]  [num_threads_to_use]
+```
+
+The arguments are as follows:
+
+(i) data_type: same as (i) above in building disk index.
+
+(ii) data_file: same as (ii) above in building disk index, the input data file in .bin format of type int8/uint8/float.
+
+(iii) output_index_file: memory index will be saved here.
+
+(iv) R: max degree of index: larger is typically better, range (50-150). Preferably ensure that L is at least R.
+
+(v) L: candidate_list_size for building index, larger is better (typical range: 75 to 200)
+
+(vi) alpha: float value which determines how dense our overall graph will be, and diameter will be log of n base alpha (roughly). Typical values are between 1 and 1.5. 1 will yield sparsest graph, 1.5 will yield denser graphs.
+
+(vii) number of threads to use: indexing uses specified number of threads.
+
+
+To search the generated index, use the `tests/search_memory_index` program:
+---------------------------------------------------------------------------
+
+```
+./tests/search_memory_index  [index_type<float/int8/uint8>]  [data_file.bin]  [memory_index_path]  [query_file.bin]  [truthset.bin (use "null" for none)] [K]  [result_output_prefix]  [L1]  [L2] etc. 
+```
+
+The arguments are as follows:
+
+(i) data type: same as (i) above in building index.
+
+(ii) memory_index_path: enter path of index built (argument (iii) above in building memory index).
+
+(iii) query_bin: search on these queries, same format as data file (ii) above. The query file must be the same type as specified in (i).
+
+(iv) Truthset file. Must be in the following format: n, the number of queries (4 bytes) followed by d, the number of ground truth elements per query (4 bytes), followed by n*d entries per query representing the d closest IDs per query in integer format,  followed by n*d entries representing the corresponding distances (float). Total file size is 8 + 4*n*d + 4*n*d. The groundtruth file, if not available, can be calculated using our program, tests/utils/compute_groundtruth.
+
+(v) K: search for recall@k, meaning accuracy of retrieving top-k nearest neighbors.
+
+(vi) result output prefix: will search and store the computed results in the files with specified prefix in bin format.
+
+(vii, viii, ...) various search_list sizes to perform search with. Larger will result in slower latencies, but higher accuracies. Must be at least the recall@ value in (v).
 
 The goal of the project is to build scalable, performant and cost-effective approximate nearest neighbor search algorithms.
 The initial release has the in-memory version of the [DiskANN paper](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf) published in NeurIPS 2019. 

diff --git a/dependencies/windows/dll/LIBEAY32.dll b/dependencies/windows/dll/LIBEAY32.dll
diff --git a/dependencies/windows/dll/SSLEAY32.dll b/dependencies/windows/dll/SSLEAY32.dll
diff --git a/dependencies/windows/dll/boost_date_time-vc141-mt-x64-1_70.dll b/dependencies/windows/dll/boost_date_time-vc141-mt-x64-1_70.dll
diff --git a/dependencies/windows/dll/cpprest_2_10.dll b/dependencies/windows/dll/cpprest_2_10.dll
diff --git a/dependencies/windows/dll/zlib1.dll b/dependencies/windows/dll/zlib1.dll
diff --git a/include/Neighbor_Tag.h b/include/Neighbor_Tag.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <cassert>
+#include <map>
+#include <shared_mutex>
+#include <sstream>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include "tsl/robin_set.h"
+
+#include "parameters.h"
+
+namespace diskann {
+
+  // A (tag, distance) pair. Ordering looks only at `dist`; equality looks
+  // only at `tag`.
+  template<typename TagT = int>
+  struct Neighbor_Tag {
+    TagT  tag;
+    float dist;
+
+    Neighbor_Tag() = default;
+    Neighbor_Tag(TagT tag, float dist) : tag{tag}, dist{dist} {
+    }
+
+    // Strictly-smaller distance sorts first; the tag is not consulted.
+    bool operator<(const Neighbor_Tag &rhs) const { return dist < rhs.dist; }
+    // Two entries are equal when their tags match, whatever their distances.
+    bool operator==(const Neighbor_Tag &rhs) const { return tag == rhs.tag; }
+  };
+}  // namespace diskann
diff --git a/include/aligned_file_reader.h b/include/aligned_file_reader.h
@@ -16,9 +16,10 @@ typedef io_context_t IOContext;
 #else
 #include <Windows.h>
 #include <minwinbase.h>
+#include <memory>
 
 #ifndef USE_BING_INFRA
-struct IOContext{
+struct IOContext {
   HANDLE                  fhandle = NULL;
   HANDLE                  iocp = NULL;
   std::vector<OVERLAPPED> reqs;
@@ -77,7 +78,7 @@ struct AlignedRead {
 class AlignedFileReader {
  protected:
   tsl::robin_map<std::thread::id, IOContext> ctx_map;
-  std::mutex ctx_mut;
+  std::mutex                                 ctx_mut;
 
  public:
   // returns the thread-specific context
@@ -91,9 +92,11 @@ class AlignedFileReader {
   // de-register thread-id for a context
   virtual void deregister_thread() = 0;
 
+  virtual void deregister_all_threads() = 0;
   // Open & close ops
   // Blocking calls
-  virtual void open(const std::string& fname) = 0;
+  virtual void open(const std::string& fname, bool enable_writes,
+                    bool enable_create) = 0;
   virtual void close() = 0;
 
   // process batch of aligned requests in parallel

diff --git a/include/ann_exception.h b/include/ann_exception.h
@@ -19,6 +19,7 @@ namespace diskann {
                                    unsigned int       lineNum);
 
     DISKANN_DLLEXPORT std::string message() const;
+    DISKANN_DLLEXPORT int         errorCode() const;
 
    private:
     int          _errorCode;

diff --git a/include/aux_utils.h b/include/aux_utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT license.
 
 #pragma once
@@ -27,23 +27,41 @@ typedef int FileHandle;
 
 #include "cached_io.h"
 #include "common_includes.h"
+#include "tsl/robin_set.h"
 #include "utils.h"
 #include "windows_customizations.h"
 
 namespace diskann {
-  const size_t   TRAINING_SET_SIZE = 1500000;
+
+  const size_t   MAX_PQ_TRAINING_SET_SIZE = 256000;
+  const size_t   MAX_SAMPLE_POINTS_FOR_WARMUP = 1000000;
+  const double   PQ_TRAINING_SET_FRACTION = 0.1;
   const double   SPACE_FOR_CACHED_NODES_IN_GB = 0.25;
   const double   THRESHOLD_FOR_CACHING_IN_GB = 1.0;
   const uint32_t NUM_NODES_TO_CACHE = 250000;
   const uint32_t WARMUP_L = 20;
 
-  template<typename T>
+  template<typename T, typename TagT>
   class PQFlashIndex;
 
+  DISKANN_DLLEXPORT double get_memory_budget(const std::string &mem_budget_str);
+  DISKANN_DLLEXPORT double get_memory_budget(double search_ram_budget_in_gb);
+  DISKANN_DLLEXPORT void   add_new_file_to_single_index(std::string index_file,
+                                                        std::string new_file);
+
+  DISKANN_DLLEXPORT size_t calculate_num_pq_chunks(double final_index_ram_limit,
+                                                   size_t points_num,
+                                                   uint32_t dim);
+
   DISKANN_DLLEXPORT double calculate_recall(
       unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
       unsigned *our_results, unsigned dim_or, unsigned recall_at);
 
+  DISKANN_DLLEXPORT double calculate_recall(
+      unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
+      unsigned *our_results, unsigned dim_or, unsigned recall_at,
+      const tsl::robin_set<unsigned> &active_tags);
+
   DISKANN_DLLEXPORT void read_idmap(const std::string &    fname,
                                     std::vector<unsigned> &ivecs);
 
@@ -70,26 +88,29 @@ namespace diskann {
 
   template<typename T>
   DISKANN_DLLEXPORT int build_merged_vamana_index(
-      std::string base_file, diskann::Metric _compareMetric, unsigned L,
-      unsigned R, double sampling_rate, double ram_budget,
-      std::string mem_index_path, std::string medoids_file,
-      std::string centroids_file);
+      std::string base_file, diskann::Metric _compareMetric,
+      bool single_index_file, unsigned L, unsigned R, double sampling_rate,
+      double ram_budget, std::string mem_index_path, std::string medoids_file,
+      std::string centroids_file, const char *tag_file = nullptr);
 
-  template<typename T>
+  template<typename T, typename TagT = uint32_t>
   DISKANN_DLLEXPORT uint32_t optimize_beamwidth(
-      std::unique_ptr<diskann::PQFlashIndex<T>> &_pFlashIndex, T *tuning_sample,
-      _u64 tuning_sample_num, _u64 tuning_sample_aligned_dim, uint32_t L,
-      uint32_t nthreads, uint32_t start_bw = 2);
+      std::unique_ptr<diskann::PQFlashIndex<T, TagT>> &_pFlashIndex,
+      T *tuning_sample, _u64 tuning_sample_num, _u64 tuning_sample_aligned_dim,
+      uint32_t L, uint32_t nthreads, uint32_t start_bw = 2);
 
-  template<typename T>
+  template<typename T, typename TagT = uint32_t>
   DISKANN_DLLEXPORT bool build_disk_index(const char *    dataFilePath,
                                           const char *    indexFilePath,
                                           const char *    indexBuildParameters,
-                                          diskann::Metric _compareMetric);
-
-  template<typename T>
-  DISKANN_DLLEXPORT void create_disk_layout(const std::string base_file,
-                                            const std::string mem_index_file,
-                                            const std::string output_file);
+                                          diskann::Metric _compareMetric,
+                                          bool            single_file_index,
+                                          const char *    tag_file = nullptr);
 
+  template<typename T, typename TagT = uint32_t>
+  DISKANN_DLLEXPORT void create_disk_layout(
+      const std::string &mem_index_file, const std::string &base_file,
+      const std::string &tag_file, const std::string &pq_pivots_file,
+      const std::string &pq_compressed_vectors_file, bool single_file_index,
+      const std::string &output_file);
 }  // namespace diskann