Skip to content

Commit

Permalink
Large two-level clustering (#2882)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #2882

A two level clustering version where the training data does not need to fit in RAM.

Reviewed By: algoriddle

Differential Revision: D44557021

fbshipit-source-id: 892d4fec4588eb33da6e7a82c15040f39426485e
  • Loading branch information
mdouze authored and facebook-github-bot committed May 31, 2023
1 parent 6fd0cb6 commit 90349f2
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 22 deletions.
51 changes: 32 additions & 19 deletions faiss/python/extra_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,17 +107,15 @@ def randn(n, seed=12345):
def checksum(a):
    """ compute a checksum for quick-and-dirty comparisons of arrays

    The array is viewed as raw bytes. For a 1-D array a single scalar
    checksum is returned; for a 2-D array one checksum per row is
    returned (as a uint64 array of size n).
    """
    a = a.view('uint8')
    if a.ndim == 1:
        # bug fix: original referenced undefined name `s` (NameError);
        # the intended array is `a` itself
        return bvec_checksum(a.size, swig_ptr(a))
    n, d = a.shape
    cs = np.zeros(n, dtype='uint64')
    bvecs_checksum(n, d, swig_ptr(a), swig_ptr(cs))
    return cs


rand_smooth_vectors_c = rand_smooth_vectors


def rand_smooth_vectors(n, d, seed=1234):
res = np.empty((n, d), dtype='float32')
rand_smooth_vectors_c(n, d, swig_ptr(res), seed)
Expand Down Expand Up @@ -422,7 +420,7 @@ def __init__(self, d, k, **kwargs):
including niter=25, verbose=False, spherical = False
"""
self.d = d
self.k = k
self.reset(k)
self.gpu = False
if "progressive_dim_steps" in kwargs:
self.cp = ProgressiveDimClusteringParameters()
Expand All @@ -437,7 +435,32 @@ def __init__(self, d, k, **kwargs):
# if this raises an exception, it means that it is a non-existent field
getattr(self.cp, k)
setattr(self.cp, k, v)
self.set_index()

def set_index(self):
    """Build the assignment index (or index factory) matching the
    current clustering parameters and GPU setting.

    For plain k-means this creates ``self.index`` (inner-product for
    spherical, L2 otherwise, optionally moved to GPU). For progressive-dim
    clustering it creates ``self.fac`` instead.
    """
    dim = self.d
    if self.cp.__class__ == ClusteringParameters:
        # flat k-means: pick the metric from the spherical flag
        index = IndexFlatIP(dim) if self.cp.spherical else IndexFlatL2(dim)
        if self.gpu:
            index = faiss.index_cpu_to_all_gpus(index, ngpu=self.gpu)
        self.index = index
    else:
        # progressive-dim clustering uses an index factory instead
        if self.gpu:
            self.fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
        else:
            self.fac = ProgressiveDimIndexFactory()

def reset(self, k=None):
    """Prepare the k-means object to perform a new clustering,
    possibly with a different number of centroids.

    Parameters
    ----------
    k : int, optional
        new number of centroids; when omitted the current ``self.k``
        is kept unchanged
    """
    if k is not None:
        self.k = int(k)
    # discard results of any previous training run
    self.centroids = self.obj = self.iteration_stats = None

def train(self, x, weights=None, init_centroids=None):
""" Perform k-means clustering.
Expand Down Expand Up @@ -476,24 +499,14 @@ def train(self, x, weights=None, init_centroids=None):
nc, d2 = init_centroids.shape
assert d2 == d
faiss.copy_array_to_vector(init_centroids.ravel(), clus.centroids)
if self.cp.spherical:
self.index = IndexFlatIP(d)
else:
self.index = IndexFlatL2(d)
if self.gpu:
self.index = faiss.index_cpu_to_all_gpus(self.index, ngpu=self.gpu)
clus.train(x, self.index, weights)
else:
# not supported for progressive dim
assert weights is None
assert init_centroids is None
assert not self.cp.spherical
clus = ProgressiveDimClustering(d, self.k, self.cp)
if self.gpu:
fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
else:
fac = ProgressiveDimIndexFactory()
clus.train(n, swig_ptr(x), fac)
clus.train(n, swig_ptr(x), self.fac)

centroids = faiss.vector_float_to_array(clus.centroids)

Expand Down
19 changes: 17 additions & 2 deletions faiss/utils/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -428,15 +428,30 @@ void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist) {
}
}

// Compute a checksum of n int32 values for quick-and-dirty array comparison.
// The span as displayed interleaved the old (size_t) and new (uint64_t)
// versions from the diff; this is the coherent post-change version.
uint64_t ivec_checksum(size_t n, const int32_t* asigned) {
    // view the data as unsigned so the multiply-accumulate wraps
    // with well-defined (mod 2^32 / 2^64) semantics
    const uint32_t* a = reinterpret_cast<const uint32_t*>(asigned);
    uint64_t cs = 112909;
    while (n--) {
        cs = cs * 65713 + a[n] * 1686049;
    }
    return cs;
}

// Checksum of n bytes: the 4-byte-aligned prefix is folded as int32
// words, then the 0-3 trailing bytes are mixed in one at a time.
uint64_t bvec_checksum(size_t n, const uint8_t* a) {
    uint64_t cs = ivec_checksum(n / 4, (const int32_t*)a);
    for (size_t i = n / 4 * 4; i < n; i++) {
        // bug fix: original read a[n] -- one past the end (UB) -- and
        // never incorporated the actual tail bytes; use a[i]
        cs = cs * 65713 + a[i] * 1686049;
    }
    return cs;
}

// Compute one checksum per row of an n x d uint8 matrix `a`, writing
// the n results into `cs`. Rows are independent, so the loop is
// parallelized when there are enough of them to amortize thread startup.
void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) {
// NOTE(review): unsigned (size_t) loop variable requires OpenMP >= 3.0;
// confirm against the oldest toolchain this builds with.
#pragma omp parallel for if (n > 1000)
    for (size_t i = 0; i < n; i++) {
        cs[i] = bvec_checksum(d, a + i * d);
    }
}

const float* fvecs_maybe_subsample(
size_t d,
size_t* n,
Expand Down
14 changes: 13 additions & 1 deletion faiss/utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,19 @@ int ivec_hist(size_t n, const int* v, int vmax, int* hist);
void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist);

/// compute a checksum on a table.
/// (stale duplicate `size_t ivec_checksum(...)` declaration from the
/// pre-change version removed; the return type is now uint64_t)
uint64_t ivec_checksum(size_t n, const int32_t* a);

/// compute a checksum on a table.
uint64_t bvec_checksum(size_t n, const uint8_t* a);

/** compute checksums for the rows of a matrix
 *
 * @param n      number of rows
 * @param d      size per row
 * @param a      matrix to handle, size n * d
 * @param cs     output checksums, size n
 */
void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs);

/** random subsamples a set of vectors if there are too many of them
*
Expand Down

0 comments on commit 90349f2

Please sign in to comment.