Skip to content

Commit

Permalink
Large two-level clustering (#2882)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #2882

A two level clustering version where the training data does not need to fit in RAM.

Reviewed By: algoriddle

Differential Revision: D44557021

fbshipit-source-id: 892d4fec4588eb33da6e7a82c15040f39426485e
  • Loading branch information
mdouze authored and facebook-github-bot committed May 31, 2023
1 parent 6fd0cb6 commit 90349f2
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 22 deletions.
51 changes: 32 additions & 19 deletions faiss/python/extra_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,17 +107,15 @@ def randn(n, seed=12345):
def checksum(a):
    """ compute a checksum for quick-and-dirty comparisons of arrays

    The array is viewed as raw bytes. For a 1-D array a single scalar
    checksum is returned; for a 2-D array one checksum per row is
    returned (as a uint64 array of size n).
    """
    a = a.view('uint8')
    if a.ndim == 1:
        # bug fix: original referenced undefined name `s` (NameError);
        # the intended array is `a` itself
        return bvec_checksum(a.size, swig_ptr(a))
    n, d = a.shape
    cs = np.zeros(n, dtype='uint64')
    bvecs_checksum(n, d, swig_ptr(a), swig_ptr(cs))
    return cs


rand_smooth_vectors_c = rand_smooth_vectors


def rand_smooth_vectors(n, d, seed=1234):
res = np.empty((n, d), dtype='float32')
rand_smooth_vectors_c(n, d, swig_ptr(res), seed)
Expand Down Expand Up @@ -422,7 +420,7 @@ def __init__(self, d, k, **kwargs):
including niter=25, verbose=False, spherical = False
"""
self.d = d
self.k = k
self.reset(k)
self.gpu = False
if "progressive_dim_steps" in kwargs:
self.cp = ProgressiveDimClusteringParameters()
Expand All @@ -437,7 +435,32 @@ def __init__(self, d, k, **kwargs):
# if this raises an exception, it means that it is a non-existent field
getattr(self.cp, k)
setattr(self.cp, k, v)
self.set_index()

def set_index(self):
    """Build the assignment index (or index factory) matching the
    current clustering parameters and GPU setting.

    For plain k-means this creates ``self.index`` (inner-product for
    spherical, L2 otherwise, optionally moved to GPU). For progressive-dim
    clustering it creates ``self.fac`` instead.
    """
    dim = self.d
    if self.cp.__class__ == ClusteringParameters:
        # flat k-means: pick the metric from the spherical flag
        index = IndexFlatIP(dim) if self.cp.spherical else IndexFlatL2(dim)
        if self.gpu:
            index = faiss.index_cpu_to_all_gpus(index, ngpu=self.gpu)
        self.index = index
    else:
        # progressive-dim clustering uses an index factory instead
        if self.gpu:
            self.fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
        else:
            self.fac = ProgressiveDimIndexFactory()

def reset(self, k=None):
    """Prepare the k-means object to perform a new clustering,
    possibly with a different number of centroids.

    Parameters
    ----------
    k : int, optional
        new number of centroids; when omitted the current ``self.k``
        is kept unchanged
    """
    if k is not None:
        self.k = int(k)
    # discard results of any previous training run
    self.centroids = self.obj = self.iteration_stats = None

def train(self, x, weights=None, init_centroids=None):
""" Perform k-means clustering.
Expand Down Expand Up @@ -476,24 +499,14 @@ def train(self, x, weights=None, init_centroids=None):
nc, d2 = init_centroids.shape
assert d2 == d
faiss.copy_array_to_vector(init_centroids.ravel(), clus.centroids)
if self.cp.spherical:
self.index = IndexFlatIP(d)
else:
self.index = IndexFlatL2(d)
if self.gpu:
self.index = faiss.index_cpu_to_all_gpus(self.index, ngpu=self.gpu)
clus.train(x, self.index, weights)
else:
# not supported for progressive dim
assert weights is None
assert init_centroids is None
assert not self.cp.spherical
clus = ProgressiveDimClustering(d, self.k, self.cp)
if self.gpu:
fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
else:
fac = ProgressiveDimIndexFactory()
clus.train(n, swig_ptr(x), fac)
clus.train(n, swig_ptr(x), self.fac)

centroids = faiss.vector_float_to_array(clus.centroids)

Expand Down
19 changes: 17 additions & 2 deletions faiss/utils/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -428,15 +428,30 @@ void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist) {
}
}

// Compute a checksum of n int32 values for quick-and-dirty array comparison.
// The span as displayed interleaved the old (size_t) and new (uint64_t)
// versions from the diff; this is the coherent post-change version.
uint64_t ivec_checksum(size_t n, const int32_t* asigned) {
    // view the data as unsigned so the multiply-accumulate wraps
    // with well-defined (mod 2^32 / 2^64) semantics
    const uint32_t* a = reinterpret_cast<const uint32_t*>(asigned);
    uint64_t cs = 112909;
    while (n--) {
        cs = cs * 65713 + a[n] * 1686049;
    }
    return cs;
}

// Checksum of n bytes: the 4-byte-aligned prefix is folded as int32
// words, then the 0-3 trailing bytes are mixed in one at a time.
uint64_t bvec_checksum(size_t n, const uint8_t* a) {
    uint64_t cs = ivec_checksum(n / 4, (const int32_t*)a);
    for (size_t i = n / 4 * 4; i < n; i++) {
        // bug fix: original read a[n] -- one past the end (UB) -- and
        // never incorporated the actual tail bytes; use a[i]
        cs = cs * 65713 + a[i] * 1686049;
    }
    return cs;
}

// Compute one checksum per row of an n x d uint8 matrix `a`, writing
// the n results into `cs`. Rows are independent, so the loop is
// parallelized when there are enough of them to amortize thread startup.
void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) {
// NOTE(review): unsigned (size_t) loop variable requires OpenMP >= 3.0;
// confirm against the oldest toolchain this builds with.
#pragma omp parallel for if (n > 1000)
    for (size_t i = 0; i < n; i++) {
        cs[i] = bvec_checksum(d, a + i * d);
    }
}

const float* fvecs_maybe_subsample(
size_t d,
size_t* n,
Expand Down
14 changes: 13 additions & 1 deletion faiss/utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,19 @@ int ivec_hist(size_t n, const int* v, int vmax, int* hist);
void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist);

/// compute a checksum on a table.
/// (stale duplicate `size_t ivec_checksum(...)` declaration from the
/// pre-change version removed; the return type is now uint64_t)
uint64_t ivec_checksum(size_t n, const int32_t* a);

/// compute a checksum on a table.
uint64_t bvec_checksum(size_t n, const uint8_t* a);

/** compute checksums for the rows of a matrix
 *
 * @param n      number of rows
 * @param d      size per row
 * @param a      matrix to handle, size n * d
 * @param cs     output checksums, size n
 */
void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs);

/** random subsamples a set of vectors if there are too many of them
*
Expand Down

0 comments on commit 90349f2

Please sign in to comment.