Summary of this patch (text before the first "diff --git" line is ignored
by patch tools):

* extra_wrappers.py: checksum() now delegates to bvec_checksum() for 1-D
  input and to bvecs_checksum() (one checksum per row) for 2-D input.
  The index / index-factory construction is moved out of train() into a
  new set_index() method called at the end of __init__, and a new reset()
  method clears centroids, obj and iteration_stats (optionally changing k).
* utils.cpp / utils.h: ivec_checksum() returns uint64_t; bvec_checksum()
  and bvecs_checksum() are added.

Review corrections applied to the added lines:
* checksum(): the 1-D branch used the undefined name "s"; it must pass
  a.size.
* bvec_checksum(): the tail loop read a[n] (one past the end of the
  buffer) instead of a[i].
* ivec_checksum(): restored the template argument of reinterpret_cast.

diff --git a/faiss/python/extra_wrappers.py b/faiss/python/extra_wrappers.py
index d264767724..44802340d5 100644
--- a/faiss/python/extra_wrappers.py
+++ b/faiss/python/extra_wrappers.py
@@ -107,17 +107,15 @@ def randn(n, seed=12345):
 def checksum(a):
     """ compute a checksum for quick-and-dirty comparisons of arrays """
     a = a.view('uint8')
-    n = a.size
-    n4 = n & ~3
-    cs = ivec_checksum(int(n4 / 4), swig_ptr(a[:n4].view('int32')))
-    for i in range(n4, n):
-        cs += x[i] * 33657
+    if a.ndim == 1:
+        return bvec_checksum(a.size, swig_ptr(a))
+    n, d = a.shape
+    cs = np.zeros(n, dtype='uint64')
+    bvecs_checksum(n, d, swig_ptr(a), swig_ptr(cs))
     return cs
 
-
 rand_smooth_vectors_c = rand_smooth_vectors
 
-
 def rand_smooth_vectors(n, d, seed=1234):
     res = np.empty((n, d), dtype='float32')
     rand_smooth_vectors_c(n, d, swig_ptr(res), seed)
@@ -422,7 +420,7 @@ def __init__(self, d, k, **kwargs):
         including niter=25, verbose=False, spherical = False
         """
         self.d = d
-        self.k = k
+        self.reset(k)
         self.gpu = False
         if "progressive_dim_steps" in kwargs:
             self.cp = ProgressiveDimClusteringParameters()
@@ -437,7 +435,32 @@ def __init__(self, d, k, **kwargs):
                 # if this raises an exception, it means that it is a non-existent field
                 getattr(self.cp, k)
                 setattr(self.cp, k, v)
+        self.set_index()
+
+    def set_index(self):
+        d = self.d
+        if self.cp.__class__ == ClusteringParameters:
+            if self.cp.spherical:
+                self.index = IndexFlatIP(d)
+            else:
+                self.index = IndexFlatL2(d)
+            if self.gpu:
+                self.index = faiss.index_cpu_to_all_gpus(self.index, ngpu=self.gpu)
+        else:
+            if self.gpu:
+                fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
+            else:
+                fac = ProgressiveDimIndexFactory()
+            self.fac = fac
+
+    def reset(self, k=None):
+        """ prepare k-means object to perform a new clustering, possibly
+        with another number of centroids """
+        if k is not None:
+            self.k = int(k)
         self.centroids = None
+        self.obj = None
+        self.iteration_stats = None
 
     def train(self, x, weights=None, init_centroids=None):
         """ Perform k-means clustering.
@@ -476,12 +499,6 @@ def train(self, x, weights=None, init_centroids=None):
                 nc, d2 = init_centroids.shape
                 assert d2 == d
                 faiss.copy_array_to_vector(init_centroids.ravel(), clus.centroids)
-            if self.cp.spherical:
-                self.index = IndexFlatIP(d)
-            else:
-                self.index = IndexFlatL2(d)
-            if self.gpu:
-                self.index = faiss.index_cpu_to_all_gpus(self.index, ngpu=self.gpu)
             clus.train(x, self.index, weights)
         else:
             # not supported for progressive dim
@@ -489,11 +506,7 @@ def train(self, x, weights=None, init_centroids=None):
             assert init_centroids is None
             assert not self.cp.spherical
             clus = ProgressiveDimClustering(d, self.k, self.cp)
-            if self.gpu:
-                fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
-            else:
-                fac = ProgressiveDimIndexFactory()
-            clus.train(n, swig_ptr(x), fac)
+            clus.train(n, swig_ptr(x), self.fac)
 
         centroids = faiss.vector_float_to_array(clus.centroids)
 
diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp
index 2a80e373a7..9cc8d3fe45 100644
--- a/faiss/utils/utils.cpp
+++ b/faiss/utils/utils.cpp
@@ -428,15 +428,30 @@ void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist) {
     }
 }
 
-size_t ivec_checksum(size_t n, const int32_t* asigned) {
+uint64_t ivec_checksum(size_t n, const int32_t* asigned) {
     const uint32_t* a = reinterpret_cast<const uint32_t*>(asigned);
-    size_t cs = 112909;
+    uint64_t cs = 112909;
     while (n--) {
         cs = cs * 65713 + a[n] * 1686049;
     }
     return cs;
 }
 
+uint64_t bvec_checksum(size_t n, const uint8_t* a) {
+    uint64_t cs = ivec_checksum(n / 4, (const int32_t*)a);
+    for (size_t i = n / 4 * 4; i < n; i++) {
+        cs = cs * 65713 + a[i] * 1686049;
+    }
+    return cs;
+}
+
+void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) {
+#pragma omp parallel for if (n > 1000)
+    for (size_t i = 0; i < n; i++) {
+        cs[i] = bvec_checksum(d, a + i * d);
+    }
+}
+
 const float* fvecs_maybe_subsample(
         size_t d,
         size_t* n,
diff --git a/faiss/utils/utils.h b/faiss/utils/utils.h
index af7c677e39..373869b58f 100644
--- a/faiss/utils/utils.h
+++ b/faiss/utils/utils.h
@@ -121,7 +121,19 @@ int ivec_hist(size_t n, const int* v, int vmax, int* hist);
 void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist);
 
 /// compute a checksum on a table.
-size_t ivec_checksum(size_t n, const int32_t* a);
+uint64_t ivec_checksum(size_t n, const int32_t* a);
+
+/// compute a checksum on a table.
+uint64_t bvec_checksum(size_t n, const uint8_t* a);
+
+/** compute checksums for the rows of a matrix
+ *
+ * @param n   number of rows
+ * @param d   size per row
+ * @param a   matrix to handle, size n * d
+ * @param cs  output checksums, size n
+ */
+void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs);
 
 /** random subsamples a set of vectors if there are too many of them
  *