diff --git a/faiss/impl/AdditiveQuantizer.cpp b/faiss/impl/AdditiveQuantizer.cpp
index 42d37f32a9..64e861ab1d 100644
--- a/faiss/impl/AdditiveQuantizer.cpp
+++ b/faiss/impl/AdditiveQuantizer.cpp
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 
 extern "C" {
diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp
index 528843f606..9266392ab8 100644
--- a/faiss/impl/ScalarQuantizer.cpp
+++ b/faiss/impl/ScalarQuantizer.cpp
@@ -15,6 +15,8 @@
 #include
 #include
 
+#include
+
 #ifdef __SSE__
 #include
 #endif
@@ -78,10 +80,9 @@ struct Codec8bit {
             int i) {
         return (code[i] + 0.5f) / 255.0f;
     }
-
-#ifdef __AVX2__
-    static FAISS_ALWAYS_INLINE __m256
+    static FAISS_ALWAYS_INLINE simd8float32
     decode_8_components(const uint8_t* code, int i) {
+#ifdef __AVX2__
         const uint64_t c8 = *(uint64_t*)(code + i);
 
         const __m128i i8 = _mm_set1_epi64x(c8);
@@ -89,22 +90,19 @@ struct Codec8bit {
         const __m256 f8 = _mm256_cvtepi32_ps(i32);
         const __m256 half_one_255 = _mm256_set1_ps(0.5f / 255.f);
         const __m256 one_255 = _mm256_set1_ps(1.f / 255.f);
-        return _mm256_fmadd_ps(f8, one_255, half_one_255);
-    }
+        return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255));
 #endif
 
 #ifdef __aarch64__
-    static FAISS_ALWAYS_INLINE float32x4x2_t
-    decode_8_components(const uint8_t* code, int i) {
         float32_t result[8] = {};
         for (size_t j = 0; j < 8; j++) {
             result[j] = decode_component(code, i + j);
         }
         float32x4_t res1 = vld1q_f32(result);
         float32x4_t res2 = vld1q_f32(result + 4);
-        return {res1, res2};
-    }
+        return simd8float32({res1, res2});
 #endif
+    }
 };
 
 struct Codec4bit {
@@ -122,7 +120,7 @@
     }
 
 #ifdef __AVX2__
-    static FAISS_ALWAYS_INLINE __m256
+    static FAISS_ALWAYS_INLINE simd8float32
     decode_8_components(const uint8_t* code, int i) {
         uint32_t c4 = *(uint32_t*)(code + (i >> 1));
         uint32_t mask = 0x0f0f0f0f;
@@ -140,7 +138,7 @@
         __m256 half = _mm256_set1_ps(0.5f);
         f8 = _mm256_add_ps(f8, half);
         __m256 one_255 = _mm256_set1_ps(1.f / 15.f);
-        return _mm256_mul_ps(f8, one_255);
+        return simd8float32(_mm256_mul_ps(f8, one_255));
     }
 #endif
@@ -211,7 +209,7 @@
     /* Load 6 bytes that represent 8 6-bit values, return them as a
      * 8*32 bit vector register */
-    static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) {
+    static FAISS_ALWAYS_INLINE simd8uint32 load6(const uint16_t* code16) {
         const __m128i perm = _mm_set_epi8(
                 -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 1, 0, -1, 0);
         const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0);
@@ -227,10 +225,10 @@
         // shift and mask out useless bits
         __m256i c4 = _mm256_srlv_epi32(c3, shifts);
         __m256i c5 = _mm256_and_si256(_mm256_set1_epi32(63), c4);
-        return c5;
+        return simd8uint32(c5);
     }
 
-    static FAISS_ALWAYS_INLINE __m256
+    static FAISS_ALWAYS_INLINE simd8float32
     decode_8_components(const uint8_t* code, int i) {
         // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here
         // // for the reference, maybe, it becomes used oned day.
@@ -245,13 +243,13 @@ struct Codec6bit {
         // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f);
         // return _mm256_fmadd_ps(f8, one_255, half_one_255);
 
-        __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3));
+        __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3)).i;
         __m256 f8 = _mm256_cvtepi32_ps(i8);
 
         // this could also be done with bit manipulations but it is
         // not obviously faster
         const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f);
         const __m256 one_255 = _mm256_set1_ps(1.f / 63.f);
-        return _mm256_fmadd_ps(f8, one_255, half_one_255);
+        return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255));
     }
 #endif
@@ -316,46 +314,19 @@ struct QuantizerTemplate : ScalarQuantizer::SQuantizer {
     }
 };
 
-#ifdef __AVX2__
-
-template
-struct QuantizerTemplate : QuantizerTemplate {
-    QuantizerTemplate(size_t d, const std::vector<float>& trained)
-            : QuantizerTemplate(d, trained) {}
-
-    FAISS_ALWAYS_INLINE __m256
-    reconstruct_8_components(const uint8_t* code, int i) const {
-        __m256 xi = Codec::decode_8_components(code, i);
-        return _mm256_fmadd_ps(
-                xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin));
-    }
-};
-
-#endif
-
-#ifdef __aarch64__
-
 template
 struct QuantizerTemplate : QuantizerTemplate {
     QuantizerTemplate(size_t d, const std::vector<float>& trained)
             : QuantizerTemplate(d, trained) {}
 
-    FAISS_ALWAYS_INLINE float32x4x2_t
+    FAISS_ALWAYS_INLINE simd8float32
     reconstruct_8_components(const uint8_t* code, int i) const {
-        float32x4x2_t xi = Codec::decode_8_components(code, i);
-        return {vfmaq_f32(
-                        vdupq_n_f32(this->vmin),
-                        xi.val[0],
-                        vdupq_n_f32(this->vdiff)),
-                vfmaq_f32(
-                        vdupq_n_f32(this->vmin),
-                        xi.val[1],
-                        vdupq_n_f32(this->vdiff))};
+        simd8float32 xi = Codec::decode_8_components(code, i);
+        return simd8float32(
+                fmadd(xi, simd8float32(this->vdiff), simd8float32(this->vmin)));
     }
 };
 
-#endif
-
 template
 struct QuantizerTemplate : ScalarQuantizer::SQuantizer {
     const size_t d;
@@ -394,46 +365,21 @@ struct QuantizerTemplate : ScalarQuantizer::SQuantizer {
     }
 };
 
-#ifdef __AVX2__
-
-template
-struct QuantizerTemplate : QuantizerTemplate {
-    QuantizerTemplate(size_t d, const std::vector<float>& trained)
-            : QuantizerTemplate(d, trained) {}
-
-    FAISS_ALWAYS_INLINE __m256
-    reconstruct_8_components(const uint8_t* code, int i) const {
-        __m256 xi = Codec::decode_8_components(code, i);
-        return _mm256_fmadd_ps(
-                xi,
-                _mm256_loadu_ps(this->vdiff + i),
-                _mm256_loadu_ps(this->vmin + i));
-    }
-};
-
-#endif
-
-#ifdef __aarch64__
-
 template
 struct QuantizerTemplate : QuantizerTemplate {
     QuantizerTemplate(size_t d, const std::vector<float>& trained)
             : QuantizerTemplate(d, trained) {}
 
-    FAISS_ALWAYS_INLINE float32x4x2_t
+    FAISS_ALWAYS_INLINE simd8float32
     reconstruct_8_components(const uint8_t* code, int i) const {
-        float32x4x2_t xi = Codec::decode_8_components(code, i);
-
-        float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i);
-        float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i);
-
-        return {vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]),
-                vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1])};
+        simd8float32 xi = Codec::decode_8_components(code, i);
+        return simd8float32(
+                fmadd(xi,
+                      simd8float32(this->vdiff + i),
+                      simd8float32(this->vmin + i)));
     }
 };
 
-#endif
-
 /*******************************************************************
  * FP16 quantizer
  *******************************************************************/
@@ -465,37 +411,28 @@ struct QuantizerFP16<1> : ScalarQuantizer::SQuantizer {
     }
 };
 
-#ifdef USE_F16C
-
 template <>
 struct QuantizerFP16<8> : QuantizerFP16<1> {
     QuantizerFP16(size_t d, const std::vector<float>& trained)
             : QuantizerFP16<1>(d, trained) {}
 
-    FAISS_ALWAYS_INLINE __m256
+    FAISS_ALWAYS_INLINE simd8float32
     reconstruct_8_components(const uint8_t* code, int i) const {
+#ifdef USE_F16C
         __m128i codei = _mm_loadu_si128((const __m128i*)(code + 2 * i));
-        return _mm256_cvtph_ps(codei);
-    }
-};
-
+        return simd8float32(_mm256_cvtph_ps(codei));
 #endif
 
 #ifdef __aarch64__
-template <>
-struct QuantizerFP16<8> : QuantizerFP16<1> {
-    QuantizerFP16(size_t d, const std::vector<float>& trained)
-            : QuantizerFP16<1>(d, trained) {}
-
-    FAISS_ALWAYS_INLINE float32x4x2_t
-    reconstruct_8_components(const uint8_t* code, int i) const {
         uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i));
-        return {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])),
-                vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))};
+        return simd8float32(
+                {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])),
+                 vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))});
+
+#endif
     }
 };
-#endif
 
 /*******************************************************************
  * BF16 quantizer
@@ -528,40 +465,34 @@ struct QuantizerBF16<1> : ScalarQuantizer::SQuantizer {
     }
 };
 
-#ifdef __AVX2__
-
 template <>
 struct QuantizerBF16<8> : QuantizerBF16<1> {
     QuantizerBF16(size_t d, const std::vector<float>& trained)
             : QuantizerBF16<1>(d, trained) {}
 
-    FAISS_ALWAYS_INLINE __m256
+    FAISS_ALWAYS_INLINE simd8float32
     reconstruct_8_components(const uint8_t* code, int i) const {
+#ifdef __AVX2__
+
         __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i));
         __m256i code_256i = _mm256_cvtepu16_epi32(code_128i);
         code_256i = _mm256_slli_epi32(code_256i, 16);
-        return _mm256_castsi256_ps(code_256i);
-    }
-};
+        return simd8float32(_mm256_castsi256_ps(code_256i));
 #endif
 
 #ifdef __aarch64__
-template <>
-struct QuantizerBF16<8> : QuantizerBF16<1> {
-    QuantizerBF16(size_t d, const std::vector<float>& trained)
-            : QuantizerBF16<1>(d, trained) {}
-
-    FAISS_ALWAYS_INLINE float32x4x2_t
-    reconstruct_8_components(const uint8_t* code, int i) const {
         uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i));
-        return {vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(codei.val[0]), 16)),
-                vreinterpretq_f32_u32(
-                        vshlq_n_u32(vmovl_u16(codei.val[1]), 16))};
+        return simd8float32(
+                {vreinterpretq_f32_u32(
+                         vshlq_n_u32(vmovl_u16(codei.val[0]), 16)),
+                 vreinterpretq_f32_u32(
+                         vshlq_n_u32(vmovl_u16(codei.val[1]), 16))});
+
+#endif
     }
 };
-#endif
 
 /*******************************************************************
  * 8bit_direct quantizer
@@ -602,11 +533,11 @@ struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> {
     Quantizer8bitDirect(size_t d, const std::vector<float>& trained)
             : Quantizer8bitDirect<1>(d, trained) {}
 
-    FAISS_ALWAYS_INLINE __m256
+    FAISS_ALWAYS_INLINE simd8float32
     reconstruct_8_components(const uint8_t* code, int i) const {
         __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8
         __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32
-        return _mm256_cvtepi32_ps(y8); // 8 * float32
+        return simd8float32(_mm256_cvtepi32_ps(y8)); // 8 * float32
     }
 };
@@ -672,13 +603,13 @@ struct Quantizer8bitDirectSigned<8> : Quantizer8bitDirectSigned<1> {
     Quantizer8bitDirectSigned(size_t d, const std::vector<float>& trained)
             : Quantizer8bitDirectSigned<1>(d, trained) {}
 
-    FAISS_ALWAYS_INLINE __m256
+    FAISS_ALWAYS_INLINE simd8float32
     reconstruct_8_components(const uint8_t* code, int i) const {
         __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8
         __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32
         __m256i c8 = _mm256_set1_epi32(128);
         __m256i z8 = _mm256_sub_epi32(y8, c8); // subtract 128 from all lanes
-        return _mm256_cvtepi32_ps(z8); // 8 * float32
+        return simd8float32(_mm256_cvtepi32_ps(z8)); // 8 * float32
     }
 };
@@ -955,7 +886,6 @@ struct SimilarityL2<1> {
     }
 };
 
-#ifdef __AVX2__
 template <>
 struct SimilarityL2<8> {
     static constexpr int simdwidth = 8;
@@ -964,88 +894,49 @@
     const float *y, *yi;
     explicit SimilarityL2(const float* y) : y(y) {}
-    __m256 accu8;
+    simd8float32 accu8;
 
     FAISS_ALWAYS_INLINE void begin_8() {
-        accu8 = _mm256_setzero_ps();
+        accu8.clear();
         yi = y;
     }
 
-    FAISS_ALWAYS_INLINE void add_8_components(__m256 x) {
-        __m256 yiv = _mm256_loadu_ps(yi);
+    FAISS_ALWAYS_INLINE void add_8_components(simd8float32 x) {
+        simd8float32 yiv(yi);
         yi += 8;
-        __m256 tmp = _mm256_sub_ps(yiv, x);
-        accu8 = _mm256_fmadd_ps(tmp, tmp, accu8);
+        simd8float32 tmp = yiv - x;
+        accu8 = fmadd(tmp, tmp, accu8);
     }
 
-    FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x, __m256 y_2) {
-        __m256 tmp = _mm256_sub_ps(y_2, x);
-        accu8 = _mm256_fmadd_ps(tmp, tmp, accu8);
+    FAISS_ALWAYS_INLINE void add_8_components_2(
+            simd8float32 x,
+            simd8float32 y_2) {
+        simd8float32 tmp = y_2 - x;
+        accu8 = fmadd(tmp, tmp, accu8);
     }
 
     FAISS_ALWAYS_INLINE float result_8() {
+#ifdef __AVX2__
         const __m128 sum = _mm_add_ps(
-                _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1));
+                _mm256_castps256_ps128(accu8.f),
+                _mm256_extractf128_ps(accu8.f, 1));
         const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2));
         const __m128 v1 = _mm_add_ps(sum, v0);
         __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1));
         const __m128 v3 = _mm_add_ps(v1, v2);
         return _mm_cvtss_f32(v3);
-    }
-};
-
 #endif
 
 #ifdef __aarch64__
-template <>
-struct SimilarityL2<8> {
-    static constexpr int simdwidth = 8;
-    static constexpr MetricType metric_type = METRIC_L2;
-
-    const float *y, *yi;
-    explicit SimilarityL2(const float* y) : y(y) {}
-    float32x4x2_t accu8;
-
-    FAISS_ALWAYS_INLINE void begin_8() {
-        accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)};
-        yi = y;
-    }
-
-    FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) {
-        float32x4x2_t yiv = vld1q_f32_x2(yi);
-        yi += 8;
-
-        float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]);
-        float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]);
-
-        float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0);
-        float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1);
-
-        accu8 = {accu8_0, accu8_1};
-    }
-
-    FAISS_ALWAYS_INLINE void add_8_components_2(
-            float32x4x2_t x,
-            float32x4x2_t y) {
-        float32x4_t sub0 = vsubq_f32(y.val[0], x.val[0]);
-        float32x4_t sub1 = vsubq_f32(y.val[1], x.val[1]);
-
-        float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0);
-        float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1);
-
-        accu8 = {accu8_0, accu8_1};
-    }
-
-    FAISS_ALWAYS_INLINE float result_8() {
         float32x4_t sum_0 = vpaddq_f32(accu8.val[0], accu8.val[0]);
         float32x4_t sum_1 = vpaddq_f32(accu8.val[1], accu8.val[1]);
 
         float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0);
         float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1);
         return vgetq_lane_f32(sum2_0, 0) + vgetq_lane_f32(sum2_1, 0);
+#endif
     }
 };
-#endif
 
 template
 struct SimilarityIP {};
@@ -1078,8 +969,6 @@ struct SimilarityIP<1> {
     }
 };
 
-#ifdef __AVX2__
-
 template <>
 struct SimilarityIP<8> {
     static constexpr int simdwidth = 8;
@@ -1091,70 +980,39 @@ struct SimilarityIP<8> {
 
     explicit SimilarityIP(const float* y) : y(y) {}
 
-    __m256 accu8;
+    simd8float32 accu8;
 
     FAISS_ALWAYS_INLINE void begin_8() {
-        accu8 = _mm256_setzero_ps();
+        accu8.clear();
         yi = y;
     }
 
-    FAISS_ALWAYS_INLINE void add_8_components(__m256 x) {
-        __m256 yiv = _mm256_loadu_ps(yi);
+    FAISS_ALWAYS_INLINE void add_8_components(simd8float32 x) {
+        simd8float32 yiv(yi);
         yi += 8;
-        accu8 = _mm256_fmadd_ps(yiv, x, accu8);
+        accu8 = fmadd(yiv, x, accu8);
     }
 
-    FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x1, __m256 x2) {
-        accu8 = _mm256_fmadd_ps(x1, x2, accu8);
+    FAISS_ALWAYS_INLINE void add_8_components_2(
+            simd8float32 x1,
+            simd8float32 x2) {
+        accu8 = fmadd(x1, x2, accu8);
     }
 
     FAISS_ALWAYS_INLINE float result_8() {
+#ifdef __AVX2__
+
         const __m128 sum = _mm_add_ps(
-                _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1));
+                _mm256_castps256_ps128(accu8.f),
+                _mm256_extractf128_ps(accu8.f, 1));
         const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2));
         const __m128 v1 = _mm_add_ps(sum, v0);
         __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1));
         const __m128 v3 = _mm_add_ps(v1, v2);
         return _mm_cvtss_f32(v3);
-    }
-};
 #endif
 
 #ifdef __aarch64__
-
-template <>
-struct SimilarityIP<8> {
-    static constexpr int simdwidth = 8;
-    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
-
-    const float *y, *yi;
-
-    explicit SimilarityIP(const float* y) : y(y) {}
-    float32x4x2_t accu8;
-
-    FAISS_ALWAYS_INLINE void begin_8() {
-        accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)};
-        yi = y;
-    }
-
-    FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) {
-        float32x4x2_t yiv = vld1q_f32_x2(yi);
-        yi += 8;
-
-        float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]);
-        float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]);
-        accu8 = {accu8_0, accu8_1};
-    }
-
-    FAISS_ALWAYS_INLINE void add_8_components_2(
-            float32x4x2_t x1,
-            float32x4x2_t x2) {
-        float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]);
-        float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]);
-        accu8 = {accu8_0, accu8_1};
-    }
-
-    FAISS_ALWAYS_INLINE float result_8() {
         float32x4x2_t sum = {
                 vpaddq_f32(accu8.val[0], accu8.val[0]),
                 vpaddq_f32(accu8.val[1], accu8.val[1])};
@@ -1163,9 +1021,9 @@ struct SimilarityIP<8> {
                 vpaddq_f32(sum.val[0], sum.val[0]),
                 vpaddq_f32(sum.val[1], sum.val[1])};
         return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0);
+#endif
     }
 };
-#endif
 
 /*******************************************************************
  * DistanceComputer: combines a similarity and a quantizer to do
@@ -1220,8 +1078,6 @@ struct DCTemplate : SQDistanceComputer {
     }
 };
 
-#ifdef USE_F16C
-
 template
 struct DCTemplate : SQDistanceComputer {
     using Sim = Similarity;
@@ -1235,7 +1091,7 @@ struct DCTemplate : SQDistanceComputer {
         Similarity sim(x);
         sim.begin_8();
         for (size_t i = 0; i < quant.d; i += 8) {
-            __m256 xi = quant.reconstruct_8_components(code, i);
+            simd8float32 xi = quant.reconstruct_8_components(code, i);
             sim.add_8_components(xi);
         }
         return sim.result_8();
@@ -1246,8 +1102,8 @@ struct DCTemplate : SQDistanceComputer {
         Similarity sim(nullptr);
         sim.begin_8();
         for (size_t i = 0; i < quant.d; i += 8) {
-            __m256 x1 = quant.reconstruct_8_components(code1, i);
-            __m256 x2 = quant.reconstruct_8_components(code2, i);
+            simd8float32 x1 = quant.reconstruct_8_components(code1, i);
+            simd8float32 x2 = quant.reconstruct_8_components(code2, i);
             sim.add_8_components_2(x1, x2);
         }
         return sim.result_8();
@@ -1267,55 +1123,6 @@ struct DCTemplate : SQDistanceComputer {
     }
 };
 
-#endif
-
-#ifdef __aarch64__
-
-template
-struct DCTemplate : SQDistanceComputer {
-    using Sim = Similarity;
-
-    Quantizer quant;
-
-    DCTemplate(size_t d, const std::vector<float>& trained)
-            : quant(d, trained) {}
-
-    float compute_distance(const float* x, const uint8_t* code) const {
-        Similarity sim(x);
-        sim.begin_8();
-        for (size_t i = 0; i < quant.d; i += 8) {
-            float32x4x2_t xi = quant.reconstruct_8_components(code, i);
-            sim.add_8_components(xi);
-        }
-        return sim.result_8();
-    }
-
-    float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
-            const {
-        Similarity sim(nullptr);
-        sim.begin_8();
-        for (size_t i = 0; i < quant.d; i += 8) {
-            float32x4x2_t x1 = quant.reconstruct_8_components(code1, i);
-            float32x4x2_t x2 = quant.reconstruct_8_components(code2, i);
-            sim.add_8_components_2(x1, x2);
-        }
-        return sim.result_8();
-    }
-
-    void set_query(const float* x) final {
-        q = x;
-    }
-
-    float symmetric_dis(idx_t i, idx_t j) override {
-        return compute_code_distance(
-                codes + i * code_size, codes + j * code_size);
-    }
-
-    float query_to_code(const uint8_t* code) const final {
-        return compute_distance(q, code);
-    }
-};
-#endif
-
 /*******************************************************************
  * DistanceComputerByte: computes distances in the integer domain
  *******************************************************************/
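
Note on the refactor above: every per-ISA reconstruct_8_components / add_8_components pair now trades in simd8float32, so a single template body serves both the __AVX2__ and __aarch64__ builds, and only the horizontal reduction in result_8() stays ISA-specific. The sketch below illustrates that pattern; it is not FAISS code. The scalar simd8float32 stand-in, reduce_sum, and l2_distance_8bit are hypothetical, and only the fmadd/clear()/constructor surface mirrors how the wrappers from the simdlib header are used in the diff.

// Minimal sketch -- NOT FAISS code. A scalar stand-in for simd8float32 that
// mirrors the operations used above (broadcast/load constructors, clear(),
// operator-, fmadd), to show why one template body can serve AVX2 and NEON.
#include <cstddef>
#include <cstdint>

struct simd8float32 {
    float v[8] = {0};
    simd8float32() = default;
    explicit simd8float32(float x) {
        for (int j = 0; j < 8; j++) v[j] = x; // broadcast, like _mm256_set1_ps
    }
    explicit simd8float32(const float* p) {
        for (int j = 0; j < 8; j++) v[j] = p[j]; // load, like _mm256_loadu_ps
    }
    void clear() { *this = simd8float32(0.0f); }
    simd8float32 operator-(const simd8float32& o) const {
        simd8float32 r;
        for (int j = 0; j < 8; j++) r.v[j] = v[j] - o.v[j];
        return r;
    }
};

// a*b + c, the role played by _mm256_fmadd_ps / vfmaq_f32 in the real wrappers
inline simd8float32 fmadd(simd8float32 a, simd8float32 b, simd8float32 c) {
    simd8float32 r;
    for (int j = 0; j < 8; j++) r.v[j] = a.v[j] * b.v[j] + c.v[j];
    return r;
}

// horizontal sum, the role played by result_8() in the Similarity structs
inline float reduce_sum(simd8float32 a) {
    float s = 0.0f;
    for (int j = 0; j < 8; j++) s += a.v[j];
    return s;
}

// Hypothetical helper: the decode -> rescale -> accumulate loop that
// DCTemplate + QuantizerTemplate + SimilarityL2 now express once for all ISAs.
// d is assumed to be a multiple of 8, as in the 8-wide specializations above.
inline float l2_distance_8bit(
        const float* x,
        const uint8_t* code,
        size_t d,
        float vmin,
        float vdiff) {
    simd8float32 accu;
    accu.clear();
    for (size_t i = 0; i < d; i += 8) {
        float decoded[8];
        for (int j = 0; j < 8; j++) {
            decoded[j] = (code[i + j] + 0.5f) / 255.0f; // Codec8bit::decode_component
        }
        // reconstruct_8_components: xi = decoded * vdiff + vmin
        simd8float32 xi =
                fmadd(simd8float32(decoded), simd8float32(vdiff), simd8float32(vmin));
        // SimilarityL2::add_8_components: accu += (y - xi)^2
        simd8float32 tmp = simd8float32(x + i) - xi;
        accu = fmadd(tmp, tmp, accu);
    }
    return reduce_sum(accu);
}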