From af4d377eb5420ab34a003c4717ba47fda41c9a0e Mon Sep 17 00:00:00 2001 From: Yusuke Matsui Date: Sun, 5 Sep 2021 14:28:18 +0900 Subject: [PATCH 1/2] follow up commit --- .github/workflows/build.yml | 12 +++++------- README.md | 34 +++++++++++++++++++++------------- setup.py | 7 ++++++- src/distance.h | 24 ++++++++++++++++++++---- tests/test_rii.py | 20 ++++++++++++++++++-- 5 files changed, 70 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2ff340e..b73c775 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -9,24 +9,22 @@ jobs: strategy: matrix: # https://github.blog/2019-08-08-github-actions-now-supports-ci-cd/ - #os: [ubuntu-latest, macos-latest] os: [ubuntu-latest, macos-latest, windows-latest] python-version: [3.5, 3.6, 3.7, 3.8] # https://stackoverflow.com/questions/57810623/how-to-select-the-c-c-compiler-used-for-a-github-actions-job: compiler: [gcc, clang, cl] - # Don't check ubuntu+clang exclude: - # Excluding clang in ubuntu and windows + # ubuntu: gcc - os: ubuntu-latest compiler: clang - - os: windows-latest - compiler: clang - # Excluding cl in ubuntu and mac - os: ubuntu-latest compiler: cl + # mac: gcc, clang - os: macos-latest compiler: cl - # Excluding gcc in windows + # win: cl + - os: windows-latest + compiler: clang - os: windows-latest compiler: gcc steps: diff --git a/README.md b/README.md index 8106583..a1d0c01 100644 --- a/README.md +++ b/README.md @@ -29,23 +29,29 @@ The search can be operated for a subset of a database. | Rii remains fast even a You can install the package via pip. This library works with Python 3.5+ on linux/mac/wsl/Windows10 (x64, using MSVC:flags - /arch:AVX2, /openmp:llvm, /fp:fast'). 
``` -pip install git+https://github.com/ashleyabraham/rii.git +pip install rii ``` -or use pre-compiled binary for Windows 10 (, may need MS Visual Studio Build tools) -``` -pip install https://github.com/ashleyabraham/rii/releases/download/v0.2.7/rii-0.2.7-cp38-cp38-win_amd64.whl -``` +
+ For windows (maintained by [@ashleyabraham](https://github.com/ashleyabraham)) + + ### Pre-compiled binary for Windows 10 (, may need MS Visual Studio Build tools) + ``` + pip install https://github.com/ashleyabraham/rii/releases/download/v0.2.7/rii-0.2.7-cp38-cp38-win_amd64.whl + ``` + + ### OpenMP + In order to use OpenMP 3.0 /openmp:llvm flag is used which causes warnings of multiple libs loading, use at your discretion when used with other parallel processing library loadings. To suppress use -### Windows (notes) -#### OpenMP -In order to use OpenMP 3.0 /openmp:llvm flag is used which causes warnings of multiple libs loading, use at your descretion when used with other parallel processing library loadings. To supress use + `SET KMP_DUPLICATE_LIB_OK=TRUE` + + ### SIMD + The /arch:AVX2 flag is used in MSVC to set appropriate SIMD preprocessors and compiler intrinsics + +
-`SET KMP_DUPLICATE_LIB_OK=TRUE` -#### SIMD -The /arch:AVX2 flag is used in MSVC to set appropriate SIMD preprocessors and compiler intrinsics ## [Documentation](https://rii.readthedocs.io/en/latest/index.html) @@ -53,6 +59,7 @@ The /arch:AVX2 flag is used in MSVC to set appropriate SIMD preprocessors and co - [Tips](https://rii.readthedocs.io/en/latest/source/tips.html) - [API](https://rii.readthedocs.io/en/latest/source/api.html) + ## Usage ### Basic ANN @@ -104,13 +111,13 @@ print(ids, dists) # e.g., [728 85 132] [14.80522156 15.92787838 16.28690338] ```python # Add new vectors X2 = np.random.random((1000, D)).astype(np.float32) -e.add_configure(vecs=X2) # Now N is 11000 +e.add(vecs=X2) # Now N is 11000 e.query(q=q) # Ok. (0.12 msec / query) # However, if you add quite a lot of vectors, the search might become slower # because the data structure has been optimized for the initial item size (N=10000) X3 = np.random.random((1000000, D)).astype(np.float32) -e.add_configure(vecs=X3) # A lot. Now N is 1011000 +e.add(vecs=X3) # A lot. Now N is 1011000 e.query(q=q) # Slower (0.96 msec/query) # In such case, run the reconfigure function. That updates the data structure @@ -156,3 +163,4 @@ e1.merge(e2) # Now e1 contains both X1 and X2 ## Credits - The logo is designed by [@richardbmx](https://github.com/richardbmx) ([#4](https://github.com/matsui528/rii/issues/4)) +- The windows implementation is by [@ashleyabraham](https://github.com/ashleyabraham) ([#42](https://github.com/matsui528/rii/pull/42)) diff --git a/setup.py b/setup.py index 6002248..32a5238 100644 --- a/setup.py +++ b/setup.py @@ -107,11 +107,16 @@ def build_extensions(self): opts.append('/fp:fast') # -Ofast if sys.platform not in ['darwin', 'win32']: - opts.append('-fopenmp') # For pqk-means + # For linux + opts.append('-fopenmp') # For pqk-means. 
+ + if sys.platform not in ['win32']: + # For linux and mac opts.append('-march=native') # For fast SIMD computation of L2 distance opts.append('-mtune=native') # Do optimization (It seems this doesn't boost, but just in case) opts.append('-Ofast') # This makes the program faster + for ext in self.extensions: ext.extra_compile_args = opts if not sys.platform == 'darwin': diff --git a/src/distance.h b/src/distance.h index 47f895a..cc25259 100644 --- a/src/distance.h +++ b/src/distance.h @@ -61,7 +61,6 @@ static inline __m128 masked_read (int d, const float *x) // cannot use AVX2 _mm_mask_set1_epi32 } -//#ifdef __AVX__ #if defined(__AVX__) // Reading function for AVX and AVX512 // This function is from Faiss @@ -84,7 +83,6 @@ static inline __m256 masked_read_8 (int d, const float *x) -//#ifdef __AVX512F__ #if defined(__AVX512F__) // Reading function for AVX512 // reads 0 <= d < 16 floats as __m512 @@ -109,7 +107,6 @@ static inline __m512 masked_read_16 (int d, const float *x) // ========================= Distance functions ============================ -//#ifdef __AVX512F__ #if defined(__AVX512F__) static const std::string g_simd_architecture = "avx512"; @@ -128,23 +125,29 @@ float fvec_L2sqr (const float *x, const float *y, size_t d) } __m256 msum2 = _mm512_extractf32x8_ps(msum1, 1); + // msum2 += _mm512_extractf32x8_ps(msum1, 0); msum2 = _mm256_add_ps(msum2, _mm512_extractf32x8_ps(msum1, 0)); while (d >= 8) { __m256 mx = _mm256_loadu_ps (x); x += 8; __m256 my = _mm256_loadu_ps (y); y += 8; + // const __m256 a_m_b1 = mx - my; const __m256 a_m_b1 = _mm256_sub_ps(mx, my); + // msum2 += a_m_b1 * a_m_b1; msum2 = _mm256_add_ps(msum2, _mm256_mul_ps(a_m_b1, a_m_b1)); d -= 8; } __m128 msum3 = _mm256_extractf128_ps(msum2, 1); + // msum3 += _mm256_extractf128_ps(msum2, 0); msum3 = _mm_add_ps(msum3, _mm256_extractf128_ps(msum2, 0)); if (d >= 4) { __m128 mx = _mm_loadu_ps (x); x += 4; __m128 my = _mm_loadu_ps (y); y += 4; + // const __m128 a_m_b1 = mx - my; const __m128 a_m_b1 = 
_mm_sub_ps(mx, my); + // msum3 += a_m_b1 * a_m_b1; msum3 = _mm_add_ps(msum3, _mm_mul_ps(a_m_b1, a_m_b1)); d -= 4; } @@ -152,7 +155,9 @@ float fvec_L2sqr (const float *x, const float *y, size_t d) if (d > 0) { __m128 mx = masked_read (d, x); __m128 my = masked_read (d, y); + // __m128 a_m_b1 = mx - my; __m128 a_m_b1 = _mm_sub_ps(mx, my); + // msum3 += a_m_b1 * a_m_b1; msum3 = _mm_add_ps(msum3, _mm_mul_ps(a_m_b1, a_m_b1)); } @@ -173,18 +178,23 @@ float fvec_L2sqr (const float *x, const float *y, size_t d) while (d >= 8) { __m256 mx = _mm256_loadu_ps (x); x += 8; __m256 my = _mm256_loadu_ps (y); y += 8; - const __m256 a_m_b1 = _mm256_sub_ps(mx, my); // mx - my; + // const __m256 a_m_b1 = mx - my; + const __m256 a_m_b1 = _mm256_sub_ps(mx, my); + // msum1 += a_m_b1 * a_m_b1; msum1 = _mm256_add_ps(msum1, _mm256_mul_ps(a_m_b1 ,a_m_b1)); d -= 8; } __m128 msum2 = _mm256_extractf128_ps(msum1, 1); + // msum2 += _mm256_extractf128_ps(msum1, 0); msum2 = _mm_add_ps(msum2, _mm256_extractf128_ps(msum1, 0)); if (d >= 4) { __m128 mx = _mm_loadu_ps (x); x += 4; __m128 my = _mm_loadu_ps (y); y += 4; + // const __m128 a_m_b1 = mx - my; const __m128 a_m_b1 = _mm_sub_ps(mx, my); + // msum2 += a_m_b1 * a_m_b1; msum2 = _mm_add_ps(msum2, _mm_mul_ps(a_m_b1, a_m_b1)); d -= 4; } @@ -192,7 +202,9 @@ float fvec_L2sqr (const float *x, const float *y, size_t d) if (d > 0) { __m128 mx = masked_read (d, x); __m128 my = masked_read (d, y); + // __m128 a_m_b1 = mx - my; __m128 a_m_b1 = _mm_sub_ps(mx, my); + // msum2 += a_m_b1 * a_m_b1; msum2 = _mm_add_ps(msum2, _mm_mul_ps(a_m_b1, a_m_b1)); } @@ -214,7 +226,9 @@ float fvec_L2sqr (const float *x, const float *y, size_t d) while (d >= 4) { __m128 mx = _mm_loadu_ps (x); x += 4; __m128 my = _mm_loadu_ps (y); y += 4; + // const __m128 a_m_b1 = mx - my; const __m128 a_m_b1 = _mm_sub_ps(mx, my); + // msum1 += a_m_b1 * a_m_b1; msum1 = _mm_add_ps(msum1, _mm_mul_ps(a_m_b1, a_m_b1)); d -= 4; } @@ -223,7 +237,9 @@ float fvec_L2sqr (const float *x, const float *y, 
size_t d) // add the last 1, 2 or 3 values __m128 mx = masked_read (d, x); __m128 my = masked_read (d, y); + // __m128 a_m_b1 = mx - my; __m128 a_m_b1 = _mm_sub_ps(mx, my); + // msum1 += a_m_b1 * a_m_b1; msum1 = _mm_add_ps(msum1, _mm_mul_ps(a_m_b1, a_m_b1)); } diff --git a/tests/test_rii.py b/tests/test_rii.py index 9c7fcf0..c6a28bc 100644 --- a/tests/test_rii.py +++ b/tests/test_rii.py @@ -1,5 +1,4 @@ -#from .context import rii -import rii +from .context import rii import unittest import numpy as np import nanopq @@ -54,6 +53,23 @@ def test_reconfigure(self): self.assertEqual(len(e.posting_lists), nlist) self.assertEqual(sum([len(plist) for plist in e.posting_lists]), N) + def test_simple_add_configure(self): + M, Ks = 4, 20 + N1, N2, D = 300, 700, 40 + X1 = np.random.random((N1, D)).astype(np.float32) + X2 = np.random.random((N2, D)).astype(np.float32) + e = rii.Rii(fine_quantizer=nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X1)) + e.add(vecs=X1) + self.assertEqual(e.N, N1) + e.add(vecs=X2) + self.assertEqual(e.N, N1 + N2) + for nlist in [5, 100]: + e.reconfigure(nlist=nlist) + self.assertEqual(e.nlist, nlist) + self.assertEqual(e.coarse_centers.shape, (nlist, M)) + self.assertEqual(len(e.posting_lists), nlist) + self.assertEqual(sum([len(plist) for plist in e.posting_lists]), N1 + N2) + def test_add_configure(self): M, Ks = 4, 20 N, D = 1000, 40 From 67acf615bf723f354a68dbfeeb8c80e56633ca1e Mon Sep 17 00:00:00 2001 From: Yusuke Matsui Date: Sun, 5 Sep 2021 14:31:21 +0900 Subject: [PATCH 2/2] updated readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a1d0c01..164d42c 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ pip install rii
- For windows (maintained by [@ashleyabraham](https://github.com/ashleyabraham)) + For windows (maintained by @ashleyabraham) ### Pre-compiled binary for Windows 10 (, may need MS Visual Studio Build tools) ```