From 81f2c8062a82f29bdb78743ef62331cc9964df30 Mon Sep 17 00:00:00 2001 From: KazApps Date: Mon, 7 Oct 2024 23:55:15 +0900 Subject: [PATCH 1/2] =?UTF-8?q?ifdef=E3=81=AE=E6=9D=A1=E4=BB=B6=E3=82=92?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E3=81=97=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/Makefile | 2 +- source/eval/nnue/layers/affine_transform_sparse_input.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/Makefile b/source/Makefile index 9f5267383..9b1e11a5d 100644 --- a/source/Makefile +++ b/source/Makefile @@ -595,7 +595,7 @@ else ifeq ($(TARGET_CPU),AVX512VNNI) # cascadelakeとicelakeとの違いがあるのかは知らないので、cascadelakeのみでいいや。 else ifeq ($(TARGET_CPU),AVXVNNI) - CPPFLAGS += -DUSE_AVX2 -DUSE_BMI2 -DUSE_VNNI -march=alderlake + CPPFLAGS += -DUSE_AVX2 -DUSE_BMI2 -DUSE_VNNI -DUSE_AVXVNNI -march=alderlake # NNUEのコード、USE_VNNIが指定されているとVNNI対応のコードになる。 # Intel Alder Lake、Sapphire Rapids 以降追加の命令群。LLVM12, GCC11 以降。 # AVXVNNI (AVX2VNNI という表記も有り?) は AVX512VNNIの256bit幅以下限定版。 diff --git a/source/eval/nnue/layers/affine_transform_sparse_input.h b/source/eval/nnue/layers/affine_transform_sparse_input.h index e3cf323b3..aab129145 100644 --- a/source/eval/nnue/layers/affine_transform_sparse_input.h +++ b/source/eval/nnue/layers/affine_transform_sparse_input.h @@ -36,7 +36,7 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512()) #elif defined(USE_AVX2) using vec_t = __m256i; -#if defined(USE_VNNI) && defined(USE_AVX512) +#if defined(USE_VNNI) && !defined(USE_AVXVNNI) #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256()) #else #define vec_nnz(a) \ From 62448d706ae034cf9ef7d22f7dc2b12746c8c7de Mon Sep 17 00:00:00 2001 From: KazApps Date: Tue, 8 Oct 2024 09:19:28 +0900 Subject: [PATCH 2/2] =?UTF-8?q?NEON=E3=81=A7=E3=81=AE=E3=82=A8=E3=83=A9?= =?UTF-8?q?=E3=83=BC=E3=82=92=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/Makefile | 2 +- .../nnue/layers/affine_transform_sparse_input.h | 10 +++++++--- source/eval/nnue/layers/simd.h | 15 +++++++++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/source/Makefile b/source/Makefile index 9b1e11a5d..e7ca2e94a 100644 --- a/source/Makefile +++ b/source/Makefile @@ -626,7 +626,7 @@ else ifeq ($(TARGET_CPU),NO_SSE) else ifeq ($(TARGET_CPU),GRAVITON2) # for Amazon Web Servece EC2, the Graviton2 CPU [M6g/M6gd, C6g/C6gd/C6gn, R6g/R6gd, T4g, X2gd] instances # https://github.com/aws/aws-graviton-getting-started/blob/main/c-c++.md - CPPFLAGS += -DIS_64BIT -DUSE_NEON -march=armv8.2-a+fp16+rcpc+dotprod+crypto + CPPFLAGS += -DIS_64BIT -DUSE_NEON=8 -march=armv8.2-a+fp16+rcpc+dotprod+crypto else ifeq ($(TARGET_CPU),APPLEAVX2) CPPFLAGS += -DIS_64BIT -DUSE_AVX2 -DUSE_BMI2 -target x86_64-apple-macos11 -mbmi -mbmi2 -mavx2 -mpopcnt else ifeq ($(TARGET_CPU),APPLESSE42) diff --git a/source/eval/nnue/layers/affine_transform_sparse_input.h b/source/eval/nnue/layers/affine_transform_sparse_input.h index aab129145..7c6725c23 100644 --- a/source/eval/nnue/layers/affine_transform_sparse_input.h +++ b/source/eval/nnue/layers/affine_transform_sparse_input.h @@ -15,6 +15,8 @@ namespace Eval::NNUE::Layers { +#if defined(USE_SSSE3) || USE_NEON >= 8 + alignas(kCacheLineSize) static inline const std::array, 256> lookup_indices = []() { std::array, 256> v{}; @@ -105,6 +107,8 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou #undef vec128_storeu #undef vec128_add +#endif + // AffineTransform layer that takes block-sparse input // ブロック疎な入力を受け取るアフィン変換層 template @@ -131,7 +135,7 @@ class AffineTransformSparseInput { // 入力層からこの層までで使用する順伝播用バッファのサイズ static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize + kSelfBufferSize; -#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD) +#if defined(USE_SSSE3) || USE_NEON >= 8 static constexpr IndexType kChunkSize = 4; #else static constexpr IndexType kChunkSize = 1; @@ -159,7 +163,7 @@ class AffineTransformSparseInput { } static constexpr IndexType get_weight_index(IndexType i) { -#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD) +#if defined(USE_SSSE3) || USE_NEON >= 8 return get_weight_index_scrambled(i); #else return i; @@ -210,7 +214,7 @@ class AffineTransformSparseInput { } #endif -#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD) +#if defined(USE_SSSE3) || USE_NEON >= 8 #if defined(USE_AVX512) if constexpr (kOutputDimensions % 16 == 0) diff --git a/source/eval/nnue/layers/simd.h b/source/eval/nnue/layers/simd.h index abe70258d..f1963f84d 100644 --- a/source/eval/nnue/layers/simd.h +++ b/source/eval/nnue/layers/simd.h @@ -87,7 +87,11 @@ namespace Simd #if defined(USE_NEON) [[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) { + #if USE_NEON >= 8 return vaddvq_s32(s); + #else + return s[0] + s[1] + s[2] + s[3]; + #endif } [[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) { @@ -96,6 +100,17 @@ namespace Simd #endif +#if USE_NEON >= 8 +[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) { + + int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b)); + int16x8_t product1 = vmull_high_s8(a, b); + int16x8_t sum = vpaddq_s16(product0, product1); + acc = vpadalq_s16(acc, sum); +} + +#endif + } // namespace Simd