Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ifdefの条件を修正した #291

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions source/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ else ifeq ($(TARGET_CPU),AVX512VNNI)
# cascadelakeとicelakeとの違いがあるのかは知らないので、cascadelakeのみでいいや。

else ifeq ($(TARGET_CPU),AVXVNNI)
CPPFLAGS += -DUSE_AVX2 -DUSE_BMI2 -DUSE_VNNI -march=alderlake
CPPFLAGS += -DUSE_AVX2 -DUSE_BMI2 -DUSE_VNNI -DUSE_AVXVNNI -march=alderlake
# NNUEのコード、USE_VNNIが指定されているとVNNI対応のコードになる。
# Intel Alder Lake、Sapphire Rapids 以降追加の命令群。LLVM12, GCC11 以降。
# AVXVNNI (AVX2VNNI という表記も有り?) は AVX512VNNIの256bit幅以下限定版。
Expand Down Expand Up @@ -626,7 +626,7 @@ else ifeq ($(TARGET_CPU),NO_SSE)
else ifeq ($(TARGET_CPU),GRAVITON2)
# for Amazon Web Services EC2, the Graviton2 CPU [M6g/M6gd, C6g/C6gd/C6gn, R6g/R6gd, T4g, X2gd] instances
# https://github.com/aws/aws-graviton-getting-started/blob/main/c-c++.md
CPPFLAGS += -DIS_64BIT -DUSE_NEON -march=armv8.2-a+fp16+rcpc+dotprod+crypto
CPPFLAGS += -DIS_64BIT -DUSE_NEON=8 -march=armv8.2-a+fp16+rcpc+dotprod+crypto
else ifeq ($(TARGET_CPU),APPLEAVX2)
CPPFLAGS += -DIS_64BIT -DUSE_AVX2 -DUSE_BMI2 -target x86_64-apple-macos11 -mbmi -mbmi2 -mavx2 -mpopcnt
else ifeq ($(TARGET_CPU),APPLESSE42)
Expand Down
12 changes: 8 additions & 4 deletions source/eval/nnue/layers/affine_transform_sparse_input.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

namespace Eval::NNUE::Layers {

#if defined(USE_SSSE3) || USE_NEON >= 8

alignas(kCacheLineSize) static inline const
std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = []() {
std::array<std::array<std::uint16_t, 8>, 256> v{};
Expand All @@ -36,7 +38,7 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou
#define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
#elif defined(USE_AVX2)
using vec_t = __m256i;
#if defined(USE_VNNI) && defined(USE_AVX512)
#if defined(USE_VNNI) && !defined(USE_AVXVNNI)
#define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
#else
#define vec_nnz(a) \
Expand Down Expand Up @@ -105,6 +107,8 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou
#undef vec128_storeu
#undef vec128_add

#endif

// AffineTransform layer that takes block-sparse input
// ブロック疎な入力を受け取るアフィン変換層
template <typename PreviousLayer, IndexType OutputDimensions>
Expand All @@ -131,7 +135,7 @@ class AffineTransformSparseInput {
// 入力層からこの層までで使用する順伝播用バッファのサイズ
static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize + kSelfBufferSize;

#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
#if defined(USE_SSSE3) || USE_NEON >= 8
static constexpr IndexType kChunkSize = 4;
#else
static constexpr IndexType kChunkSize = 1;
Expand Down Expand Up @@ -159,7 +163,7 @@ class AffineTransformSparseInput {
}

static constexpr IndexType get_weight_index(IndexType i) {
#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
#if defined(USE_SSSE3) || USE_NEON >= 8
return get_weight_index_scrambled(i);
#else
return i;
Expand Down Expand Up @@ -210,7 +214,7 @@ class AffineTransformSparseInput {
}
#endif

#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
#if defined(USE_SSSE3) || USE_NEON >= 8

#if defined(USE_AVX512)
if constexpr (kOutputDimensions % 16 == 0)
Expand Down
15 changes: 15 additions & 0 deletions source/eval/nnue/layers/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,11 @@ namespace Simd
#if defined(USE_NEON)

// Horizontal reduction: returns the sum of the four 32-bit lanes of s.
// (NEON counterpart of an SSE "reduce add" over a 128-bit register.)
[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
#if USE_NEON >= 8
// ARMv8/AArch64 path: vaddvq_s32 is a single across-vector add instruction.
return vaddvq_s32(s);
#else
// Pre-ARMv8 path: sum the lanes manually via the GCC/Clang vector
// subscript extension (no across-vector add available here).
return s[0] + s[1] + s[2] + s[3];
#endif
}

[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
Expand All @@ -96,6 +100,17 @@ namespace Simd

#endif

#if USE_NEON >= 8
// Multiply-accumulate of 16 byte products into four 32-bit lanes:
// each 32-bit lane of acc receives the sum of the four corresponding
// 8-bit products of a and b. NEON analogue of the x86 VNNI dpbusd step.
// NOTE(review): unlike _mm256_dpbusd_epi32, BOTH operands are treated as
// signed here (vmull_s8) — dpbusd is unsigned*signed. Presumably callers
// compensate for this on the NEON path; confirm against the x86 code.
[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {

// Widening signed multiplies: low and high 8 bytes -> 8 x 16-bit products each.
int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
int16x8_t product1 = vmull_high_s8(a, b);
// Pairwise add the 16 products down to 8 x 16-bit partial sums.
// No overflow: two s8*s8 products fit comfortably in 16 bits.
int16x8_t sum = vpaddq_s16(product0, product1);
// Pairwise widen-and-accumulate the partial sums into acc's 32-bit lanes.
acc = vpadalq_s16(acc, sum);
}

#endif


} // namespace Simd

Expand Down
Loading