From 54544972f8edcafdf4463a53068f2038e4c88ede Mon Sep 17 00:00:00 2001 From: Force Charlie Date: Sun, 14 Jul 2024 20:11:11 +0800 Subject: [PATCH] Update BLAKE3 --- src/belahash/blake3.lock | 2 +- src/belahash/blake3/.gitignore | 3 + src/belahash/blake3/CMakeLists.txt | 192 +++++++++++++++++--------- src/belahash/blake3/blake3.c | 25 ++-- src/belahash/blake3/blake3.h | 2 +- src/belahash/blake3/blake3_dispatch.c | 11 +- src/belahash/blake3/blake3_impl.h | 4 +- src/belahash/blake3/blake3_neon.c | 8 +- 8 files changed, 158 insertions(+), 89 deletions(-) diff --git a/src/belahash/blake3.lock b/src/belahash/blake3.lock index 74922d5f..11c8febe 100644 --- a/src/belahash/blake3.lock +++ b/src/belahash/blake3.lock @@ -1,2 +1,2 @@ https://github.com/BLAKE3-team/BLAKE3 -4d32708f511fd85c6b0fb131295cc73224246738 +fc2f7e4206f016b0cac0593f23a7d5976ce066e6 diff --git a/src/belahash/blake3/.gitignore b/src/belahash/blake3/.gitignore index 0bf608ce..ff52a803 100644 --- a/src/belahash/blake3/.gitignore +++ b/src/belahash/blake3/.gitignore @@ -1,3 +1,6 @@ blake3 example +build/ *.o + +CMakeUserPresets.json diff --git a/src/belahash/blake3/CMakeLists.txt b/src/belahash/blake3/CMakeLists.txt index 47706d1a..3d674f96 100644 --- a/src/belahash/blake3/CMakeLists.txt +++ b/src/belahash/blake3/CMakeLists.txt @@ -1,7 +1,16 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.9 FATAL_ERROR) + +# respect C_EXTENSIONS OFF without explicitly setting C_STANDARD +if (POLICY CMP0128) + cmake_policy(SET CMP0128 NEW) +endif() +# mark_as_advanced does not implicitly create UNINITIALIZED cache entries +if (POLICY CMP0102) + cmake_policy(SET CMP0102 NEW) +endif() project(libblake3 - VERSION 1.4.0 + VERSION 1.5.2 DESCRIPTION "BLAKE3 C implementation" LANGUAGES C ASM ) @@ -9,14 +18,25 @@ project(libblake3 include(FeatureSummary) include(GNUInstallDirs) +# architecture lists for which to enable assembly / SIMD sources +set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) +set(BLAKE3_X86_NAMES i686 x86 X86) +set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # default SIMD compiler flag configuration (can be overriden by toolchains or CLI) -if(CMAKE_C_COMPILER_ID STREQUAL "MSVC") +if(MSVC) set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2") # MSVC has no dedicated sse4.1 flag (see https://learn.microsoft.com/en-us/cpp/build/reference/arch-x86?view=msvc-170) set(BLAKE3_CFLAGS_SSE4.1 "/arch:AVX" CACHE STRING "the compiler flags to enable SSE4.1") set(BLAKE3_CFLAGS_AVX2 "/arch:AVX2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "/arch:AVX512" CACHE STRING "the compiler flags to enable AVX512") + set(BLAKE3_AMD64_ASM_SOURCES + blake3_avx2_x86-64_windows_msvc.asm + blake3_avx512_x86-64_windows_msvc.asm + blake3_sse2_x86-64_windows_msvc.asm + blake3_sse41_x86-64_windows_msvc.asm + ) + elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") @@ -24,11 +44,71 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1") set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512") + + if (WIN32) + set(BLAKE3_AMD64_ASM_SOURCES + blake3_avx2_x86-64_windows_gnu.S + blake3_avx512_x86-64_windows_gnu.S + blake3_sse2_x86-64_windows_gnu.S + blake3_sse41_x86-64_windows_gnu.S + ) + + elseif(UNIX) + set(BLAKE3_AMD64_ASM_SOURCES + blake3_avx2_x86-64_unix.S + blake3_avx512_x86-64_unix.S + blake3_sse2_x86-64_unix.S + blake3_sse41_x86-64_unix.S + ) + endif() + + if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + # 32-bit ARMv8 needs NEON to be enabled explicitly + set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON") + endif() endif() -# architecture lists for which to enable assembly / SIMD sources -set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) -set(BLAKE3_X86_NAMES i686 x86 X86) -set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) + +mark_as_advanced(BLAKE3_CFLAGS_SSE2 BLAKE3_CFLAGS_SSE4.1 BLAKE3_CFLAGS_AVX2 BLAKE3_CFLAGS_AVX512 BLAKE3_CFLAGS_NEON) +mark_as_advanced(BLAKE3_AMD64_ASM_SOURCES) + +message(STATUS "BLAKE3 SIMD configuration: ${CMAKE_C_COMPILER_ARCHITECTURE_ID}") +if(MSVC AND DEFINED CMAKE_C_COMPILER_ARCHITECTURE_ID) + if(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]86") + set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use") + + elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]64") + set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use") + + elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Aa][Rr][Mm]64") + set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use") + + else() + set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use") + endif() + +elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES) + set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use") + +elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES + AND DEFINED BLAKE3_CFLAGS_SSE2 + AND DEFINED BLAKE3_CFLAGS_SSE4.1 + AND DEFINED BLAKE3_CFLAGS_AVX2 + AND DEFINED BLAKE3_CFLAGS_AVX512) + set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use") + +elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + OR ANDROID_ABI STREQUAL "armeabi-v7a" + OR BLAKE3_USE_NEON_INTRINSICS) + AND (DEFINED BLAKE3_CFLAGS_NEON + OR CMAKE_SIZEOF_VOID_P EQUAL 8)) + set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use") + +else() + set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use") +endif() + +mark_as_advanced(BLAKE3_SIMD_TYPE) # library target add_library(blake3 @@ -41,73 +121,49 @@ add_library(BLAKE3::blake3 ALIAS blake3) # library configuration set(BLAKE3_PKGCONFIG_CFLAGS) if (BUILD_SHARED_LIBS) - target_compile_definitions(blake3 + target_compile_definitions(blake3 PUBLIC BLAKE3_DLL PRIVATE BLAKE3_DLL_EXPORTS ) list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL) endif() -target_include_directories(blake3 PUBLIC $) +target_include_directories(blake3 PUBLIC + $ + $ +) set_target_properties(blake3 PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 0 C_VISIBILITY_PRESET hidden + C_EXTENSIONS OFF ) +target_compile_features(blake3 PUBLIC c_std_99) +# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD +# which may be set by the user or toolchain file +if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD) + set_target_properties(blake3 PROPERTIES C_STANDARD 99) +endif() # optional SIMD sources -macro(BLAKE3_DISABLE_SIMD) - set(BLAKE3_SIMD_AMD64_ASM OFF) - set(BLAKE3_SIMD_X86_INTRINSICS OFF) - set(BLAKE3_SIMD_NEON_INTRINSICS OFF) - set_source_files_properties(blake3_dispatch.c PROPERTIES - COMPILE_DEFINITIONS BLAKE3_USE_NEON=0;BLAKE3_NO_SSE2;BLAKE3_NO_SSE41;BLAKE3_NO_AVX2;BLAKE3_NO_AVX512 - ) -endmacro() - -if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM) +if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm") + if (NOT DEFINED BLAKE3_AMD64_ASM_SOURCES) + message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'amd64-asm' but no assembly sources are available for the target architecture.") + endif() set(BLAKE3_SIMD_AMD64_ASM ON) - if(CMAKE_C_COMPILER_ID STREQUAL "MSVC") + if(MSVC) enable_language(ASM_MASM) - target_sources(blake3 PRIVATE - blake3_avx2_x86-64_windows_msvc.asm - blake3_avx512_x86-64_windows_msvc.asm - blake3_sse2_x86-64_windows_msvc.asm - blake3_sse41_x86-64_windows_msvc.asm - ) - - elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" - OR CMAKE_C_COMPILER_ID STREQUAL "Clang" - OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") - if (WIN32) - target_sources(blake3 PRIVATE - blake3_avx2_x86-64_windows_gnu.S - blake3_avx512_x86-64_windows_gnu.S - blake3_sse2_x86-64_windows_gnu.S - blake3_sse41_x86-64_windows_gnu.S - ) - - elseif(UNIX) - target_sources(blake3 PRIVATE - blake3_avx2_x86-64_unix.S - blake3_avx512_x86-64_unix.S - blake3_sse2_x86-64_unix.S - blake3_sse41_x86-64_unix.S - ) - - else() - BLAKE3_DISABLE_SIMD() - endif() - - else() - BLAKE3_DISABLE_SIMD() endif() -elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRINSICS) - AND DEFINED BLAKE3_CFLAGS_SSE2 - AND DEFINED BLAKE3_CFLAGS_SSE4.1 - AND DEFINED BLAKE3_CFLAGS_AVX2 - AND DEFINED BLAKE3_CFLAGS_AVX512) + target_sources(blake3 PRIVATE ${BLAKE3_AMD64_ASM_SOURCES}) + +elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics") + if (NOT DEFINED BLAKE3_CFLAGS_SSE2 + OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1 + OR NOT DEFINED BLAKE3_CFLAGS_AVX2 + OR NOT DEFINED BLAKE3_CFLAGS_AVX512) + message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.") + endif() set(BLAKE3_SIMD_X86_INTRINSICS ON) target_sources(blake3 PRIVATE @@ -121,24 +177,31 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRIN set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") -elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES - OR ((ANDROID_ABI STREQUAL "armeabi-v7a" - OR BLAKE3_USE_NEON_INTRINSICS) - AND (DEFINED BLAKE3_CFLAGS_NEON - OR CMAKE_SIZEOF_VOID_P EQUAL 8))) +elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics") set(BLAKE3_SIMD_NEON_INTRINSICS ON) target_sources(blake3 PRIVATE blake3_neon.c ) - set_source_files_properties(blake3_dispatch.c PROPERTIES COMPILE_DEFINITIONS BLAKE3_USE_NEON=1) + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=1 + ) if (DEFINED BLAKE3_CFLAGS_NEON) set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") endif() +elseif(BLAKE3_SIMD_TYPE STREQUAL "none") + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=0 + BLAKE3_NO_SSE2 + BLAKE3_NO_SSE41 + BLAKE3_NO_AVX2 + BLAKE3_NO_AVX512 + ) + else() - BLAKE3_DISABLE_SIMD() + message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to an unknown value: '${BLAKE3_SIMD_TYPE}'") endif() # cmake install support @@ -171,6 +234,7 @@ install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") # print feature summary +# add_feature_info cannot directly use the BLAKE3_SIMD_TYPE :( add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.") add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.") add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.") diff --git a/src/belahash/blake3/blake3.c b/src/belahash/blake3/blake3.c index 692f4b02..9becead0 100644 --- a/src/belahash/blake3/blake3.c +++ b/src/belahash/blake3/blake3.c @@ -134,9 +134,7 @@ INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, input_len -= BLAKE3_BLOCK_LEN; } - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; + chunk_state_fill_buf(self, input, input_len); } INLINE output_t chunk_state_output(const blake3_chunk_state *self) { @@ -341,21 +339,24 @@ INLINE void compress_subtree_to_parent_node( size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because + // as we just asserted, num_cvs will always be <=2 in that case. But GCC + // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is + // set then it emits incorrect warnings here. We tried a few different + // hacks to silence these, but in the end our hacks just produced different + // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of + // desperation, we ifdef out this entire loop when we know it's not needed. +#if MAX_SIMD_DEGREE_OR_2 > 2 + // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - // The second half of this loop condition is always true, and we just - // asserted it above. But GCC can't tell that it's always true, and if NDEBUG - // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious - // warnings here. GCC 8.5 is particularly sensitive, so if you're changing - // this code, test it against that version. - while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { + while (num_cvs > 2) { num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); } +#endif memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); } @@ -427,7 +428,7 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { // of the whole tree, and it would need to be ROOT finalized. We can't // compress it until we know. // 2) This 64 KiB input might complete a larger tree, whose root node is -// similarly going to be the the root of the whole tree. For example, maybe +// similarly going to be the root of the whole tree. For example, maybe // we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the // node at the root of the 256 KiB subtree until we know how to finalize it. // diff --git a/src/belahash/blake3/blake3.h b/src/belahash/blake3/blake3.h index f694dcf2..c38545fe 100644 --- a/src/belahash/blake3/blake3.h +++ b/src/belahash/blake3/blake3.h @@ -30,7 +30,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.5.0" +#define BLAKE3_VERSION_STRING "1.5.2" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 diff --git a/src/belahash/blake3/blake3_dispatch.c b/src/belahash/blake3/blake3_dispatch.c index e681e871..c9abc13f 100644 --- a/src/belahash/blake3/blake3_dispatch.c +++ b/src/belahash/blake3/blake3_dispatch.c @@ -4,9 +4,12 @@ #include "blake3_impl.h" -#if defined(IS_X86) #if defined(_MSC_VER) #include +#endif + +#if defined(IS_X86) +#if defined(_MSC_VER) #include #elif defined(__GNUC__) #include @@ -32,9 +35,9 @@ #define ATOMIC_LOAD(x) x #define ATOMIC_STORE(x, y) x = y #elif defined(_MSC_VER) -#define ATOMIC_INT long -#define ATOMIC_LOAD(x) _InterlockedOr(&x, 0) -#define ATOMIC_STORE(x, y) _InterlockedExchange(&x, y) +#define ATOMIC_INT LONG +#define ATOMIC_LOAD(x) InterlockedOr(&x, 0) +#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y) #else #define ATOMIC_INT int #define ATOMIC_LOAD(x) x diff --git a/src/belahash/blake3/blake3_impl.h b/src/belahash/blake3/blake3_impl.h index beab5cf5..98611c31 100644 --- a/src/belahash/blake3/blake3_impl.h +++ b/src/belahash/blake3/blake3_impl.h @@ -28,7 +28,7 @@ enum blake3_flags { #define INLINE static inline __attribute__((always_inline)) #endif -#if defined(__x86_64__) || defined(_M_X64) +#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC) #define IS_X86 #define IS_X86_64 #endif @@ -38,7 +38,7 @@ enum blake3_flags { #define IS_X86_32 #endif -#if defined(__aarch64__) || defined(_M_ARM64) +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) #define IS_AARCH64 #endif diff --git a/src/belahash/blake3/blake3_neon.c b/src/belahash/blake3/blake3_neon.c index 8a818fc7..53ce83c3 100644 --- a/src/belahash/blake3/blake3_neon.c +++ b/src/belahash/blake3/blake3_neon.c @@ -10,14 +10,12 @@ INLINE uint32x4_t loadu_128(const uint8_t src[16]) { // vld1q_u32 has alignment requirements. Don't use it. - uint32x4_t x; - memcpy(&x, src, 16); - return x; + return vreinterpretq_u32_u8(vld1q_u8(src)); } INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { // vst1q_u32 has alignment requirements. Don't use it. - memcpy(dest, &src, 16); + vst1q_u8(dest, vreinterpretq_u8_u32(src)); } INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { @@ -36,7 +34,7 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { } INLINE uint32x4_t rot16_128(uint32x4_t x) { - // The straightfoward implementation would be two shifts and an or, but that's + // The straightforward implementation would be two shifts and an or, but that's // slower on microarchitectures we've tested. See // https://github.com/BLAKE3-team/BLAKE3/pull/319. // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));