diff --git a/docs/build-s390x.md b/docs/build-s390x.md index f44038c586ddc..9b2f421774ae6 100644 --- a/docs/build-s390x.md +++ b/docs/build-s390x.md @@ -28,8 +28,9 @@ cmake --build build --config Release -j $(nproc) ``` **Notes**: -- For faster repeated compilation, install [ccache](https://ccache.dev/) -- By default, VXE/VXE2 is enabled. To disable it (not recommended): + +- For faster repeated compilation, install [ccache](https://ccache.dev/) +- By default, VXE/VXE2 is enabled. To disable it (not recommended): ```bash cmake -S . -B build \ @@ -41,18 +42,29 @@ cmake --build build --config Release -j $(nproc) cmake --build build --config Release -j $(nproc) ``` -- For debug builds: +- By default, NNPA is enabled when available. To disable it (not recommended): + + ```bash + cmake -S . -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_BLAS=ON \ + -DGGML_BLAS_VENDOR=OpenBLAS \ + -DGGML_NNPA=OFF + + cmake --build build --config Release -j $(nproc) + ``` + +- For debug builds: ```bash cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DGGML_BLAS=ON \ -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config Debug -j $(nproc) ``` -- For static builds, add `-DBUILD_SHARED_LIBS=OFF`: +- For static builds, add `-DBUILD_SHARED_LIBS=OFF`: ```bash cmake -S . -B build \ @@ -101,27 +113,33 @@ All models need to be converted to Big-Endian. You can achieve this in three cas ``` For example, + ```bash python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf ``` **Notes:** + - The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2. ## IBM Accelerators ### 1. SIMD Acceleration -Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14 or EC13. In such systems, the APIs can still run but will use a scalar implementation. +Only available on IBM z15 or later systems with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12. On such systems, the APIs can still run but will use a scalar implementation. + +### 2. NNPA Vector Intrinsics Acceleration -### 2. zDNN Accelerator +Only available on IBM z16 or later systems with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13. On such systems, the APIs can still run but will use a scalar implementation. -*Only available in IBM z16 or later system. No direction at the moment.* +### 3. zDNN Accelerator -### 3. Spyre Accelerator +_Only available on IBM z16 or later systems. No direction at the moment._ -*No direction at the moment.* +### 4. Spyre Accelerator + +_No direction at the moment._ ## Performance Tuning @@ -154,4 +172,3 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl 2. **Other Questions** Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com).
- diff --git a/docs/build.md b/docs/build.md index 680b0d8398741..896a9946423cb 100644 --- a/docs/build.md +++ b/docs/build.md @@ -557,6 +557,10 @@ ninja To read documentation for how to build on Android, [click here](./android.md) +## IBM Z & LinuxONE + +To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md) + ## Notes about GPU-accelerated backends The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`. diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4e7399f9e68f9..215eb23486814 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ON) +option(GGML_NNPA "ggml: enable nnpa" ON) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index de77a875ec533..e3b79d09bb66f 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -101,6 +101,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_vxe (void); + GGML_BACKEND_API int ggml_cpu_has_nnpa (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); GGML_BACKEND_API int ggml_cpu_has_llamafile (void); diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 52cae778cac18..fa4a655a70fdd 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -427,6 +427,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") + set(GGML_NNPA OFF) message(STATUS "z15 target") list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") @@ -443,8 +444,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() if (GGML_VXE) + message(STATUS "VX/VXE/VXE2 enabled") list(APPEND ARCH_FLAGS -mvx -mzvector) endif() + + if (GGML_NNPA) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_NNPA) + message(STATUS "NNPA enabled") + endif() elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm") message(STATUS "Wasm detected") list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c) diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 73a8f93987aa3..d839cf5c55e81 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -62,11 +62,17 @@ struct ggml_compute_params { #if defined(__s390x__) && defined(__VEC__) #ifndef __VXE__ #define __VXE__ -#endif +#endif // __VXE__ #ifndef __VXE2__ #define __VXE2__ -#endif -#endif +#endif // __VXE2__ +#endif // __s390x__ && __VEC__ + +#if defined(__s390x__) && defined(GGML_NNPA) +#ifndef __NNPA__ +#define __NNPA__ +#endif // __NNPA__ +#endif // __s390x__ && GGML_NNPA #if defined(__ARM_FEATURE_SVE) #include <arm_sve.h> diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1bb9c4e367f0f..774f91e1fee1c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3137,6 +3137,25 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { _mm_storel_epi64((__m128i *)(y + i), y_vec); } #endif + +#if
defined(__NNPA__) + for (; i + 7 < n; i += 8) { + float32x4_t v_x1 = vec_xl(0, x + i + 0); + float32x4_t v_x2 = vec_xl(0, x + i + 4); + uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0); + // store the eight converted DLFloat16 values into the output row, + // matching the scalar GGML_FP32_TO_FP16 fallback below + vec_xst(v_dlf16, 0, (uint16_t *)(y + i)); + } + // TODO: Enable bottom code once checks are done + // for (; i + 3 < n; i += 4) { + // float32x4_t v_x = vec_xl(i, x); + // float32x4_t v_zero = vec_splats(0.0f); + // uint16x4_t v_dlf16 = vec_round_from_fp32(v_x, v_zero, 0); + // vec_xst(v_dlf16, i, (uint16_t *)y); + // } +#endif + for (; i < n; ++i) { y[i] = GGML_FP32_TO_FP16(x[i]); } @@ -3364,6 +3383,14 @@ int ggml_cpu_has_vxe(void) { #endif } +int ggml_cpu_has_nnpa(void) { +#if defined(GGML_NNPA) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_neon(void) { #if defined(__ARM_ARCH) && defined(__ARM_NEON) return 1; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 735ef3f015c13..a98866a2d8052 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -578,6 +578,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_vxe()) { features.push_back({ "VXE", "1" }); } + if (ggml_cpu_has_nnpa()) { + features.push_back({ "NNPA", "1" }); + } if (ggml_cpu_has_wasm_simd()) { features.push_back({ "WASM_SIMD", "1" }); } diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index e42364c59aa10..e95fdd0fd9b49 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -922,7 +923,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F32_STEP 32 #define GGML_F32_EPR 4 -#define GGML_F32x4 __vector float +#define GGML_F32x4 float32x4_t #define GGML_F32x4_ZERO vec_splats(0.0f) #define GGML_F32x4_SET1 vec_splats #define GGML_F32x4_LOAD(p) vec_xl(0, p) @@ -962,7 +963,12 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_STEP GGML_F32_STEP #define GGML_F16_EPR GGML_F32_EPR -static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) { +static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { +#ifdef __NNPA__ + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x); + uint16x8_t nnpa_dlf16 = vec_convert_from_fp16(v_x, 0); + return vec_extend_to_fp32_hi(nnpa_dlf16, 0); +#else float tmp[4]; for (int i = 0; i < 4; i++) { @@ -972,18 +978,29 @@ static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) { // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 return vec_xl(0, (const float *)(tmp)); +#endif } -static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { +static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { +#ifdef __NNPA__ + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_x = vec_round_from_fp32(v_y, v_zero, 0); + x[0] = vec_extract(v_x, 0); + x[1] = vec_extract(v_x, 1); + x[2] = vec_extract(v_x, 2); + x[3] = vec_extract(v_x, 3); +#else float arr[4]; // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 - vec_xst(y, 0, (float *)(arr)); + vec_xst(v_y, 0, (float *)(arr)); for (int i = 0; i < 4; i++) { x[i] = GGML_FP32_TO_FP16(arr[i]); } +#endif } #define GGML_F16_VEC GGML_F32x4 diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 6dc5ce0d92fd8..db90eb1de9932
100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -322,6 +322,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t +// on s390x, we use ZDNN_DLFLOAT16 with NNPA // // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 // for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 @@ -417,6 +418,27 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#elif defined(__NNPA__) + + #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + + #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) + #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + + static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + uint16x8_t v_h = vec_splats(h); + uint16x8_t nnpa_dlf16 = vec_convert_from_fp16(v_h, 0); + return vec_extend_to_fp32_hi(nnpa_dlf16, 0)[0]; + } + + static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + float32x4_t v_f = vec_splats(f); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_h = vec_round_from_fp32(v_f, v_zero, 0); + return vec_extract(v_h, 0); + } + #else // FP16 <-> FP32
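Outside the patch itself, here is a minimal, hypothetical smoke test for the FP16/FP32 conversion path this change touches. It is not part of the PR: the file name is made up, `ggml_cpu_has_nnpa()` is the feature probe added above, and `ggml_fp32_to_fp16_row()` / `ggml_fp16_to_fp32_row()` are existing public ggml helpers that are assumed to route through the `GGML_FP32_TO_FP16` / `GGML_FP16_TO_FP32` macros modified in `ggml-impl.h`.

```c
// fp16_roundtrip_check.c — hypothetical helper, not part of this patch.
// Build inside the llama.cpp tree and link against ggml (and -lm).
#include <math.h>
#include <stdio.h>

#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    // ggml_init() fills the FP16 conversion tables used on some platforms.
    struct ggml_init_params params = { /*mem_size=*/1024, /*mem_buffer=*/NULL, /*no_alloc=*/true };
    struct ggml_context * ctx = ggml_init(params);

    // report whether the CPU backend was compiled with GGML_NNPA
    printf("NNPA compiled in: %d\n", ggml_cpu_has_nnpa());

    // round-trip a few FP32 values through the 16-bit representation;
    // with GGML_NNPA enabled on z16 this exercises the vec_round_from_fp32 /
    // vec_extend_to_fp32_hi path added above, otherwise the generic path runs
    const float src[8] = { 0.0f, 1.0f, -1.5f, 3.14159f, 1e-3f, 65504.0f, -2.0f, 0.5f };
    ggml_fp16_t half[8];
    float       back[8];

    ggml_fp32_to_fp16_row(src, half, 8);
    ggml_fp16_to_fp32_row(half, back, 8);

    float max_err = 0.0f;
    for (int i = 0; i < 8; i++) {
        const float err = fabsf(src[i] - back[i]);
        if (err > max_err) {
            max_err = err;
        }
        printf("%d: %f -> %f\n", i, src[i], back[i]);
    }
    printf("max abs round-trip error: %f\n", max_err);

    ggml_free(ctx);
    return 0;
}
```

If the build is wired up as intended, the `NNPA` flag should also appear in the CPU backend feature list extended in `ggml-cpu.cpp` above.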