diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d5aa70dc..cc5aa422 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,7 +23,8 @@ jobs: strategy: matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] + # THREAD sanitizer is broken here and in llama.cpp + sanitizer: [ADDRESS, UNDEFINED] build_type: [Debug, Release] steps: @@ -51,7 +52,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ASAN_OPTIONS=detect_odr_violation=0 ctest --verbose ubuntu-latest-cmake: runs-on: ubuntu-latest @@ -134,7 +135,7 @@ jobs: run: | mkdir build cd build - cmake -DRWKV_AVX2=OFF -DRWKV_FMA=OFF -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" .. + cmake -DRWKV_AVX2=OFF -DRWKV_FMA=OFF -DRWKV_METAL=ON -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" .. cmake --build . --config Release - name: Test @@ -170,7 +171,7 @@ jobs: rwkv-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip windows-latest-cmake: - runs-on: windows-latest + runs-on: windows-2019 continue-on-error: true @@ -186,7 +187,7 @@ jobs: - build: 'cuda12' defines: '-DRWKV_CUBLAS=ON' - build: 'rocm5.5' - defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DRWKV_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030"' + defines: '-G "Unix Makefiles" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DRWKV_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030"' steps: - name: Clone diff --git a/.gitmodules b/.gitmodules index 67eaf4a3..85278985 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "ggml"] path = ggml - url = https://github.com/saharNooby/ggml - branch = increased-node-limit-2023-09-19 + url = https://github.com/ggerganov/ggml + branch = master diff --git a/CMakeLists.txt b/CMakeLists.txt index 217e22f7..90ae23b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.21) project("rwkv.cpp" C CXX) @@ -42,10 +42,34 @@ option(RWKV_OPENBLAS "rwkv: use OpenBLAS" option(RWKV_CUBLAS "rwkv: use cuBLAS" OFF) option(RWKV_CLBLAST "rwkv: use CLBlast" OFF) option(RWKV_HIPBLAS "rwkv: use hipBLAS" OFF) +option(RWKV_METAL "rwkv: use Metal" OFF) # Build only shared library without building tests and extras option(RWKV_STANDALONE "rwkv: build only RWKV library" OFF) + +# transition helpers (from llama.cpp) +function (rwkv_option_depr TYPE OLD NEW) + if (${OLD}) + message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") + set(${NEW} ON PARENT_SCOPE) + endif() +endfunction() + +set(GGML_ACCELERATE ${RWKV_ACCELERATE}) +set(GGML_CUDA ${RWKV_CUBLAS}) +set(GGML_HIPBLAS ${RWKV_HIPBLAS}) +set(GGML_METAL ${RWKV_METAL}) +if (RWKV_OPENBLAS) + set(GGML_BLAS_VENDOR "OpenBLAS") + set(GGML_BLAS ON) +endif() + +set(GGML_AVX ${RWKV_AVX}) +set(GGML_AVX2 ${RWKV_AVX2}) +set(GGML_AVX512 ${RWKV_AVX512}) +set(GGML_FMA ${RWKV_FMA}) + # # Compile flags # @@ -58,179 +82,9 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) if (NOT MSVC) - if (RWKV_SANITIZE_THREAD) - add_compile_options(-fsanitize=thread) - link_libraries(-fsanitize=thread) - endif() - - if (RWKV_SANITIZE_ADDRESS) - add_compile_options(-fsanitize=address -fno-omit-frame-pointer) - link_libraries(-fsanitize=address) - endif() - - if (RWKV_SANITIZE_UNDEFINED) - 
add_compile_options(-fsanitize=undefined) - link_libraries(-fsanitize=undefined) - endif() -endif() - -if (APPLE AND RWKV_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate) - if (ACCELERATE_FRAMEWORK) - message(STATUS "Accelerate framework found") - - add_compile_definitions(GGML_USE_ACCELERATE) - set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) - else() - message(WARNING "Accelerate framework not found") - endif() -endif() - -if (RWKV_OPENBLAS) - if (RWKV_STATIC) - set(BLA_STATIC ON) - endif() - - set(BLA_VENDOR OpenBLAS) - find_package(BLAS) - if (BLAS_FOUND) - message(STATUS "OpenBLAS found") - - add_compile_definitions(GGML_USE_OPENBLAS) - add_link_options(${BLAS_LIBRARIES}) - else() - message(WARNING "OpenBLAS not found") - endif() -endif() - -if (RWKV_CUBLAS) - cmake_minimum_required(VERSION 3.17) - set(CMAKE_CUDA_COMPILER_FORCED TRUE) - - find_package(CUDAToolkit) - - if (CUDAToolkit_FOUND) - message(STATUS "cuBLAS found") - - enable_language(CUDA) - - set(GGML_CUDA_SOURCES ${CMAKE_SOURCE_DIR}/ggml/src/ggml-cuda.cu ${CMAKE_SOURCE_DIR}/ggml/src/ggml-cuda.h) - - add_compile_definitions(GGML_USE_CUBLAS) - - # By default, GGML_CUDA_MMV_Y is set to 1. This value leads to CUDA error on my machine: - # CUDA error 9 at ...\rwkv.cpp\ggml\src\ggml-cuda.cu:6107: invalid configuration argument - # The error appears when the head matrix of v5 3B and v5 7B models is offloaded. I guess the matrix is so large that block_num_y becomes too big. - # Changing it to 2 makes it work. I did not see any performance impact when measuring v5 3B & v5 7B. Hopefully, this will not break other use-cases. - # TODO Re-check after updating ggml whether this is needed - add_compile_definitions(GGML_CUDA_MMV_Y=2) - - if (RWKV_STATIC) - set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) - else() - set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) - endif() - - # Architecture set-up copy-pasted from https://github.com/ggerganov/llama.cpp/blob/111163e2463171891680feed94371eb9becd9817/CMakeLists.txt#L317 - if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - # 52: lowest CUDA 12 standard - # 60: f16 CUDA intrinsics - # 61: integer CUDA intrinsics - # 70: compute capability at which unrolling a loop in mul_mat_q kernels is faster - - # Lowest CUDA 12 standard + lowest for integer intrinsics. 
- set(CMAKE_CUDA_ARCHITECTURES "52;61;70") - endif() - message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - else() - message(WARNING "cuBLAS not found") - endif() -endif() - -if (RWKV_CLBLAST) - cmake_minimum_required(VERSION 3.17) - - file(GLOB OPENCL_INCLUDE_SEARCH_PATHS ${CMAKE_SOURCE_DIR}/OpenCL-*/) - file(GLOB CLBLAST_INCLUDE_SEARCH_PATHS ${CMAKE_SOURCE_DIR}/CLBlast-*/) - - set(OPENCL_INCLUDE_SEARCH_PATHS - /usr/include - /usr/local/include - $ENV{OPENCL_HOME} - $ENV{OPENCL_HOME}/include - ${OPENCL_INCLUDE_SEARCH_PATHS} - ) - - set(CLBLAST_INCLUDE_SEARCH_PATHS - /usr/include - /usr/local/include - $ENV{CLBLAST_HOME} - $ENV{CLBLAST_HOME}/include - ${CLBLAST_INCLUDE_SEARCH_PATHS} - ) - - find_path(OPENCL_INC NAMES opencl.h PATHS ${OPENCL_INCLUDE_SEARCH_PATHS} PATH_SUFFIXES include/CL) - find_library(OPENCL_LIB NAMES OpenCL PATHS ${OPENCL_INCLUDE_SEARCH_PATHS} PATH_SUFFIXES lib) - find_path(CLBLAST_INC NAMES clblast.h PATHS ${CLBLAST_INCLUDE_SEARCH_PATHS} PATH_SUFFIXES include) - find_library(CLBLAST_LIB NAMES clblast PATHS ${CLBLAST_INCLUDE_SEARCH_PATHS} PATH_SUFFIXES lib) - - if (OPENCL_LIB) - set(OPENCL_INC ${OPENCL_INC}/..) # disgusting - message(STATUS "OpenCL SDK found: ${OPENCL_INC}") - - if (CLBLAST_LIB) - message(STATUS "CLBlast found: ${CLBLAST_INC}") - add_compile_definitions(GGML_USE_CLBLAST) - set(GGML_OPENCL_SOURCES ${CMAKE_SOURCE_DIR}/ggml/src/ggml-opencl.cpp ${CMAKE_SOURCE_DIR}/ggml/src/ggml-opencl.h) - set(GGML_OPENCL_DIRS ${GGML_OPENCL_DIRS} ${OPENCL_INC} ${CLBLAST_INC}) - set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} ${OPENCL_LIB} ${CLBLAST_LIB}) - link_libraries("-Wl,--copy-dt-needed-entries") - else() - message(WARNING "CLBlast not found") - endif() - else() - message(WARNING "OpenCL SDK not found, CLBlast cannot be enabled") - endif() -endif() - -if (RWKV_HIPBLAS) - list(APPEND CMAKE_PREFIX_PATH /opt/rocm) - - if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") - message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") - endif() - - if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") - message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") - endif() - - find_package(hip) - find_package(hipblas) - find_package(rocblas) - - if (${hipblas_FOUND} AND ${hip_FOUND}) - message(STATUS "HIP and hipBLAS found") - add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) - add_library(ggml-rocm OBJECT - ${CMAKE_SOURCE_DIR}/ggml/src/ggml-cuda.cu - ${CMAKE_SOURCE_DIR}/ggml/src/ggml-cuda.h) - - if (RWKV_BUILD_SHARED_LIBRARY) - set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON) - endif() - - target_include_directories(ggml-rocm PUBLIC ${CMAKE_SOURCE_DIR}/ggml/include/ggml) - set_source_files_properties(${CMAKE_SOURCE_DIR}/ggml/src/ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) - - if (RWKV_STATIC) - message(FATAL_ERROR "Static linking not supported for HIP/ROCm") - endif() - - set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} ggml-rocm) - else() - message(WARNING "hipBLAS or HIP not found. 
Try setting CMAKE_PREFIX_PATH=/opt/rocm") - endif() + set(GGML_SANITIZE_THREAD ${RWKV_SANITIZE_THREAD}) + set(GGML_SANITIZE_ADDRESS ${RWKV_SANITIZE_ADDRESS}) + set(GGML_SANITIZE_UNDEFINED ${RWKV_SANITIZE_UNDEFINED}) endif() if (RWKV_ALL_WARNINGS) @@ -278,6 +132,7 @@ if (RWKV_LTO) else() message(WARNING "IPO is not supported: ${output}") endif() + set(GGML_LTO ON) endif() # Architecture specific @@ -286,6 +141,7 @@ endif() message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") if (NOT MSVC) if (RWKV_STATIC) + set(GGML_STATIC ON) add_link_options(-static) if (MINGW) add_link_options(-static-libgcc -static-libstdc++) @@ -299,162 +155,121 @@ if (NOT MSVC) endif() endif() -if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") - message(STATUS "ARM detected") - if (MSVC) - # TODO [llama.cpp]: arm msvc? - else() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") - add_compile_options(-mcpu=native) - endif() - # TODO [llama.cpp]: armv6,7,8 version specific flags - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") - message(STATUS "x86 detected") - if (MSVC) - if (RWKV_AVX512) - add_compile_options($<$:/arch:AVX512>) - add_compile_options($<$:/arch:AVX512>) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. - elseif (RWKV_AVX2) - add_compile_options($<$:/arch:AVX2>) - add_compile_options($<$:/arch:AVX2>) - elseif (RWKV_AVX) - add_compile_options($<$:/arch:AVX>) - add_compile_options($<$:/arch:AVX>) - endif() - else() - add_compile_options(-mf16c) - if (RWKV_FMA) - add_compile_options(-mfma) - endif() - if (RWKV_AVX) - add_compile_options(-mavx) - endif() - if (RWKV_AVX2) - add_compile_options(-mavx2) - endif() - if (RWKV_AVX512) - add_compile_options(-mavx512f) - add_compile_options(-mavx512bw) - endif() - endif() -else() - # TODO [llama.cpp]: support PowerPC - message(STATUS "Unknown architecture") -endif() - # -# POSIX conformance -# Section copy-pasted from https://github.com/ggerganov/llama.cpp/blob/8781013ef654270cbead3e0011e33a6d690fb168/CMakeLists.txt#L580C20-L580C20 +# Build libraries # -# clock_gettime came in POSIX.1b (1993) -# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional -# posix_memalign came in POSIX.1-2001 / SUSv3 -# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) -add_compile_definitions(_XOPEN_SOURCE=600) - -# Somehow in OpenBSD whenever POSIX conformance is specified -# some string functions rely on locale_t availability, -# which was introduced in POSIX.1-2008, forcing us to go higher. -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - remove_definitions(-D_XOPEN_SOURCE=600) - add_compile_definitions(_XOPEN_SOURCE=700) +if (MSVC) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) endif() -# Data types, macros and functions related to controlling CPU affinity and -# some memory allocation are available on Linux through GNU extensions in libc. -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - add_compile_definitions(_GNU_SOURCE) +if (NOT RWKV_STANDALONE) + set(GGML_STANDALONE OFF) + enable_testing() + add_subdirectory(tests) + add_subdirectory(extras) +elseif() + set(GGML_STANDALONE ON) endif() -# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, -# and on macOS its availability depends on enabling Darwin extensions. -# Similarly on DragonFly, enabling BSD extensions is necessary. 
-if ( - CMAKE_SYSTEM_NAME MATCHES "Darwin" OR - CMAKE_SYSTEM_NAME MATCHES "iOS" OR - CMAKE_SYSTEM_NAME MATCHES "tvOS" OR - CMAKE_SYSTEM_NAME MATCHES "DragonFly" -) - add_compile_definitions(_DARWIN_C_SOURCE) +set(BUILD_SHARED_LIBS OFF) +if (NOT TARGET ggml) + add_subdirectory(ggml) + # ... otherwise assume ggml is added by a parent CMakeLists.txt endif() -# alloca is a non-standard interface that is not visible on BSDs when -# POSIX conformance is specified, but not all of them provide a clean way -# to enable it in such cases. -if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - add_compile_definitions(__BSD_VISIBLE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") - add_compile_definitions(_NETBSD_SOURCE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - add_compile_definitions(_BSD_SOURCE) +if (RWKV_BUILD_SHARED_LIBRARY) + add_library(rwkv SHARED rwkv.cpp rwkv.h) +else() + add_library(rwkv rwkv.cpp rwkv.h) endif() -# -# Build libraries -# - -if (MSVC) - add_compile_definitions(_CRT_SECURE_NO_WARNINGS) +if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + endif() endif() -add_library(ggml OBJECT - ${CMAKE_SOURCE_DIR}/ggml/src/ggml.c - ${CMAKE_SOURCE_DIR}/ggml/src/ggml-alloc.c - ${CMAKE_SOURCE_DIR}/ggml/include/ggml/ggml.h - ${CMAKE_SOURCE_DIR}/ggml/include/ggml/ggml-alloc.h - ${GGML_CUDA_SOURCES} - ${GGML_OPENCL_SOURCES}) +if (GGML_CUDA) + find_package(CUDAToolkit) -target_include_directories(ggml PUBLIC ${CMAKE_SOURCE_DIR}/ggml/include/ggml ${GGML_OPENCL_DIRS}) -target_compile_features(ggml PUBLIC c_std_11) # Don't bump + if (CUDAToolkit_FOUND) + add_compile_definitions(GGML_USE_CUDA) + if (GGML_STATIC) + if (WIN32) + # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library + set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + else () + set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() + else() + set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() -if (MSVC) - target_link_libraries(ggml PUBLIC ${RWKV_EXTRA_LIBS} Threads::Threads) - if (RWKV_CUBLAS) - target_compile_options(ggml PRIVATE $<$: - -allow-unsupported-compiler - >) + if (GGML_CUDA_NO_VMM) + # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) + else() + set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... 
+ endif() endif() -else() - if (WIN32 AND RWKV_HIPBLAS) - target_link_libraries(ggml PUBLIC ${RWKV_EXTRA_LIBS} Threads::Threads) - else() - target_link_libraries(ggml PUBLIC m ${RWKV_EXTRA_LIBS} Threads::Threads) +endif() + +if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) endif() endif() -if (RWKV_BUILD_SHARED_LIBRARY) - set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) +if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + add_compile_definitions(GGML_USE_METAL) + + set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ) endif() -if (RWKV_BUILD_SHARED_LIBRARY) - add_library(rwkv SHARED rwkv.cpp rwkv.h) -else() - add_library(rwkv rwkv.cpp rwkv.h) +if (GGML_HIPBLAS) + # CMake on Windows doesn't support the HIP language yet + if (WIN32) + set(CXX_IS_HIPCC TRUE) + else() + string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") + endif() + + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA) + + add_compile_definitions(GGML_USE_HIPBLAS) + + if (CXX_IS_HIPCC) + set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} hip::device) + endif() + + if (GGML_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() + + set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) endif() target_include_directories(rwkv PUBLIC .) +target_include_directories(rwkv PRIVATE ggml/include) target_compile_features(rwkv PUBLIC cxx_std_11) -target_link_libraries(rwkv PRIVATE ggml ${RWKV_EXTRA_LIBS}) +target_link_libraries(rwkv PRIVATE $ ${RWKV_EXTRA_LIBS}) if (RWKV_BUILD_SHARED_LIBRARY) + set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD) set_target_properties(rwkv PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(rwkv PRIVATE RWKV_SHARED RWKV_BUILD) endif() - -if (NOT RWKV_STANDALONE) - set_property(TARGET ggml PROPERTY GGML_STANDALONE OFF) - enable_testing() - add_subdirectory(tests) - add_subdirectory(extras) -elseif() - set_property(TARGET ggml PROPERTY GGML_STANDALONE ON) -endif() diff --git a/extras/CMakeLists.txt b/extras/CMakeLists.txt index 2787b68b..c438d8a7 100644 --- a/extras/CMakeLists.txt +++ b/extras/CMakeLists.txt @@ -1,11 +1,8 @@ function(rwkv_add_extra source) get_filename_component(EXTRA_TARGET ${source} NAME_WE) add_executable(rwkv_${EXTRA_TARGET} ${source}) - if(RWKV_HIPBLAS) - target_link_libraries(rwkv_${EXTRA_TARGET} PRIVATE ggml-rocm ggml rwkv) - else() - target_link_libraries(rwkv_${EXTRA_TARGET} PRIVATE ggml rwkv) - endif() + + target_link_libraries(rwkv_${EXTRA_TARGET} PRIVATE ggml rwkv) if (RWKV_STATIC) if(RWKV_HIPBLAS) diff --git a/ggml b/ggml index 46f083d1..3e7e5e26 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 46f083d15bb31c62933300ffbfffa5aa6ae2ecae +Subproject commit 3e7e5e26f90fecf4f7c2808df7d94454630b219c diff --git a/python/generate_completions.py b/python/generate_completions.py index 4685720c..f0d9aa14 100644 --- a/python/generate_completions.py +++ b/python/generate_completions.py @@ -39,7 +39,7 @@ print(f'System info: {library.rwkv_get_system_info_string()}') print('Loading RWKV model') -model = 
rwkv_cpp_model.RWKVModel(library, args.model_path) +model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layers_count=0) tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab) diff --git a/python/rwkv_cpp/rwkv_cpp_model.py b/python/rwkv_cpp/rwkv_cpp_model.py index 59dd304d..80304db2 100644 --- a/python/rwkv_cpp/rwkv_cpp_model.py +++ b/python/rwkv_cpp/rwkv_cpp_model.py @@ -63,37 +63,13 @@ def __init__( self._library: rwkv_cpp_shared_library.RWKVSharedLibrary = shared_library - self._ctx: rwkv_cpp_shared_library.RWKVContext = self._library.rwkv_init_from_file(model_path, thread_count) - - if gpu_layer_count > 0: - self.gpu_offload_layers(gpu_layer_count) + self._ctx: rwkv_cpp_shared_library.RWKVContext = self._library.rwkv_init_from_file(model_path, thread_count, gpu_layer_count) self._state_buffer_element_count: int = self._library.rwkv_get_state_buffer_element_count(self._ctx) self._logits_buffer_element_count: int = self._library.rwkv_get_logits_buffer_element_count(self._ctx) self._valid: bool = True - def gpu_offload_layers(self, layer_count: int) -> bool: - """ - Offloads specified count of model layers onto the GPU. Offloaded layers are evaluated using cuBLAS or CLBlast. - For the purposes of this function, model head (unembedding matrix) is treated as an additional layer: - - pass `model.n_layer` to offload all layers except model head - - pass `model.n_layer + 1` to offload all layers, including model head - - Returns true if at least one layer was offloaded. - If rwkv.cpp was compiled without cuBLAS and CLBlast support, this function is a no-op and always returns false. - - Parameters - ---------- - layer_count : int - Count of layers to offload onto the GPU, must be >= 0. - """ - - if not (layer_count >= 0): - raise ValueError('Layer count must be >= 0') - - return self._library.rwkv_gpu_offload_layers(self._ctx, layer_count) - @property def n_vocab(self) -> int: return self._library.rwkv_get_n_vocab(self._ctx) diff --git a/python/rwkv_cpp/rwkv_cpp_shared_library.py b/python/rwkv_cpp/rwkv_cpp_shared_library.py index 4c095a73..3f59b2ed 100644 --- a/python/rwkv_cpp/rwkv_cpp_shared_library.py +++ b/python/rwkv_cpp/rwkv_cpp_shared_library.py @@ -44,12 +44,9 @@ def __init__(self, shared_library_path: str) -> None: else: self.library = ctypes.cdll.LoadLibrary(shared_library_path) - self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32] + self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32, ctypes.c_uint32] self.library.rwkv_init_from_file.restype = ctypes.c_void_p - self.library.rwkv_gpu_offload_layers.argtypes = [ctypes.c_void_p, ctypes.c_uint32] - self.library.rwkv_gpu_offload_layers.restype = ctypes.c_bool - self.library.rwkv_eval.argtypes = [ ctypes.c_void_p, # ctx ctypes.c_int32, # token @@ -109,7 +106,7 @@ def __init__(self, shared_library_path: str) -> None: self.nullptr = ctypes.cast(0, ctypes.c_void_p) - def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVContext: + def rwkv_init_from_file(self, model_file_path: str, thread_count: int, offload_layers: int) -> RWKVContext: """ Loads the model from a file and prepares it for inference. Throws an exception in case of any error. Error messages would be printed to stderr. @@ -122,35 +119,13 @@ def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVCo Count of threads to use, must be positive. 
""" - ptr = self.library.rwkv_init_from_file(model_file_path.encode('utf-8'), ctypes.c_uint32(thread_count)) + ptr = self.library.rwkv_init_from_file(model_file_path.encode('utf-8'), ctypes.c_uint32(thread_count), ctypes.c_uint32(offload_layers)) if ptr is None: raise ValueError('rwkv_init_from_file failed, check stderr') return RWKVContext(ptr) - def rwkv_gpu_offload_layers(self, ctx: RWKVContext, layer_count: int) -> bool: - """ - Offloads specified count of model layers onto the GPU. Offloaded layers are evaluated using cuBLAS or CLBlast. - For the purposes of this function, model head (unembedding matrix) is treated as an additional layer: - - pass `rwkv_get_n_layer(ctx)` to offload all layers except model head - - pass `rwkv_get_n_layer(ctx) + 1` to offload all layers, including model head - Returns true if at least one layer was offloaded. - If rwkv.cpp was compiled without cuBLAS and CLBlast support, this function is a no-op and always returns false. - - Parameters - ---------- - ctx : RWKVContext - RWKV context obtained from rwkv_init_from_file. - layer_count : int - Count of layers to offload onto the GPU, must be >= 0. - """ - - if not (layer_count >= 0): - raise ValueError('Layer count must be >= 0') - - return self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(layer_count)) - def rwkv_eval( self, ctx: RWKVContext, diff --git a/rwkv.cpp b/rwkv.cpp index 84f7c1ec..08ede0e4 100644 --- a/rwkv.cpp +++ b/rwkv.cpp @@ -1,6 +1,19 @@ #include "rwkv.h" #include "ggml.h" #include "ggml-alloc.h" +#include "ggml-backend.h" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#ifdef GGML_USE_BLAS +#include "ggml-blas.h" +#endif #include #include @@ -37,6 +50,8 @@ static_assert(sizeof(stat::st_size) >= 8, "File offsets should be 64-bit or else rwkv.cpp will not be able to load model files over 2 GB"); static_assert(sizeof(decltype(ftell(NULL))) >= 8, "File offsets should be 64-bit or else rwkv.cpp will not be able to load model files over 2 GB"); +#define RWKV_MAX_NODES 80000 + #include "rwkv_error_handling.inc" #include "rwkv_utilities.inc" @@ -54,7 +69,7 @@ static_assert(sizeof(decltype(ftell(NULL))) >= 8, "File offsets should be 64-bit #include "rwkv_graph.inc" // API function. 
-struct rwkv_context * rwkv_init_from_file(const char * file_path, const uint32_t n_threads) {
+struct rwkv_context * rwkv_init_from_file(const char * file_path, const uint32_t n_threads, const uint32_t n_gpu_layers) {
     global_last_error = RWKV_ERROR_NONE;
 
     std::unique_ptr<struct rwkv_context> ctx(new(std::nothrow) struct rwkv_context());
@@ -62,10 +77,40 @@ struct rwkv_context * rwkv_init_from_file(const char * file_path, const uint32_t
     ctx->model = new(std::nothrow) struct rwkv_model();
     ctx->model->reference_count++;
-    RWKV_ENSURE_OR_NULL(rwkv_load_model_from_file(file_path, *ctx->model));
 
     ctx->n_threads = n_threads;
 
+    if (n_gpu_layers) {
+        ggml_backend_t backend;
+
+#ifdef GGML_USE_CUDA
+        backend = ggml_backend_cuda_init(0);
+        RWKV_ENSURE_OR_NULL(backend);
+#endif
+
+#ifdef GGML_USE_METAL
+        backend = ggml_backend_metal_init();
+        RWKV_ENSURE_OR_NULL(backend);
+        ggml_backend_metal_set_n_cb(backend, ctx->n_threads);
+#endif
+
+#ifdef GGML_USE_BLAS
+        backend = ggml_backend_blas_init();
+        RWKV_ENSURE_OR_NULL(backend);
+        ggml_backend_blas_set_n_threads(backend, ctx->n_threads);
+#endif
+        RWKV_ENSURE_OR_NULL(backend);
+
+        ctx->model->backends.push_back(backend);
+    }
+
+    ggml_backend_t cpu_backend = ggml_backend_cpu_init();
+    RWKV_ENSURE_OR_NULL(cpu_backend);
+    ggml_backend_cpu_set_n_threads(cpu_backend, n_threads);
+    ctx->model->backends.push_back(cpu_backend);
+
+    RWKV_ENSURE_OR_NULL(rwkv_load_model_from_file(file_path, *ctx->model, n_gpu_layers));
+
     RWKV_ENSURE_OR_NULL(rwkv_measure_and_build_serial_context(*ctx->model, ctx->serial_graph));
 
     return ctx.release();
 
@@ -90,8 +135,6 @@ struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32
     return clone.release();
 }
 
-#include "rwkv_gpu_offload.inc"
-
 #include "rwkv_eval.inc"
 
 // API function.
@@ -144,14 +187,24 @@ void rwkv_free(struct rwkv_context * ctx) {
     }
 
     if (--ctx->model->reference_count == 0) {
+        for (auto buffer : ctx->model->buffers_w) {
+            ggml_backend_buffer_free(buffer);
+        }
+
+        for (auto backend : ctx->model->backends) {
+            ggml_backend_free(backend);
+        }
+
         ggml_free(ctx->model->ggml_ctx);
 
         delete ctx->model;
     }
 
+    ggml_backend_sched_free(ctx->serial_graph.sched);
     ggml_free(ctx->serial_graph.ggml_ctx);
 
     if (ctx->last_used_sequence_length > 0) {
+        ggml_backend_sched_free(ctx->sequential_graph.sched);
         ggml_free(ctx->sequential_graph.ggml_ctx);
     }
 
diff --git a/rwkv.h b/rwkv.h
index 40b9266c..978ed45c 100644
--- a/rwkv.h
+++ b/rwkv.h
@@ -87,7 +87,8 @@ extern "C" {
     // Returns NULL on any error.
     // - model_file_path: path to model file in ggml format.
     // - n_threads: count of threads to use, must be positive.
-    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
+    // - n_gpu_layers: count of model layers to offload onto the GPU.
+    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads, const uint32_t n_gpu_layers);
 
     // Creates a new context from an existing one.
     // This can allow you to run multiple rwkv_eval's in parallel, without having to load a single model multiple times.
@@ -97,14 +98,6 @@ extern "C" {
     // - n_threads: count of threads to use, must be positive.
     RWKV_API struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32_t n_threads);
 
-    // Offloads specified count of model layers onto the GPU. Offloaded layers are evaluated using cuBLAS or CLBlast.
- // For the purposes of this function, model head (unembedding matrix) is treated as an additional layer: - // - pass `rwkv_get_n_layer(ctx)` to offload all layers except model head - // - pass `rwkv_get_n_layer(ctx) + 1` to offload all layers, including model head - // Returns true if at least one layer was offloaded. - // If rwkv.cpp was compiled without cuBLAS and CLBlast support, this function is a no-op and always returns false. - RWKV_API bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers); - // Evaluates the model for a single token. // You can pass NULL to logits_out whenever logits are not needed. This can improve speed by ~10 ms per iteration, because logits are not calculated. // Not thread-safe. For parallel inference, call rwkv_clone_context to create one rwkv_context for each thread. diff --git a/rwkv_eval.inc b/rwkv_eval.inc index 79979006..215b34e1 100644 --- a/rwkv_eval.inc +++ b/rwkv_eval.inc @@ -1,20 +1,23 @@ // Copies state from an input buffer to the ggml tensor of the graph. static void rwkv_set_inputs(const struct rwkv_context * ctx, const struct rwkv_computation_graph & graph, const float * state_in) { if (state_in) { - memcpy(graph.input_state->data, state_in, rwkv_tensor_nbytes(graph.input_state)); + ggml_backend_tensor_set(graph.input_state, state_in, 0, rwkv_tensor_nbytes(graph.input_state)); } else { - rwkv_init_state(ctx, (float *) graph.input_state->data); + float * state_data = (float *) malloc(rwkv_tensor_nbytes(graph.input_state)); + rwkv_init_state(ctx, state_data); + ggml_backend_tensor_set(graph.input_state, state_data, 0, rwkv_tensor_nbytes(graph.input_state)); + free(state_data); } } // Copies state and logits from ggml tensors of the graph to output buffers. static void rwkv_get_outputs(const struct rwkv_computation_graph & graph, float * state_out, float * logits_out) { if (state_out) { - memcpy(state_out, graph.output_state->data, rwkv_tensor_nbytes(graph.output_state)); + ggml_backend_tensor_get(graph.output_state, state_out, 0, rwkv_tensor_nbytes(graph.output_state)); } if (logits_out) { - memcpy(logits_out, graph.logits->data, rwkv_tensor_nbytes(graph.logits)); + ggml_backend_tensor_get(graph.logits, logits_out, 0, rwkv_tensor_nbytes(graph.logits)); } } @@ -28,14 +31,7 @@ static void rwkv_eval_graph(struct rwkv_computation_graph & graph, const uint32_ graph.cgraph->n_leafs = graph.post_logits_leafs; } - struct ggml_cplan * plan = ggml_graph_plan(graph.cgraph.get(), n_threads); - - std::unique_ptr work_data{ new(std::nothrow) uint8_t[plan->work_size] }; - plan->work_data = work_data.get(); - - ggml_graph_compute(graph.cgraph.get(), plan); - - free(plan); + ggml_backend_sched_graph_compute(graph.sched, graph.cgraph.get()); } // API function. @@ -46,8 +42,31 @@ bool rwkv_eval(struct rwkv_context * ctx, const uint32_t token, const float * st const size_t n_vocab = header.n_vocab; RWKV_CTX_ASSERT_FALSE_MSG(ctx, RWKV_ERROR_ARGS, token < n_vocab, "Token (%" PRId32 ") is out of range (0 .. 
%zu)", token, n_vocab - 1); + if (!ctx->serial_graph.sched) { + ctx->serial_graph.sched = ggml_backend_sched_new(ctx->model->backends.data(), NULL, ctx->model->backends.size(), RWKV_MAX_NODES, false); + + auto graph = ctx->serial_graph.cgraph.get(); + for (int i = 0; i < graph->n_nodes; i++) { + auto node = graph->nodes[i]; + if (std::string(node->name).find(".in.") != std::string::npos || + std::string(node->name).find(".out.") != std::string::npos) { + ggml_backend_sched_set_tensor_backend(ctx->serial_graph.sched, node, ctx->model->backends.back()); + } + } + for (int i = 0; i < graph->n_leafs; i++) { + auto leaf = graph->leafs[i]; + if (std::string(leaf->name).find("state.in") != std::string::npos || + std::string(leaf->name).find("state.out") != std::string::npos) { + ggml_backend_sched_set_tensor_backend(ctx->serial_graph.sched, leaf, ctx->model->backends.back()); + } + } + ggml_backend_sched_set_tensor_backend(ctx->serial_graph.sched, ctx->serial_graph.tokens, ctx->model->backends.back()); + + ggml_backend_sched_alloc_graph(ctx->serial_graph.sched, ctx->serial_graph.cgraph.get()); + } + rwkv_set_inputs(ctx, ctx->serial_graph, state_in); - ggml_set_i32(ctx->serial_graph.tokens, token); + ggml_backend_tensor_set(ctx->serial_graph.tokens, &token, 0, rwkv_tensor_nbytes(ctx->serial_graph.tokens)); rwkv_eval_graph(ctx->serial_graph, ctx->n_threads, logits_out != NULL); @@ -91,14 +110,41 @@ bool rwkv_eval_sequence( } if (ctx->last_used_sequence_length != sequence_len) { + if (ctx->sequential_graph.sched) { + ggml_backend_sched_free(ctx->sequential_graph.sched); + ctx->sequential_graph.sched = NULL; + } RWKV_ENSURE_OR_FALSE(rwkv_measure_and_build_sequential_context(*ctx->model, ctx->sequential_graph, sequence_len)); ctx->last_used_sequence_length = sequence_len; } if (sequence) { + if (!ctx->sequential_graph.sched) { + ctx->sequential_graph.sched = ggml_backend_sched_new(ctx->model->backends.data(), NULL, ctx->model->backends.size(), RWKV_MAX_NODES, false); + auto graph = ctx->sequential_graph.cgraph.get(); + + for (int i = 0; i < graph->n_nodes; i++) { + auto node = graph->nodes[i]; + if (std::string(node->name).find(".in.") != std::string::npos || + std::string(node->name).find(".out.") != std::string::npos) { + ggml_backend_sched_set_tensor_backend(ctx->sequential_graph.sched, node, ctx->model->backends.back()); + } + } + for (int i = 0; i < graph->n_leafs; i++) { + auto leaf = graph->leafs[i]; + if (std::string(leaf->name).find("state.in") != std::string::npos || + std::string(leaf->name).find("state.out") != std::string::npos) { + ggml_backend_sched_set_tensor_backend(ctx->sequential_graph.sched, leaf, ctx->model->backends.back()); + } + } + ggml_backend_sched_set_tensor_backend(ctx->sequential_graph.sched, ctx->sequential_graph.tokens, ctx->model->backends.back()); + + ggml_backend_sched_alloc_graph(ctx->sequential_graph.sched, ctx->sequential_graph.cgraph.get()); + } + rwkv_set_inputs(ctx, ctx->sequential_graph, state_in); - memcpy(ctx->sequential_graph.tokens->data, sequence, sequence_len * sizeof(uint32_t)); + ggml_backend_tensor_set(ctx->sequential_graph.tokens, sequence, 0, sequence_len * sizeof(uint32_t)); rwkv_eval_graph(ctx->sequential_graph, ctx->n_threads, logits_out != NULL); diff --git a/rwkv_file_format.inc b/rwkv_file_format.inc index 390f7b8d..4c08ea33 100644 --- a/rwkv_file_format.inc +++ b/rwkv_file_format.inc @@ -213,7 +213,7 @@ static bool rwkv_fwrite_tensor(FILE * file, const struct rwkv_tensor & tensor) { // Reading ggml tensors -static bool 
rwkv_fread_ggml_tensor(FILE * file, struct ggml_context * ctx, std::string & name, struct ggml_tensor *& tensor) { +static bool rwkv_fread_ggml_tensor_info(FILE * file, struct ggml_context * ctx, std::string & name, struct ggml_tensor *& tensor) { struct rwkv_tensor_header header; RWKV_ENSURE_OR_FALSE_MSG(rwkv_fread_tensor_header(file, header), "Invalid tensor header"); @@ -242,10 +242,51 @@ static bool rwkv_fread_ggml_tensor(FILE * file, struct ggml_context * ctx, std:: RWKV_ASSERT_FALSE_MSG( RWKV_ERROR_FILE_READ, - rwkv_fread_data(file, rwkv_tensor_nbytes(tensor), tensor->data), + !fseek(file, rwkv_tensor_nbytes(tensor), SEEK_CUR), + "Failed to seek to next tensor after parameter %s", + name.c_str() + ); + + return true; +} + +static bool rwkv_fread_ggml_tensor_data(FILE * file, struct ggml_context * ctx, std::unordered_map & parameters) { + struct rwkv_tensor_header header; + std::string name; + RWKV_ENSURE_OR_FALSE_MSG(rwkv_fread_tensor_header(file, header), "Invalid tensor header"); + + RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE_READ, rwkv_fread_string(file, header.key_length, name), "Failed to read tensor name"); + + enum ggml_type ggml_type = rwkv_type_to_ggml[header.data_type]; + RWKV_ASSERT_FALSE_MSG( + RWKV_ERROR_UNSUPPORTED, + ggml_type != GGML_TYPE_UNKNOWN, + "Unsupported data type %s in parameter %s", + rwkv_type_to_string[header.data_type], + name.c_str() + ); + + struct ggml_tensor * tensor; + tensor = parameters[name]; + RWKV_ASSERT_FALSE_MSG( + RWKV_ERROR_ALLOC, + tensor != NULL, + "Parameter %s not found in the model", + name.c_str() + ); + + char * data = (char *) malloc(rwkv_tensor_nbytes(tensor)); + + RWKV_ASSERT_FALSE_MSG( + RWKV_ERROR_FILE_READ, + rwkv_fread_data(file, rwkv_tensor_nbytes(tensor), data), "Failed to read data of parameter %s", name.c_str() ); + ggml_backend_tensor_set(tensor, data, 0, rwkv_tensor_nbytes(tensor)); + + free(data); + return true; } diff --git a/rwkv_gpu_offload.inc b/rwkv_gpu_offload.inc deleted file mode 100644 index e0b54b1a..00000000 --- a/rwkv_gpu_offload.inc +++ /dev/null @@ -1,62 +0,0 @@ -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - -#if defined(GGML_USE_CUBLAS) -# include "ggml/src/ggml-cuda.h" -#elif defined(GGML_USE_CLBLAST) -# include "ggml/src/ggml-opencl.h" -#endif - -// API function. -bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) { - const auto offload = [&](struct ggml_tensor * tensor) { - // TODO Support multi-GPU - tensor->backend = GGML_BACKEND_GPU; -#if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); -#elif defined(GGML_USE_CLBLAST) - ggml_cl_transform_tensor(tensor->data, tensor); -#endif - }; - - const size_t n_gpu = std::min(n_layers, ctx->model->header.n_layer + 1); - - if (ctx->model->offloaded_layer_count >= n_gpu) { - return false; - } - - for (size_t & i = ctx->model->offloaded_layer_count; i < n_gpu; i++) { - if (i == ctx->model->header.n_layer) { - // This is the index of the model head. - offload(ctx->model->head); - - continue; - } - - const struct rwkv_layer & layer = ctx->model->layers[i]; - - // TODO Also offload other supported operations to GPU - offload(layer.att_key); - offload(layer.att_value); - offload(layer.att_receptance); - offload(layer.att_output); - - if (layer.att_gate != NULL) { - offload(layer.att_gate); - } - - offload(layer.ffn_key); - offload(layer.ffn_value); - offload(layer.ffn_receptance); - } - - return true; -} - -#else - -// API function. 
-bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) { - return false; -} - -#endif diff --git a/rwkv_graph.inc b/rwkv_graph.inc index 80e79e6d..fd24555b 100644 --- a/rwkv_graph.inc +++ b/rwkv_graph.inc @@ -10,12 +10,68 @@ struct rwkv_layer_state { struct ggml_tensor * att_heads; }; +struct rwkv_ggml_cgraph_deleter { + void operator()(struct ggml_cgraph * cgraph) { + if (cgraph->nodes) + free(cgraph->nodes); + if (cgraph->leafs) + free(cgraph->leafs); + if (cgraph->visited_hash_table.keys) + free(cgraph->visited_hash_table.keys); + if (cgraph->grads) + free(cgraph->grads); + free(cgraph); + } +}; + +static struct ggml_cgraph * rwkv_ggml_cgraph_create(size_t size, bool grads) { + struct ggml_cgraph * cgraph = (struct ggml_cgraph *)calloc(1, sizeof(struct ggml_cgraph)); + cgraph->size = size; + cgraph->n_nodes = 0; + cgraph->n_leafs = 0; + cgraph->nodes = (struct ggml_tensor **)calloc(1, size * sizeof(struct ggml_tensor *)); + cgraph->leafs = (struct ggml_tensor **)calloc(1, size * sizeof(struct ggml_tensor *)); + + // next primes after powers of two + static const size_t primes[] = { + 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, + 2053, 4099, 8209, 16411, 32771, 65537, 131101, + 262147, 524309, 1048583, 2097169, 4194319, 8388617, + 16777259, 33554467, 67108879, 134217757, 268435459, + 536870923, 1073741827, 2147483659 + }; + static const size_t n_primes = sizeof(primes)/sizeof(primes[0]); + + // find the smallest prime that is larger or equal to size + size_t l = 0; + size_t r = n_primes; + while (l < r) { + size_t m = (l + r)/2; + if (primes[m] < size * 2) { + l = m + 1; + } else { + r = m; + } + } + size_t hash_size = l < n_primes ? primes[l] : (size * 2 + 1); + + cgraph->visited_hash_table.size = hash_size; + cgraph->visited_hash_table.keys = (struct ggml_tensor **)calloc(1, hash_size * sizeof(struct ggml_tensor *)); + cgraph->order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT; + if (grads) { + cgraph->grads = (struct ggml_tensor **)calloc(1, size * sizeof(struct ggml_tensor *)); + } + return cgraph; +} + + // The computation graph holds ggml context and the ggml cgraph. // It can be either a serial or a sequential graph. struct rwkv_computation_graph { struct ggml_context * ggml_ctx; // ggml_cgraph is so large that it can cause stack overflows if not stored on the heap. - std::unique_ptr cgraph; + std::unique_ptr cgraph; + ggml_backend_sched_t sched; // Input tensors. 
struct ggml_tensor * tokens; @@ -114,7 +170,7 @@ static void rwkv_att_rkv( ); // r = torch.sigmoid(rw @ xr) - r = rwkv_sigmoid_inplace(ctx, ggml_mul_mat(ctx, layer.att_receptance, xr)); + r = ggml_sigmoid_inplace(ctx, ggml_mul_mat(ctx, layer.att_receptance, xr)); // k = kw @ xk k = ggml_mul_mat(ctx, layer.att_key, xk); // v = vw @ xv @@ -141,7 +197,7 @@ static struct ggml_tensor * rwkv_att_wkv( struct ggml_tensor * e2 = rwkv_exp(ctx, ggml_sub(ctx, ww, qq)); // a = e1 * aa + e2 * v - struct ggml_tensor * a = ggml_add_inplace(ctx, ggml_mul(ctx, e1, aa), ggml_mul(ctx, e2, v)); + struct ggml_tensor * a = ggml_add(ctx, ggml_mul(ctx, e1, aa), ggml_mul(ctx, e2, v)); // b = e1 * bb + e2 struct ggml_tensor * b = ggml_add_inplace(ctx, ggml_mul(ctx, e1, bb), e2); @@ -196,8 +252,8 @@ static struct ggml_tensor * rwkv_att_v5( if (sequence_length > 1) { x_prev = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embed, sequence_length); - x_prev = ggml_set_1d_inplace(ctx, x_prev, state.att_xx, 0); - x_prev = ggml_set_1d_inplace( + x_prev = ggml_set_1d(ctx, x_prev, state.att_xx, 0); + x_prev = ggml_set_1d( ctx, x_prev, ggml_view_1d(ctx, x, n_embed * (sequence_length - 1), 0), n_embed * sizeof(float) @@ -239,7 +295,7 @@ static struct ggml_tensor * rwkv_att_v5( struct ggml_tensor * xg = NULL; if (arch_version_minor >= 2) { - xg = ggml_add_inplace( + xg = ggml_add( ctx, ggml_mul(ctx, x, layer.att_time_mix_g), ggml_mul( @@ -300,7 +356,7 @@ static struct ggml_tensor * rwkv_att_v5( // ggml_group_norm considers groups in the third dimension. x = ggml_reshape_4d(ctx, x, 1, 1, n_embed, sequence_length); - x = ggml_group_norm_inplace(ctx, x, head_count); + x = rwkv_group_norm_eps_1e_minus5(ctx, x, head_count); // Convert back to a regular vector. x = ggml_reshape_2d(ctx, x, n_embed, sequence_length); x = ggml_add_inplace( @@ -314,7 +370,7 @@ static struct ggml_tensor * rwkv_att_v5( ); if (arch_version_minor >= 2) { - x = ggml_mul_inplace(ctx, x, g); + x = ggml_mul(ctx, x, g); } return ggml_mul_mat(ctx, layer.att_output, x); @@ -338,8 +394,8 @@ static struct ggml_tensor * rwkv_att_v6( if (sequence_length > 1) { x_prev = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embed, sequence_length); - x_prev = ggml_set_1d_inplace(ctx, x_prev, state.att_xx, 0); - x_prev = ggml_set_1d_inplace( + x_prev = ggml_set_1d(ctx, x_prev, state.att_xx, 0); + x_prev = ggml_set_1d( ctx, x_prev, ggml_view_1d(ctx, x, n_embed * (sequence_length - 1), 0), n_embed * sizeof(float) @@ -383,39 +439,47 @@ static struct ggml_tensor * rwkv_att_v6( xxx ); - xxx = ggml_reshape_2d(ctx, xxx, n_embed * sequence_length, 5); - - struct ggml_tensor * mw = ggml_reshape_2d( + struct ggml_tensor *mw = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * sequence_length); + mw = ggml_reshape_2d( ctx, - ggml_get_rows(ctx, xxx, ggml_new_i32(ctx, 0)), + ggml_set_1d(ctx, mw, ggml_view_1d(ctx, xxx, n_embed * sequence_length, 0), 0), n_embed, sequence_length ); - struct ggml_tensor * mk = ggml_reshape_2d( + + struct ggml_tensor *mk = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * sequence_length); + mk = ggml_reshape_2d( ctx, - ggml_get_rows(ctx, xxx, ggml_new_i32(ctx, 1)), + ggml_set_1d(ctx, mk, ggml_view_1d(ctx, xxx, n_embed * sequence_length, n_embed * sequence_length * sizeof(float)), 0), n_embed, sequence_length ); - struct ggml_tensor * mv = ggml_reshape_2d( + + struct ggml_tensor *mv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * sequence_length); + mv = ggml_reshape_2d( ctx, - ggml_get_rows(ctx, xxx, ggml_new_i32(ctx, 2)), + ggml_set_1d(ctx, mv, ggml_view_1d(ctx, xxx, 
n_embed * sequence_length, n_embed * sequence_length * 2 * sizeof(float)), 0), n_embed, sequence_length ); - struct ggml_tensor * mr = ggml_reshape_2d( + + struct ggml_tensor *mr = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * sequence_length); + mr = ggml_reshape_2d( ctx, - ggml_get_rows(ctx, xxx, ggml_new_i32(ctx, 3)), + ggml_set_1d(ctx, mr, ggml_view_1d(ctx, xxx, n_embed * sequence_length, n_embed * sequence_length * 3 * sizeof(float)), 0), n_embed, sequence_length ); - struct ggml_tensor * mg = ggml_reshape_2d( + + struct ggml_tensor *mg = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * sequence_length); + mg = ggml_reshape_2d( ctx, - ggml_get_rows(ctx, xxx, ggml_new_i32(ctx, 4)), + ggml_set_1d(ctx, mg, ggml_view_1d(ctx, xxx, n_embed * sequence_length, n_embed * sequence_length * 4 * sizeof(float)), 0), n_embed, sequence_length ); + struct ggml_tensor * xw = ggml_add_inplace( ctx, ggml_mul_inplace( ctx, - ggml_add_inplace(ctx, mw, layer.att_time_maa_w), + ggml_add(ctx, mw, layer.att_time_maa_w), x_prev ), x @@ -425,7 +489,7 @@ static struct ggml_tensor * rwkv_att_v6( ctx, ggml_mul_inplace( ctx, - ggml_add_inplace(ctx, mk, layer.att_time_maa_k), + ggml_add(ctx, mk, layer.att_time_maa_k), x_prev ), x @@ -435,7 +499,7 @@ static struct ggml_tensor * rwkv_att_v6( ctx, ggml_mul_inplace( ctx, - ggml_add_inplace(ctx, mv, layer.att_time_maa_v), + ggml_add(ctx, mv, layer.att_time_maa_v), x_prev ), x @@ -445,7 +509,7 @@ static struct ggml_tensor * rwkv_att_v6( ctx, ggml_mul_inplace( ctx, - ggml_add_inplace(ctx, mr, layer.att_time_maa_r), + ggml_add(ctx, mr, layer.att_time_maa_r), x_prev ), x @@ -455,7 +519,7 @@ static struct ggml_tensor * rwkv_att_v6( ctx, ggml_mul_inplace( ctx, - ggml_add_inplace(ctx, mg, layer.att_time_maa_g), + ggml_add(ctx, mg, layer.att_time_maa_g), x_prev ), x @@ -507,12 +571,9 @@ static struct ggml_tensor * rwkv_att_v6( state.att_heads = state_out; - // rwkv/ggml ggml_group_norm uses eps=1e-5, while rwkv v6 uses eps=64e-5 - // Do 1/8 scale to x before group_norm for now. - x = ggml_scale_inplace(ctx, x, ggml_new_f32(ctx, 0.125)); // ggml_group_norm considers groups in the third dimension. x = ggml_reshape_4d(ctx, x, 1, 1, n_embed, sequence_length); - x = ggml_group_norm_inplace(ctx, x, head_count); + x = rwkv_group_norm_eps_64e_minus5(ctx, x, head_count); // Convert back to a regular vector. 
x = ggml_reshape_2d(ctx, x, n_embed, sequence_length); x = ggml_add_inplace( @@ -525,7 +586,7 @@ static struct ggml_tensor * rwkv_att_v6( layer.att_ln_x_bias ); - x = ggml_mul_inplace(ctx, x, g); + x = ggml_mul(ctx, x, g); return ggml_mul_mat(ctx, layer.att_output, x); } @@ -536,21 +597,21 @@ static struct ggml_tensor * rwkv_ffn(struct ggml_context * ctx, struct ggml_tens // xk = x * time_mix_k + state[5 * i + 1] * (1 - time_mix_k) // xk = x * time_mix_k + state[5 * i + 0] * (1 - time_mix_k) - struct ggml_tensor * xk = ggml_add_inplace( + struct ggml_tensor * xk = ggml_add( ctx, ggml_mul(ctx, x, layer.ffn_time_mix_k), ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_k)) ); // xr = x * time_mix_r + state[5 * i + 0] * (1 - time_mix_r) - struct ggml_tensor * xr = ggml_add_inplace( + struct ggml_tensor * xr = ggml_add( ctx, ggml_mul(ctx, x, layer.ffn_time_mix_r), ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_r)) ); // r = torch.sigmoid(rw @ xr) - struct ggml_tensor * r = rwkv_sigmoid_inplace(ctx, ggml_mul_mat(ctx, layer.ffn_receptance, xr)); + struct ggml_tensor * r = ggml_sigmoid_inplace(ctx, ggml_mul_mat(ctx, layer.ffn_receptance, xr)); // k = torch.square(torch.relu(kw @ xk)) struct ggml_tensor * k = ggml_sqr_inplace(ctx, ggml_relu_inplace(ctx, ggml_mul_mat(ctx, layer.ffn_key, xk))); @@ -579,7 +640,7 @@ static struct ggml_tensor * rwkv_ffn_v6(struct ggml_context * ctx, struct ggml_t ); // r = torch.sigmoid(rw @ xr) - struct ggml_tensor * r = rwkv_sigmoid_inplace(ctx, ggml_mul_mat(ctx, layer.ffn_receptance, xr)); + struct ggml_tensor * r = ggml_sigmoid_inplace(ctx, ggml_mul_mat(ctx, layer.ffn_receptance, xr)); // k = torch.square(torch.relu(kw @ xk)) struct ggml_tensor * k = ggml_sqr_inplace(ctx, ggml_relu_inplace(ctx, ggml_mul_mat(ctx, layer.ffn_key, xk))); @@ -614,22 +675,38 @@ static void rwkv_create_input_and_output_views( input_state.ffn_xx = ggml_view_1d(ctx, input, n_embed, n_embed * (i * vectors_per_layer + 0) * sz_float); input_state.att_xx = ggml_view_1d(ctx, input, n_embed, n_embed * (i * vectors_per_layer + 1) * sz_float); input_state.att_heads = ggml_view_1d(ctx, input, att_heads_size, n_embed * (i * vectors_per_layer + 2) * sz_float); + ggml_set_name(input_state.ffn_xx, ("ffn_xx.in." + std::to_string(i)).c_str()); + ggml_set_name(input_state.att_xx, ("att_xx.in." + std::to_string(i)).c_str()); + ggml_set_name(input_state.att_heads, ("att_heads.in." + std::to_string(i)).c_str()); output_state.ffn_xx = ggml_view_1d(ctx, output, n_embed, n_embed * (i * vectors_per_layer + 0) * sz_float); output_state.att_xx = ggml_view_1d(ctx, output, n_embed, n_embed * (i * vectors_per_layer + 1) * sz_float); output_state.att_heads = ggml_view_1d(ctx, output, att_heads_size, n_embed * (i * vectors_per_layer + 2) * sz_float); + ggml_set_name(output_state.ffn_xx, ("ffn_xx.out." + std::to_string(i)).c_str()); + ggml_set_name(output_state.att_xx, ("att_xx.out." + std::to_string(i)).c_str()); + ggml_set_name(output_state.att_heads, ("att_heads.out." 
+ std::to_string(i)).c_str()); } else { input_state.ffn_xx = ggml_view_1d(ctx, input, n_embed, n_embed * (i * 5 + 0) * sz_float); input_state.att_xx = ggml_view_1d(ctx, input, n_embed, n_embed * (i * 5 + 1) * sz_float); input_state.att_aa = ggml_view_1d(ctx, input, n_embed, n_embed * (i * 5 + 2) * sz_float); input_state.att_bb = ggml_view_1d(ctx, input, n_embed, n_embed * (i * 5 + 3) * sz_float); input_state.att_pp = ggml_view_1d(ctx, input, n_embed, n_embed * (i * 5 + 4) * sz_float); + ggml_set_name(input_state.ffn_xx, ("ffn_xx.in." + std::to_string(i)).c_str()); + ggml_set_name(input_state.att_xx, ("att_xx.in." + std::to_string(i)).c_str()); + ggml_set_name(input_state.att_aa, ("att_aa.in." + std::to_string(i)).c_str()); + ggml_set_name(input_state.att_bb, ("att_bb.in." + std::to_string(i)).c_str()); + ggml_set_name(input_state.att_pp, ("att_pp.in." + std::to_string(i)).c_str()); output_state.ffn_xx = ggml_view_1d(ctx, output, n_embed, n_embed * (i * 5 + 0) * sz_float); output_state.att_xx = ggml_view_1d(ctx, output, n_embed, n_embed * (i * 5 + 1) * sz_float); output_state.att_aa = ggml_view_1d(ctx, output, n_embed, n_embed * (i * 5 + 2) * sz_float); output_state.att_bb = ggml_view_1d(ctx, output, n_embed, n_embed * (i * 5 + 3) * sz_float); output_state.att_pp = ggml_view_1d(ctx, output, n_embed, n_embed * (i * 5 + 4) * sz_float); + ggml_set_name(output_state.ffn_xx, ("ffn_xx.out." + std::to_string(i)).c_str()); + ggml_set_name(output_state.att_xx, ("att_xx.out." + std::to_string(i)).c_str()); + ggml_set_name(output_state.att_aa, ("att_aa.out." + std::to_string(i)).c_str()); + ggml_set_name(output_state.att_bb, ("att_bb.out." + std::to_string(i)).c_str()); + ggml_set_name(output_state.att_pp, ("att_pp.out." + std::to_string(i)).c_str()); } } @@ -639,7 +716,7 @@ static void rwkv_create_input_and_output_views( // Creates and sets the input and output ggml tensors, builds the computation graph. static bool rwkv_build_serial_graph(struct rwkv_model & model, struct rwkv_computation_graph & graph) { - graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph()); + graph.cgraph.reset(rwkv_ggml_cgraph_create(RWKV_MAX_NODES, false)); struct rwkv_file_header & header = model.header; const size_t n_vocab = header.n_vocab; @@ -649,7 +726,7 @@ static bool rwkv_build_serial_graph(struct rwkv_model & model, struct rwkv_compu struct ggml_context * ctx = graph.ggml_ctx; // Creates a 1-element tensor. - graph.tokens = ggml_new_i32(ctx, 0); + graph.tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); size_t vectors_per_layer = model.arch_version_major >= 5 ? 
2 + model.head_size : @@ -670,6 +747,12 @@ static bool rwkv_build_serial_graph(struct rwkv_model & model, struct rwkv_compu graph.logits = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + ggml_set_input(input); + ggml_set_output(output); + ggml_set_name(input, "state.in"); + ggml_set_name(output, "state.out"); + ggml_set_input(graph.tokens); + // x = self.w.emb.weight[token] struct ggml_tensor * x = ggml_get_rows(ctx, model.emb, graph.tokens); @@ -682,7 +765,7 @@ static bool rwkv_build_serial_graph(struct rwkv_model & model, struct rwkv_compu struct rwkv_layer_state state = inputs[i]; if (model.arch_version_major == 6) { - x = ggml_add_inplace(ctx, x, rwkv_att_v6( + x = ggml_add(ctx, x, rwkv_att_v6( ctx, x, layer, @@ -692,10 +775,10 @@ static bool rwkv_build_serial_graph(struct rwkv_model & model, struct rwkv_compu model.arch_version_minor )); - x = ggml_add_inplace(ctx, x, rwkv_ffn_v6(ctx, x, layer, state)); + x = ggml_add(ctx, x, rwkv_ffn_v6(ctx, x, layer, state)); } else { x = model.arch_version_major >= 5 ? - ggml_add_inplace(ctx, x, rwkv_att_v5( + ggml_add(ctx, x, rwkv_att_v5( ctx, x, layer, @@ -704,9 +787,9 @@ static bool rwkv_build_serial_graph(struct rwkv_model & model, struct rwkv_compu model.head_size, model.arch_version_minor )) : - ggml_add_inplace(ctx, x, rwkv_att(ctx, x, layer, state)); + ggml_add(ctx, x, rwkv_att(ctx, x, layer, state)); - x = ggml_add_inplace(ctx, x, rwkv_ffn(ctx, x, layer, state)); + x = ggml_add(ctx, x, rwkv_ffn(ctx, x, layer, state)); } struct rwkv_layer_state & output_state = outputs[i]; @@ -755,23 +838,10 @@ static bool rwkv_measure_and_build_serial_context(struct rwkv_model & model, str graph.ggml_ctx = NULL; } - // 1. Measure the space required for the ggml context. graph.ggml_ctx = rwkv_init_ggml_context(rwkv_ggml_overhead(), true); RWKV_ENSURE_OR_FALSE(rwkv_build_serial_graph(model, graph)); - size_t required_context_size = ggml_total_size_for_tensor_data(graph.ggml_ctx) + - // With the node limit set 80K, this overhead would be 28 MB. - + rwkv_ggml_overhead() - + tensor_alignment; - - ggml_free(graph.ggml_ctx); - - // 2. Create the real ggml context. - graph.ggml_ctx = rwkv_init_ggml_context(required_context_size, false); - - RWKV_ENSURE_OR_FALSE(rwkv_build_serial_graph(model, graph)); - return true; } @@ -779,7 +849,7 @@ static bool rwkv_measure_and_build_serial_context(struct rwkv_model & model, str // Creates and sets the input and output ggml tensors, builds the computation graph. 
static bool rwkv_build_sequential_graph(struct rwkv_model & model, struct rwkv_computation_graph & graph, const size_t sequence_length) { - graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph()); + graph.cgraph.reset(rwkv_ggml_cgraph_create(RWKV_MAX_NODES, false)); struct rwkv_file_header & header = model.header; const size_t n_vocab = header.n_vocab; @@ -809,6 +879,12 @@ static bool rwkv_build_sequential_graph(struct rwkv_model & model, struct rwkv_c graph.logits = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + ggml_set_input(input); + ggml_set_output(output); + ggml_set_name(input, "state.in"); + ggml_set_name(output, "state.out"); + ggml_set_input(graph.tokens); + // x = self.w.emb.weight[token] struct ggml_tensor * x = ggml_get_rows(ctx, model.emb, graph.tokens); @@ -821,7 +897,7 @@ static bool rwkv_build_sequential_graph(struct rwkv_model & model, struct rwkv_c struct rwkv_layer_state state = inputs[i]; if (model.arch_version_major == 6) { - x = ggml_add_inplace(ctx, x, rwkv_att_v6( + x = ggml_add(ctx, x, rwkv_att_v6( ctx, x, layer, @@ -831,7 +907,7 @@ static bool rwkv_build_sequential_graph(struct rwkv_model & model, struct rwkv_c model.arch_version_minor )); } else if (model.arch_version_major >= 5) { - x = ggml_add_inplace(ctx, x, rwkv_att_v5( + x = ggml_add(ctx, x, rwkv_att_v5( ctx, x, layer, @@ -854,22 +930,24 @@ static bool rwkv_build_sequential_graph(struct rwkv_model & model, struct rwkv_c struct ggml_tensor * vt = ggml_view_1d(ctx, v, n_embed, n_embed * sizeof(float) * t); struct ggml_tensor * xt = ggml_view_1d(ctx, x_prev, n_embed, n_embed * sizeof(float) * t); struct ggml_tensor * wkv = rwkv_att_wkv(ctx, layer.att_time_first, layer.att_time_decay, kt, vt, state.att_aa, state.att_bb, state.att_pp); - ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, wkv, xt)); + xt = ggml_set_1d_inplace(ctx, xt, wkv, 0); + ggml_build_forward_expand(graph.cgraph.get(), xt); } - x = ggml_add_inplace(ctx, x, ggml_mul_mat(ctx, layer.att_output, ggml_mul(ctx, r, x_prev))); + x = ggml_add(ctx, x, ggml_mul_mat(ctx, layer.att_output, ggml_mul(ctx, r, x_prev))); } // TODO Can we skip ffn for all but the last token, the same way we skip unembedding? if (model.arch_version_major == 6) { - x = ggml_add_inplace(ctx, x, rwkv_ffn_v6(ctx, x, layer, state)); + x = ggml_add(ctx, x, rwkv_ffn_v6(ctx, x, layer, state)); } else { - x = ggml_add_inplace(ctx, x, rwkv_ffn(ctx, x, layer, state)); + x = ggml_add(ctx, x, rwkv_ffn(ctx, x, layer, state)); } struct rwkv_layer_state & output_state = outputs[i]; - ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.att_xx, output_state.att_xx)); + output_state.att_xx = ggml_set_1d_inplace(ctx, output_state.att_xx, state.att_xx, 0); + ggml_build_forward_expand(graph.cgraph.get(), output_state.att_xx); ggml_build_forward_expand(graph.cgraph.get(), ggml_cpy(ctx, state.ffn_xx, output_state.ffn_xx)); if (model.arch_version_major >= 5) { @@ -910,22 +988,9 @@ static bool rwkv_measure_and_build_sequential_context(struct rwkv_model & model, graph.ggml_ctx = NULL; } - // 1. Measure the space required for the ggml context. graph.ggml_ctx = rwkv_init_ggml_context(rwkv_ggml_overhead(), true); RWKV_ENSURE_OR_FALSE(rwkv_build_sequential_graph(model, graph, sequence_length)); - size_t required_context_size = ggml_total_size_for_tensor_data(graph.ggml_ctx) + - // With the node limit set 80K, this overhead would be 28 MB. - + rwkv_ggml_overhead() - + tensor_alignment; - - ggml_free(graph.ggml_ctx); - - // 2. Create the real ggml context. 
- graph.ggml_ctx = rwkv_init_ggml_context(required_context_size, false); - - RWKV_ENSURE_OR_FALSE(rwkv_build_sequential_graph(model, graph, sequence_length)); - return true; } diff --git a/rwkv_model_loading.inc b/rwkv_model_loading.inc index ba58acb5..ae14c45c 100644 --- a/rwkv_model_loading.inc +++ b/rwkv_model_loading.inc @@ -58,6 +58,10 @@ struct rwkv_model { // It must not be used for computations. struct ggml_context * ggml_ctx; + std::vector backends; + std::vector buffers_w; + std::vector tallocrs; + struct rwkv_file_header header; uint32_t arch_version_major; uint32_t arch_version_minor; @@ -100,10 +104,14 @@ struct rwkv_file { // https://stackoverflow.com/a/6458689 template -static bool rwkv_set_params(struct rwkv_model & model, F callback) { - RWKV_ENSURE_OR_FALSE(callback("emb.weight", model.emb)); - RWKV_ENSURE_OR_FALSE(callback("blocks.0.ln0.weight", model.ln0_weight)); - RWKV_ENSURE_OR_FALSE(callback("blocks.0.ln0.bias", model.ln0_bias)); +static bool rwkv_set_params(struct rwkv_model & model, F callback, const uint32_t n_gpu_layers) { + const size_t n_gpu = std::min(n_gpu_layers, model.header.n_layer + 1); + bool offload_head = n_gpu == (model.header.n_layer + 1); + bool offload_default = false; + + RWKV_ENSURE_OR_FALSE(callback("emb.weight", model.emb, offload_default)); + RWKV_ENSURE_OR_FALSE(callback("blocks.0.ln0.weight", model.ln0_weight, (n_gpu_layers > 0))); + RWKV_ENSURE_OR_FALSE(callback("blocks.0.ln0.bias", model.ln0_bias, (n_gpu_layers > 0))); uint32_t n_layer = model.header.n_layer; std::unique_ptr layers(new(std::nothrow) struct rwkv_layer[n_layer]()); @@ -111,87 +119,96 @@ static bool rwkv_set_params(struct rwkv_model & model, F callback) { model.layers = std::move(layers); for (uint32_t i = 0; i < n_layer; i++) { + bool offload_layer = (i < n_gpu); char buffer[128]; size_t offset = sprintf(buffer, "blocks.%" PRId32 ".", i); rwkv_layer & layer = model.layers[i]; - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln1.weight"), buffer), layer.ln1_weight)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln1.bias"), buffer), layer.ln1_bias)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln1.weight"), buffer), layer.ln1_weight, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln1.bias"), buffer), layer.ln1_bias, offload_layer)); if (model.arch_version_major == 6) { - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_x"), buffer), layer.att_time_maa_x)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_w"), buffer), layer.att_time_maa_w)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_k"), buffer), layer.att_time_maa_k)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_v"), buffer), layer.att_time_maa_v)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_r"), buffer), layer.att_time_maa_r)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_g"), buffer), layer.att_time_maa_g)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_w1"), buffer), layer.att_time_maa_w1)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_w2"), buffer), layer.att_time_maa_w2)); - - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_faaaa"), buffer), layer.att_time_faaaa)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_decay"), buffer), layer.att_time_decay)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], 
"att.time_decay_w1"), buffer), layer.att_time_decay_w1)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_decay_w2"), buffer), layer.att_time_decay_w2)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.key.weight"), buffer), layer.att_key)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.value.weight"), buffer), layer.att_value)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.receptance.weight"), buffer), layer.att_receptance)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.gate.weight"), buffer), layer.att_gate)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.output.weight"), buffer), layer.att_output)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.ln_x.weight"), buffer), layer.att_ln_x_weight)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.ln_x.bias"), buffer), layer.att_ln_x_bias)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_x"), buffer), layer.att_time_maa_x, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_w"), buffer), layer.att_time_maa_w, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_k"), buffer), layer.att_time_maa_k, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_v"), buffer), layer.att_time_maa_v, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_r"), buffer), layer.att_time_maa_r, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_g"), buffer), layer.att_time_maa_g, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_w1"), buffer), layer.att_time_maa_w1, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_maa_w2"), buffer), layer.att_time_maa_w2, offload_layer)); + + // No gpu offloading for wkv yet + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_faaaa"), buffer), layer.att_time_faaaa, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_decay"), buffer), layer.att_time_decay, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_decay_w1"), buffer), layer.att_time_decay_w1, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_decay_w2"), buffer), layer.att_time_decay_w2, offload_default)); + + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.key.weight"), buffer), layer.att_key, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.value.weight"), buffer), layer.att_value, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.receptance.weight"), buffer), layer.att_receptance, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.gate.weight"), buffer), layer.att_gate, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.output.weight"), buffer), layer.att_output, offload_layer)); + + // GroupNorm uses a custom epsilon value, which only has CPU implementation for now. 
+ RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.ln_x.weight"), buffer), layer.att_ln_x_weight, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.ln_x.bias"), buffer), layer.att_ln_x_bias, offload_default)); } else { - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_mix_k"), buffer), layer.att_time_mix_k)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_mix_v"), buffer), layer.att_time_mix_v)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_mix_r"), buffer), layer.att_time_mix_r)); + // Custom rwkv_1_minus_x: cpu only + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_mix_k"), buffer), layer.att_time_mix_k, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_mix_v"), buffer), layer.att_time_mix_v, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_mix_r"), buffer), layer.att_time_mix_r, offload_default)); if (model.arch_version_major >= 5 && model.arch_version_minor >= 2) { - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_faaaa"), buffer), layer.att_time_faaaa)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_faaaa"), buffer), layer.att_time_faaaa, offload_default)); } else { - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_first"), buffer), layer.att_time_first)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_first"), buffer), layer.att_time_first, offload_default)); } - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_decay"), buffer), layer.att_time_decay)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.key.weight"), buffer), layer.att_key)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.value.weight"), buffer), layer.att_value)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.receptance.weight"), buffer), layer.att_receptance)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.output.weight"), buffer), layer.att_output)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_decay"), buffer), layer.att_time_decay, offload_default)); + + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.key.weight"), buffer), layer.att_key, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.value.weight"), buffer), layer.att_value, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.receptance.weight"), buffer), layer.att_receptance, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.output.weight"), buffer), layer.att_output, offload_layer)); if (model.arch_version_major >= 5) { - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.ln_x.weight"), buffer), layer.att_ln_x_weight)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.ln_x.bias"), buffer), layer.att_ln_x_bias)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.ln_x.weight"), buffer), layer.att_ln_x_weight, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.ln_x.bias"), buffer), layer.att_ln_x_bias, offload_default)); if (model.arch_version_minor >= 2) { - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_mix_g"), buffer), layer.att_time_mix_g)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.gate.weight"), buffer), layer.att_gate)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.time_mix_g"), 
buffer), layer.att_time_mix_g, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "att.gate.weight"), buffer), layer.att_gate, offload_layer)); } } } - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln2.weight"), buffer), layer.ln2_weight)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln2.bias"), buffer), layer.ln2_bias)); if (model.arch_version_major == 6) { - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.time_maa_k"), buffer), layer.ffn_time_maa_k)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.time_maa_r"), buffer), layer.ffn_time_maa_r)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln2.weight"), buffer), layer.ln2_weight, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln2.bias"), buffer), layer.ln2_bias, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.time_maa_k"), buffer), layer.ffn_time_maa_k, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.time_maa_r"), buffer), layer.ffn_time_maa_r, offload_layer)); } else { - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.time_mix_k"), buffer), layer.ffn_time_mix_k)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.time_mix_r"), buffer), layer.ffn_time_mix_r)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln2.weight"), buffer), layer.ln2_weight, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ln2.bias"), buffer), layer.ln2_bias, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.time_mix_k"), buffer), layer.ffn_time_mix_k, offload_default)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.time_mix_r"), buffer), layer.ffn_time_mix_r, offload_default)); } - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.key.weight"), buffer), layer.ffn_key)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.value.weight"), buffer), layer.ffn_value)); - RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.receptance.weight"), buffer), layer.ffn_receptance)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.key.weight"), buffer), layer.ffn_key, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.value.weight"), buffer), layer.ffn_value, offload_layer)); + RWKV_ENSURE_OR_FALSE(callback((strcpy(&buffer[offset], "ffn.receptance.weight"), buffer), layer.ffn_receptance, offload_layer)); } - RWKV_ENSURE_OR_FALSE(callback("ln_out.weight", model.ln_out_weight)); - RWKV_ENSURE_OR_FALSE(callback("ln_out.bias", model.ln_out_bias)); - RWKV_ENSURE_OR_FALSE(callback("head.weight", model.head)); + RWKV_ENSURE_OR_FALSE(callback("ln_out.weight", model.ln_out_weight, offload_head)); + RWKV_ENSURE_OR_FALSE(callback("ln_out.bias", model.ln_out_bias, offload_head)); + RWKV_ENSURE_OR_FALSE(callback("head.weight", model.head, offload_head)); return true; } // Creates a ggml context and loads all parameter tensors from a model file. 
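For reference, the offload bookkeeping used by the callback above boils down to a few comparisons: the head is offloaded only when all layers plus one are requested, and layer i goes to the GPU only when i < n_gpu. This tiny standalone program (the helper name is hypothetical; the real logic lives inside rwkv_set_params) walks through two example configurations:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void print_offload_plan(uint32_t n_layer, uint32_t n_gpu_layers) {
    uint32_t n_gpu = n_gpu_layers < n_layer + 1 ? n_gpu_layers : n_layer + 1;
    bool offload_head = (n_gpu == n_layer + 1);

    printf("layers on GPU: %u of %u, head on GPU: %s\n",
        n_gpu < n_layer ? n_gpu : n_layer, n_layer, offload_head ? "yes" : "no");

    for (uint32_t i = 0; i < n_layer; i++) {
        bool offload_layer = (i < n_gpu);
        printf("  block %u -> %s\n", i, offload_layer ? "GPU" : "CPU");
    }
}

int main(void) {
    print_offload_plan(/*n_layer=*/24, /*n_gpu_layers=*/25); // all 24 layers + head on GPU
    print_offload_plan(/*n_layer=*/24, /*n_gpu_layers=*/10); // first 10 layers on GPU, head on CPU
    return 0;
}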
-static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model & model) { +static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model & model, const uint32_t n_gpu_layers) { struct stat file_stat; std::unordered_map parameters; @@ -204,17 +221,20 @@ static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE, rwkv_fread_file_header(file.file, model.header), "Invalid file header"); model.ggml_ctx = rwkv_init_ggml_context( - // ggml tensors must be aligned; assuming here that overhead of parameter headers, included in the file size, will account for that. - file_stat.st_size + rwkv_ggml_overhead(), - false + rwkv_ggml_overhead(), + true // no-alloc; allocate tensors in different backend buffers later ); std::string name; struct ggml_tensor * tensor; + // Read all tensor information from the file first. + auto tensors_file_start = ftell(file.file); while ((size_t) ftell(file.file) < (size_t) file_stat.st_size) { - RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_ggml_tensor(file.file, model.ggml_ctx, name, tensor), "Failed to read a model parameter"); + RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS, + rwkv_fread_ggml_tensor_info(file.file, model.ggml_ctx, name, tensor), // dry_run = true + "Failed to read a model parameter"); parameters[std::move(name)] = tensor; } @@ -237,17 +257,67 @@ static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model model.arch_version_minor = 0; } + size_t cpu_buffer_size = 0; + size_t gpu_buffer_size = 0; std::unordered_map & parameters_ref = parameters; + // Calculate buffer sizes for each backend. RWKV_ASSERT_NULL(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, rwkv_set_params( model, - [&](const char * key, struct ggml_tensor *& dest) { + [&](const char * key, struct ggml_tensor *& dest, bool offload_gpu) { struct ggml_tensor * tensor = parameters_ref[key]; RWKV_ENSURE_OR_FALSE_MSG(tensor, "Model parameter %s not found", key); + if (offload_gpu && n_gpu_layers) + gpu_buffer_size += ggml_nbytes(tensor); + else + cpu_buffer_size += ggml_nbytes(tensor); dest = tensor; return true; - } + }, + n_gpu_layers + )); + + cpu_buffer_size += ggml_tensor_overhead() * RWKV_MAX_NODES; + if (n_gpu_layers) { + gpu_buffer_size += ggml_tensor_overhead() * RWKV_MAX_NODES; + } + + // Allocate buffers for each backend. + if (n_gpu_layers) { + ggml_backend_t backend_gpu = model.backends.front(); + ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_buffer(backend_gpu, gpu_buffer_size); + ggml_backend_buffer_set_usage(gpu_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.buffers_w.push_back(gpu_buffer); + model.tallocrs.push_back(ggml_tallocr_new(gpu_buffer)); + } + + ggml_backend_t backend_cpu = model.backends.back(); + ggml_backend_buffer_t cpu_buffer = ggml_backend_alloc_buffer(backend_cpu, cpu_buffer_size); + ggml_backend_buffer_set_usage(cpu_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.buffers_w.push_back(cpu_buffer); + model.tallocrs.push_back(ggml_tallocr_new(cpu_buffer)); + + // Allocate tensors in backend buffers. + RWKV_ASSERT_NULL(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, rwkv_set_params( + model, + [&](const char * key, struct ggml_tensor *& dest, bool offload_gpu) { + struct ggml_tensor * tensor = parameters_ref[key]; + RWKV_ENSURE_OR_FALSE_MSG(tensor, "Model parameter %s not found", key); + ggml_tallocr * alloc = offload_gpu ? 
&model.tallocrs.front() : &model.tallocrs.back(); + ggml_tallocr_alloc(alloc, tensor); + dest = tensor; + return true; + }, + n_gpu_layers )); + // Read tensor data. + fseek(file.file, tensors_file_start, SEEK_SET); + while ((size_t) ftell(file.file) < (size_t) file_stat.st_size) { + RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS, + rwkv_fread_ggml_tensor_data(file.file, model.ggml_ctx, parameters_ref), + "Failed to read a model parameter"); + } + if (model.arch_version_major >= 5) { model.head_count = model.layers[0].att_time_decay->ne[2]; model.head_size = model.layers[0].ln1_weight->ne[0] / model.head_count; @@ -255,7 +325,8 @@ static bool rwkv_load_model_from_file(const char * file_path, struct rwkv_model // Verify order of dimensions. struct ggml_tensor * emb = model.emb; - RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_SHAPE, emb->n_dims == 2, "Unexpected dimension count of embedding matrix %d", emb->n_dims); + int n_dims = ggml_n_dims(emb); + RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_SHAPE, n_dims == 2, "Unexpected dimension count of embedding matrix %d", n_dims); RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_DIMENSION, emb->ne[0] == model.header.n_embed, "Unexpected dimension of embedding matrix %" PRId64, emb->ne[0]); RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_DIMENSION, emb->ne[1] == model.header.n_vocab, "Unexpected dimension of embedding matrix %" PRId64, emb->ne[1]); diff --git a/rwkv_operators.inc b/rwkv_operators.inc index af808e74..0f87bc51 100644 --- a/rwkv_operators.inc +++ b/rwkv_operators.inc @@ -15,10 +15,12 @@ static void rwkv_exp_impl(struct ggml_tensor * dest, const struct ggml_tensor * rwkv_validate_tensors_for_custom_unary_op(dest, src); int64_t element_count = src->ne[0] * src->ne[1]; + int64_t start = ith * element_count / nth; + int64_t end = (ith + 1) * element_count / nth; float * src_data = (float *) src->data; float * dest_data = (float *) dest->data; - for (int64_t i = 0; i < element_count; i++) { + for (int64_t i = start; i < end; i++) { dest_data[i] = expf(src_data[i]); } @@ -29,30 +31,18 @@ static void rwkv_1_minus_x_impl(struct ggml_tensor * dest, const struct ggml_ten rwkv_validate_tensors_for_custom_unary_op(dest, src); int64_t element_count = src->ne[0] * src->ne[1]; + int64_t start = ith * element_count / nth; + int64_t end = (ith + 1) * element_count / nth; float * src_data = (float *) src->data; float * dest_data = (float *) dest->data; - for (int64_t i = 0; i < element_count; i++) { + for (int64_t i = start; i < end; i++) { dest_data[i] = 1.0F - src_data[i]; } SUPPRESS_UNUSED_WARNINGS_IN_CUSTOM_OP(); } -static void rwkv_sigmoid_impl(struct ggml_tensor * dest, const struct ggml_tensor * src, int ith, int nth, void * userdata) { - rwkv_validate_tensors_for_custom_unary_op(dest, src); - - int64_t element_count = src->ne[0] * src->ne[1]; - float * src_data = (float *) src->data; - float * dest_data = (float *) dest->data; - - for (int64_t i = 0; i < element_count; i++) { - dest_data[i] = 1.0F / (1.0F + expf(-src_data[i])); - } - - SUPPRESS_UNUSED_WARNINGS_IN_CUSTOM_OP(); -} - static void rwkv_max_impl( struct ggml_tensor * dest, const struct ggml_tensor * src0, @@ -74,20 +64,101 @@ static void rwkv_max_impl( GGML_ASSERT(dest->ne[3] == 1); int64_t element_count = src0->ne[0] * src0->ne[1]; + int64_t start = ith * element_count / nth; + int64_t end = (ith + 1) * element_count / nth; float * src0_data = (float *) src0->data; float * src1_data = (float *) src1->data; float * dest_data = (float *) 
dest->data; - for (int64_t i = 0; i < element_count; i++) { + for (int64_t i = start; i < end; i++) { dest_data[i] = fmaxf(src0_data[i], src1_data[i]); } SUPPRESS_UNUSED_WARNINGS_IN_CUSTOM_OP(); } +// From ggml.c +static void rwkv_groupnorm_impl( + struct ggml_tensor * dst, + const struct ggml_tensor * src0, + int ith, + int nth, + void * userdata +) { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + const float eps = ((float*)userdata)[0]; + const int n_groups = ((int32_t*)userdata)[1]; + + int n_channels = src0->ne[2]; + int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; + for (int i = ith; i < n_groups; i += nth) { + int start = i * n_channels_per_group; + int end = start + n_channels_per_group; + if (end > n_channels) { + end = n_channels; + } + int step = end - start; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + float sum = 0.0; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + float sumr = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sumr += (float)x[i00]; + } + sum += sumr; + } + } + const float mean = sum / (ne00 * ne01 * step); + + float sum2 = 0.0; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + + float sumr = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sumr += (float)(v * v); + } + sum2 += sumr; + } + } + const float variance = sum2 / (ne00 * ne01 * step); + const float scale = 1.0f / sqrtf(variance + eps); + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + for (int i00 = 0; i00 < ne00; i00++) { + y[i00] *= scale; + } + } + } + } + } + + SUPPRESS_UNUSED_WARNINGS_IN_CUSTOM_OP(); +} + // Element-wise exp(x) struct ggml_tensor * rwkv_exp(struct ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_custom1(ctx, x, rwkv_exp_impl, 1, NULL); + return ggml_map_custom1_inplace(ctx, x, rwkv_exp_impl, 1, NULL); } // Element-wise 1 - x @@ -95,18 +166,28 @@ struct ggml_tensor * rwkv_1_minus_x(struct ggml_context * ctx, struct ggml_tenso return ggml_map_custom1(ctx, x, rwkv_1_minus_x_impl, 1, NULL); } -// Element-wise sigmoid(x) -struct ggml_tensor * rwkv_sigmoid_inplace(struct ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_custom1_inplace(ctx, x, rwkv_sigmoid_impl, 1, NULL); -} - // Element-wise max(x, y) struct ggml_tensor * rwkv_max(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y) { return ggml_map_custom2(ctx, x, y, rwkv_max_impl, 1, NULL); } +// GroupNorm with custom eps value; Remove when ggml_norm supports eps as an argument. 
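Two mechanics recur in the custom operators above: the [start, end) split derived from ith/nth, and passing extra scalars (such as eps and the group count) through the userdata pointer. A self-contained toy op built on ggml_map_custom1, assuming the ggml revision this PR targets; the op itself is invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "ggml.h"

static void add_bias_impl(struct ggml_tensor * dest, const struct ggml_tensor * src, int ith, int nth, void * userdata) {
    const float bias = ((const float *) userdata)[0];

    int64_t element_count = ggml_nelements(src);
    int64_t start = ith * element_count / nth;
    int64_t end = (ith + 1) * element_count / nth;

    const float * src_data = (const float *) src->data;
    float * dest_data = (float *) dest->data;

    // Each of the nth threads touches only its own slice of the elements.
    for (int64_t i = start; i < end; i++) {
        dest_data[i] = src_data[i] + bias;
    }
}

int main(void) {
    struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    for (int i = 0; i < 8; i++) ((float *) x->data)[i] = (float) i;

    static float op_params[1] = { 100.0F }; // scalar smuggled in through userdata

    // GGML_N_TASKS_MAX asks ggml to run the op on all available compute threads.
    struct ggml_tensor * y = ggml_map_custom1(ctx, x, add_bias_impl, GGML_N_TASKS_MAX, op_params);

    struct ggml_cgraph * graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, y);

    struct ggml_cplan plan = ggml_graph_plan(graph, 2);
    uint8_t * work_data = plan.work_size > 0 ? (uint8_t *) malloc(plan.work_size) : NULL;
    plan.work_data = work_data;
    ggml_graph_compute(graph, &plan);
    free(work_data);

    for (int i = 0; i < 8; i++) printf("%.0f ", ((float *) y->data)[i]); // 100 101 ... 107
    printf("\n");

    ggml_free(ctx);
    return 0;
}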
+struct ggml_tensor * rwkv_group_norm_eps_1e_minus5(struct ggml_context * ctx, struct ggml_tensor * x, int n_groups) { + static float params[2]; + params[0] = 1e-5F; + ((int*)params)[1] = n_groups; + return ggml_map_custom1_inplace(ctx, x, rwkv_groupnorm_impl, 1, params); +} + +struct ggml_tensor * rwkv_group_norm_eps_64e_minus5(struct ggml_context * ctx, struct ggml_tensor * x, int n_groups) { + static float params[2]; + params[0] = 64e-5F; + ((int*)params)[1] = n_groups; + return ggml_map_custom1_inplace(ctx, x, rwkv_groupnorm_impl, 1, params); +} + struct ggml_tensor * rwkv_layer_norm(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) { // LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias` // Looks like ggml_norm does the first part, we only need to apply weight & bias. - return ggml_add_inplace(ctx, ggml_mul_inplace(ctx, ggml_norm(ctx, x, 1e-5F), weight), bias); + return ggml_add(ctx, ggml_mul(ctx, ggml_norm(ctx, x, 1e-5F), weight), bias); } diff --git a/rwkv_operators_wkv_v5.inc b/rwkv_operators_wkv_v5.inc index 570d4854..4c38531c 100644 --- a/rwkv_operators_wkv_v5.inc +++ b/rwkv_operators_wkv_v5.inc @@ -6,6 +6,10 @@ static void rwkv_wkv_v5_impl(struct ggml_tensor * result, const struct ggml_tens const size_t T = result->ne[1]; const size_t C = result->ne[0]; const size_t H = result->src[1]->ne[2]; + + // TODO: Multi-threading. + if (ith != 0) + return; float * result_data = (float *) result->data; @@ -65,7 +69,6 @@ static void rwkv_wkv_v5_impl(struct ggml_tensor * result, const struct ggml_tens // Suppress "unused parameter" warnings. (void) src; - (void) ith; (void) nth; (void) userdata; } @@ -123,9 +126,9 @@ static struct ggml_tensor * rwkv_wkv_v5( GGML_ASSERT(r->ne[0] == S && r->ne[1] == 1 && r->ne[2] == H && r->ne[3] == T); GGML_ASSERT(ggml_nelements(state) == S * S * H); - k = ggml_cont_inplace(ctx, ggml_transpose(ctx, k)); - v = ggml_cont_inplace(ctx, ggml_transpose(ctx, v)); - r = ggml_cont_inplace(ctx, ggml_transpose(ctx, r)); + k = ggml_transpose(ctx, k); + v = ggml_transpose(ctx, v); + r = ggml_transpose(ctx, r); struct ggml_tensor * result = ggml_map_custom1( ctx, @@ -139,7 +142,6 @@ static struct ggml_tensor * rwkv_wkv_v5( result->src[3] = r; result->src[4] = time_f; result->src[5] = time_decay; - // GGML_MAX_SRC must be increased from 6 to 8 for this. result->src[6] = state; return result; diff --git a/rwkv_operators_wkv_v6.inc b/rwkv_operators_wkv_v6.inc index 0cc3e131..a89bdaf6 100644 --- a/rwkv_operators_wkv_v6.inc +++ b/rwkv_operators_wkv_v6.inc @@ -7,6 +7,10 @@ static void rwkv_wkv_v6_impl(struct ggml_tensor * result, const struct ggml_tens const size_t C = result->ne[0]; const size_t H = result->src[1]->ne[2]; + // TODO: Multi-threading. 
+ if (ith != 0) + return; + float * result_data = (float *) result->data; memset(result_data, 0, T * C * sizeof(float)); @@ -123,9 +127,9 @@ static struct ggml_tensor * rwkv_wkv_v6( GGML_ASSERT(w->ne[0] == 1 && w->ne[1] == S && w->ne[2] == H && w->ne[3] == T); GGML_ASSERT(ggml_nelements(state) == S * S * H); - k = ggml_cont_inplace(ctx, ggml_transpose(ctx, k)); - v = ggml_cont_inplace(ctx, ggml_transpose(ctx, v)); - r = ggml_cont_inplace(ctx, ggml_transpose(ctx, r)); + k = ggml_transpose(ctx, k); + v = ggml_transpose(ctx, v); + r = ggml_transpose(ctx, r); struct ggml_tensor * result = ggml_map_custom1( ctx, @@ -139,7 +143,6 @@ static struct ggml_tensor * rwkv_wkv_v6( result->src[3] = r; result->src[4] = time_faaaa; result->src[5] = w; - // GGML_MAX_SRC must be increased from 6 to 8 for this. result->src[6] = state; return result; diff --git a/rwkv_quantize.inc b/rwkv_quantize.inc index b3ddfc9c..c8fc2227 100644 --- a/rwkv_quantize.inc +++ b/rwkv_quantize.inc @@ -136,18 +136,12 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const ggml_fp16_to_fp32_row((const ggml_fp16_t *) out_buf, (float *) in_buf, nelements); } - int64_t hist_cur[16] {}; - new_size = ggml_quantize_chunk(out_type, (const float *) in_buf, out_buf, 0, nelements, hist_cur); + new_size = ggml_quantize_chunk(out_type, (const float *) in_buf, out_buf, 0, header.size1, header.size0, NULL); header.data_type = rwkv_type_from_ggml[out_type]; data = out_buf; RWKV_MSG("size = %8.2f MB -> %8.2f MB | hist: ", orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); - for (int i = 0; i < 16; i++) { - RWKV_MSG("%5.3f ", hist_cur[i] / (float) nelements); - hist_all[i] += hist_cur[i]; - } - RWKV_MSG("\n"); } else { RWKV_MSG("size = %8.3f MB\n", orig_size / 1024.0 / 1024.0); @@ -162,19 +156,6 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const RWKV_MSG("quantized size = %8.2f MB\n", new_total_size / 1024.0 / 1024.0); RWKV_MSG("compression ratio = %8.2f\n", orig_total_size / float(new_total_size)); - int64_t sum_all = 0; - - for (int i = 0; i < 16; i++) { - sum_all += hist_all[i]; - } - - RWKV_MSG("hist: "); - - for (int i = 0; i < 16; ++i) { - printf("%5.3f ", hist_all[i] / float(sum_all)); - } - - RWKV_MSG("\n"); return true; } diff --git a/rwkv_utilities.inc b/rwkv_utilities.inc index 44be324a..1108091f 100644 --- a/rwkv_utilities.inc +++ b/rwkv_utilities.inc @@ -10,7 +10,7 @@ static size_t rwkv_tensor_nbytes(const struct ggml_tensor * tensor) { // Minimum amount of memory required for a ggml context, not counting the tensor data. 
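The quantizer now calls ggml_quantize_chunk with a row count and an elements-per-row count instead of a flat element count, drops the histogram array, and passes an optional importance matrix as the last argument. A minimal standalone sketch of the new call (sizes are arbitrary; Q5_0 blocks hold 32 elements, so n_per_row must be a multiple of 32):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "ggml.h"

int main(void) {
    const int64_t n_rows = 1;
    const int64_t n_per_row = 64;

    float * src = (float *) malloc(n_rows * n_per_row * sizeof(float));
    for (int64_t i = 0; i < n_rows * n_per_row; i++) {
        src[i] = (float) i / 8.0F;
    }

    // Destination size follows from the quantized row size, not from sizeof(float).
    size_t dst_size = (size_t) n_rows * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
    void * dst = malloc(dst_size);

    size_t written = ggml_quantize_chunk(GGML_TYPE_Q5_0, src, dst, /*start=*/0, n_rows, n_per_row, /*imatrix=*/NULL);
    printf("quantized %lld floats into %zu bytes\n", (long long) (n_rows * n_per_row), written);

    free(src);
    free(dst);
    return 0;
}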
static size_t rwkv_ggml_overhead() { - return ggml_tensor_overhead() * GGML_MAX_NODES + ggml_graph_overhead(); + return ggml_tensor_overhead() * RWKV_MAX_NODES + ggml_graph_overhead(); } static struct ggml_context * rwkv_init_ggml_context(const size_t memory_size, const bool no_alloc) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 506591ff..851819f7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,11 +4,7 @@ function(rwkv_add_test source) if (GGML_CUDA_SOURCES) set_property(TARGET ${TEST_TARGET} PROPERTY CUDA_ARCHITECTURES OFF) endif() - if(RWKV_HIPBLAS) - target_link_libraries(${TEST_TARGET} PRIVATE ggml-rocm ggml rwkv) - else() - target_link_libraries(${TEST_TARGET} PRIVATE ggml rwkv) - endif() + target_link_libraries(${TEST_TARGET} PRIVATE ggml rwkv) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) if (RWKV_STATIC) if(RWKV_HIPBLAS) diff --git a/tests/expected-logits-4v0-660K.bin b/tests/expected-logits-4v0-660K.bin index 177a2085..58b44525 100644 Binary files a/tests/expected-logits-4v0-660K.bin and b/tests/expected-logits-4v0-660K.bin differ diff --git a/tests/expected-logits-5v1-730K.bin b/tests/expected-logits-5v1-730K.bin index 6637dee0..1358d80b 100644 Binary files a/tests/expected-logits-5v1-730K.bin and b/tests/expected-logits-5v1-730K.bin differ diff --git a/tests/expected-logits-5v2-730K.bin b/tests/expected-logits-5v2-730K.bin index 94ca07e6..8a1994ac 100644 Binary files a/tests/expected-logits-5v2-730K.bin and b/tests/expected-logits-5v2-730K.bin differ diff --git a/tests/expected-logits-6v0-3m.bin b/tests/expected-logits-6v0-3m.bin index 9504bb93..8a070d44 100644 Binary files a/tests/expected-logits-6v0-3m.bin and b/tests/expected-logits-6v0-3m.bin differ diff --git a/tests/logit_difference_validator.inc b/tests/logit_difference_validator.inc index 9269d36d..644d5013 100644 --- a/tests/logit_difference_validator.inc +++ b/tests/logit_difference_validator.inc @@ -13,6 +13,8 @@ // Also test multithreading. 
#define N_THREADS 2 +#define N_GPU_LAYERS 0 + void load_expected_logits(float * expected_logits, const char * version) { char file_name[128]; sprintf(file_name, "expected-logits-%s.bin", version); @@ -29,14 +31,10 @@ void test_model(const char * version, const char * format, const float * expecte fprintf(stderr, "Testing %s\n", file_name); - struct rwkv_context * model = rwkv_init_from_file(file_name, N_THREADS); + struct rwkv_context * model = rwkv_init_from_file(file_name, N_THREADS, N_GPU_LAYERS); enum rwkv_error_flags error = rwkv_get_last_error(NULL); ASSERT(error == 0, "Unexpected error %d", error); -#if defined(GGML_USE_CUBLAS) - ASSERT(rwkv_gpu_offload_layers(model, rwkv_get_n_layer(model) + 1), "Failed to offload layers to GPU"); -#endif - const size_t n_vocab = rwkv_get_logits_len(model); ASSERT(n_vocab == N_VOCAB, "Unexpected n_vocab in the model"); diff --git a/tests/test_context_cloning.c b/tests/test_context_cloning.c index 40d0495e..d175af9d 100644 --- a/tests/test_context_cloning.c +++ b/tests/test_context_cloning.c @@ -8,7 +8,7 @@ #include "assertions.inc" int main(void) { - struct rwkv_context * ctx = rwkv_init_from_file("tiny-rwkv-5v2-730K-FP32.bin", 2); + struct rwkv_context * ctx = rwkv_init_from_file("tiny-rwkv-5v2-730K-FP32.bin", 2, 0); ASSERT(ctx != NULL, "Unexpected error 0x%.8X", rwkv_get_last_error(NULL)); diff --git a/tests/test_eval_sequence_in_chunks.c b/tests/test_eval_sequence_in_chunks.c index 804c2446..0a35da1f 100644 --- a/tests/test_eval_sequence_in_chunks.c +++ b/tests/test_eval_sequence_in_chunks.c @@ -10,7 +10,7 @@ void test_on_prompt(const char * prompt, const size_t prompt_length) { fprintf(stderr, "Calculating expected state and logits for prompt of size %zd\n", prompt_length); - struct rwkv_context * ctx = rwkv_init_from_file("tiny-rwkv-5v2-730K-FP32.bin", 2); + struct rwkv_context * ctx = rwkv_init_from_file("tiny-rwkv-5v2-730K-FP32.bin", 2, 0); ASSERT(ctx != NULL, "Unexpected error 0x%.8X", rwkv_get_last_error(NULL)); diff --git a/tests/test_ggml_basics.c b/tests/test_ggml_basics.c index e767387a..ec0a50bf 100644 --- a/tests/test_ggml_basics.c +++ b/tests/test_ggml_basics.c @@ -34,10 +34,23 @@ void test_simple_computation(void) { // Allocation on heap instead of stack avoids SegFault when GGML_MAX_NODES is set to a large value. struct ggml_cgraph * graph = (struct ggml_cgraph *) calloc(1, sizeof(struct ggml_cgraph)); + graph->size = GGML_DEFAULT_GRAPH_SIZE; + graph->n_nodes = 0; + graph->n_leafs = 0; + graph->nodes = (struct ggml_tensor **) calloc(1, GGML_DEFAULT_GRAPH_SIZE * sizeof(struct ggml_tensor *)); + graph->leafs = (struct ggml_tensor **) calloc(1, GGML_DEFAULT_GRAPH_SIZE * sizeof(struct ggml_tensor *)); + size_t hash_size = GGML_DEFAULT_GRAPH_SIZE * 2 + 1; + graph->visited_hash_table.size = hash_size; + graph->visited_hash_table.keys = (struct ggml_tensor **) calloc(1, hash_size * sizeof(struct ggml_tensor *)); + graph->order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT; + ggml_build_forward_expand(graph, sum); - struct ggml_cplan * plan = ggml_graph_plan(graph, 2); - ggml_graph_compute(graph, plan); - free(plan); + struct ggml_cplan plan = ggml_graph_plan(graph, 2); + ggml_graph_compute(graph, &plan); + + free(graph->nodes); + free(graph->leafs); + free(graph->visited_hash_table.keys); free(graph); ASSERT_ELEMENT_F32(sum, 0, -9.0F); @@ -74,10 +87,22 @@ void test_computation_on_tensors_from_different_contexts(void) { // Allocation on heap instead of stack avoids SegFault when GGML_MAX_NODES is set to a large value. 
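All tests now pass the GPU layer count directly to rwkv_init_from_file instead of calling rwkv_gpu_offload_layers afterwards. A minimal usage sketch mirroring the updated tests (model file name taken from the tests; include path may differ per build setup):

#include <stdio.h>
#include "rwkv.h"

int main(void) {
    // Third argument: number of layers to offload to the GPU; 0 keeps everything on the CPU.
    struct rwkv_context * ctx = rwkv_init_from_file("tiny-rwkv-5v2-730K-FP32.bin", /*n_threads=*/2, /*n_gpu_layers=*/0);

    if (ctx == NULL) {
        fprintf(stderr, "Unexpected error 0x%.8X\n", rwkv_get_last_error(NULL));
        return 1;
    }

    fprintf(stderr, "n_vocab = %zu\n", rwkv_get_logits_len(ctx));

    rwkv_free(ctx);
    return 0;
}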
struct ggml_cgraph * graph = (struct ggml_cgraph *) calloc(1, sizeof(struct ggml_cgraph)); + graph->size = GGML_DEFAULT_GRAPH_SIZE; + graph->n_nodes = 0; + graph->n_leafs = 0; + graph->nodes = (struct ggml_tensor **) calloc(1, GGML_DEFAULT_GRAPH_SIZE * sizeof(struct ggml_tensor *)); + graph->leafs = (struct ggml_tensor **) calloc(1, GGML_DEFAULT_GRAPH_SIZE * sizeof(struct ggml_tensor *)); + size_t hash_size = GGML_DEFAULT_GRAPH_SIZE * 2 + 1; + graph->visited_hash_table.size = hash_size; + graph->visited_hash_table.keys = (struct ggml_tensor **) calloc(1, hash_size * sizeof(struct ggml_tensor *)); + graph->order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT; ggml_build_forward_expand(graph, sum); - struct ggml_cplan * plan = ggml_graph_plan(graph, 2); - ggml_graph_compute(graph, plan); - free(plan); + struct ggml_cplan plan = ggml_graph_plan(graph, 2); + ggml_graph_compute(graph, &plan); + + free(graph->nodes); + free(graph->leafs); + free(graph->visited_hash_table.keys); free(graph); ASSERT_ELEMENT_F32(sum, 0, -9.0F); diff --git a/tests/test_logit_calculation_skipping.c b/tests/test_logit_calculation_skipping.c index c9e97749..6bde8928 100644 --- a/tests/test_logit_calculation_skipping.c +++ b/tests/test_logit_calculation_skipping.c @@ -14,7 +14,7 @@ const char prompt[TOKEN_COUNT + 1] = "hello world"; void test_serial_mode(void) { fprintf(stderr, "Testing serial mode\n"); - struct rwkv_context * ctx = rwkv_init_from_file("tiny-rwkv-5v2-730K-FP32.bin", 2); + struct rwkv_context * ctx = rwkv_init_from_file("tiny-rwkv-5v2-730K-FP32.bin", 2, 0); ASSERT(ctx != NULL, "Unexpected error 0x%.8X", rwkv_get_last_error(NULL)); @@ -54,7 +54,7 @@ void test_serial_mode(void) { void test_sequential_mode(void) { fprintf(stderr, "Testing sequential mode\n"); - struct rwkv_context * ctx = rwkv_init_from_file("tiny-rwkv-5v2-730K-FP32.bin", 2); + struct rwkv_context * ctx = rwkv_init_from_file("tiny-rwkv-5v2-730K-FP32.bin", 2, 0); ASSERT(ctx != NULL, "Unexpected error 0x%.8X", rwkv_get_last_error(NULL)); diff --git a/tests/test_quantization_format_compatibility.c b/tests/test_quantization_format_compatibility.c index 6ad41136..ba90c6a9 100644 --- a/tests/test_quantization_format_compatibility.c +++ b/tests/test_quantization_format_compatibility.c @@ -30,7 +30,7 @@ int main(void) { +025.273308F, +048.068733F, // 6v0 - -019.400530F, + -021.151785F, +003.576909F }; diff --git a/tests/test_quantized_matmul_on_gpu.c b/tests/test_quantized_matmul_on_gpu.c index 854980c4..9666f57c 100644 --- a/tests/test_quantized_matmul_on_gpu.c +++ b/tests/test_quantized_matmul_on_gpu.c @@ -2,84 +2,118 @@ #include #include -#if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) #include +#include #include -#include "ggml/src/ggml-cuda.h" +#include +#include -#include "assertions.inc" +#if defined(GGML_USE_CUDA) +#include "ggml/include/ggml-cuda.h" +#elif defined(GGML_USE_METAL) +#include "ggml/include/ggml-metal.h" +#endif -#define SET_ELEMENT_F32(tensor, i, value) ((float *) tensor->data)[i] = value +#include "assertions.inc" -#define ELEMENT_COUNT 32 +// ELEMENT_COUNT >= 64 makes metal kernel happy +#define ELEMENT_COUNT 64 int main(void) { struct ggml_init_params params = { - .mem_size = 16 * 1024, + .mem_size = 96 * 1024, .mem_buffer = NULL, - .no_alloc = false, + .no_alloc = true, }; +#ifdef GGML_USE_CUDA + ggml_backend_t backend = ggml_backend_cuda_init(0); +#elif defined(GGML_USE_METAL) + ggml_backend_t backend = ggml_backend_metal_init(); +#endif + + ggml_backend_t backend_cpu = 
ggml_backend_cpu_init(); + + ggml_backend_t backends[2] = { backend, backend_cpu }; + + ASSERT(backend && backend_cpu, "ggml_backend init failed\n"); + struct ggml_context * ctx = ggml_init(params); // --- struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ELEMENT_COUNT, 1); + struct ggml_tensor * x_quantized = ggml_new_tensor_2d(ctx, GGML_TYPE_Q5_0, ELEMENT_COUNT, 1); - for (int i = 0; i < ELEMENT_COUNT; i++) { - SET_ELEMENT_F32(x, i, 1.0F * i); - } + struct ggml_tensor * y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ELEMENT_COUNT); - // --- + struct ggml_tensor * mul0 = ggml_mul_mat(ctx, x, y); + struct ggml_tensor * mul1 = ggml_mul_mat(ctx, x_quantized, y); - struct ggml_tensor * x_quantized = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, ELEMENT_COUNT, 1); + ggml_backend_buffer_t buffer_gpu = ggml_backend_alloc_buffer(backend, ggml_nbytes(x_quantized) + 1024); + ggml_backend_buffer_t buffer_cpu = ggml_backend_alloc_buffer(backend_cpu, ggml_nbytes(x) + ggml_nbytes(y) + 1024); - int64_t hist[16]; - ggml_quantize_chunk(x_quantized->type, (const float *) x->data, x_quantized->data, 0, ELEMENT_COUNT, hist); + ggml_backend_buffer_set_usage(buffer_gpu, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + ggml_backend_buffer_set_usage(buffer_cpu, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - x_quantized->backend = GGML_BACKEND_GPU; - ggml_cuda_transform_tensor(x_quantized->data, x_quantized); + struct ggml_tallocr tallocr_gpu = ggml_tallocr_new(buffer_gpu); + struct ggml_tallocr tallocr_cpu = ggml_tallocr_new(buffer_cpu); - // --- + ggml_tallocr_alloc(&tallocr_gpu, x_quantized); + ggml_tallocr_alloc(&tallocr_gpu, mul1); + ggml_tallocr_alloc(&tallocr_cpu, x); + ggml_tallocr_alloc(&tallocr_cpu, y); + ggml_tallocr_alloc(&tallocr_cpu, mul0); - struct ggml_tensor * y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ELEMENT_COUNT); + struct ggml_cgraph * graph = ggml_new_graph(ctx); + ggml_build_forward_expand(graph, mul0); + ggml_build_forward_expand(graph, mul1); + // --- + + float * data = (float *) malloc(ELEMENT_COUNT * ggml_type_size(GGML_TYPE_F32)); for (int i = 0; i < ELEMENT_COUNT; i++) { - SET_ELEMENT_F32(y, i, 1.0F * i); + if (i % 2 == 0) + data[i] = 1.0F * i / 2; + else + data[i] = 0; } + uint8_t * data_quantized = (uint8_t *) malloc(ELEMENT_COUNT * ggml_type_size(GGML_TYPE_Q5_0)); + ggml_quantize_chunk(x_quantized->type, (const float *) data, data_quantized, 0, 1, ELEMENT_COUNT, NULL); - // --- - - struct ggml_tensor * mul0 = ggml_mul_mat(ctx, x, y); - struct ggml_tensor * mul1 = ggml_mul_mat(ctx, x_quantized, y); + memcpy(x->data, data, ggml_nbytes(x)); + memcpy(y->data, data, ggml_nbytes(y)); - // Allocation on heap instead of stack avoids SegFault when GGML_MAX_NODES is set to a large value. 
- struct ggml_cgraph * graph = (struct ggml_cgraph *) calloc(1, sizeof(struct ggml_cgraph)); - ggml_build_forward_expand(graph, mul0); - ggml_build_forward_expand(graph, mul1); + // --- - struct ggml_cplan * plan = ggml_graph_plan(graph, 2); +#if defined(GGML_USE_METAL) + memcpy(x_quantized->data, data_quantized, ggml_nbytes(x_quantized)); +#else + ggml_backend_tensor_set(x_quantized, data_quantized, 0, ggml_nbytes(x_quantized)); +#endif - uint8_t * work_data = (uint8_t *) malloc(plan->work_size); - plan->work_data = work_data; + ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, 4096, false); - ggml_graph_compute(graph, plan); + ggml_backend_sched_reset(sched); + ggml_backend_sched_graph_compute(sched, graph); - free(plan); - free(graph); - free(work_data); + float result0; + float result1; - float result0 = ((float *) mul0->data)[0]; - float result1 = ((float *) mul1->data)[0]; + ggml_backend_tensor_get(mul0, &result0, 0, ggml_nbytes(mul0)); + ggml_backend_tensor_get(mul1, &result1, 0, ggml_nbytes(mul1)); fprintf(stderr, "FP32 CPU result = %f\n", result0); - fprintf(stderr, "Q4_0 GPU result = %f\n", result1); + fprintf(stderr, "Q5_0 GPU result = %f\n", result1); ASSERT(fabsf(result0 - result1) <= 100.0F, "Results differ too much"); ggml_free(ctx); + free(data); + free(data_quantized); return 0; } @@ -87,7 +121,7 @@ int main(void) { #else int main(void) { - fprintf(stderr, "Skipping test_quantized_matmul_on_gpu.c: GGML_USE_CUBLAS is not defined\n"); + fprintf(stderr, "Skipping test_quantized_matmul_on_gpu.c: GGML_USE_CUDA is not defined\n"); return 0; } diff --git a/tests/test_tiny_rwkv.c b/tests/test_tiny_rwkv.c index 5b0b7769..9072ae3c 100644 --- a/tests/test_tiny_rwkv.c +++ b/tests/test_tiny_rwkv.c @@ -45,7 +45,7 @@ int main(void) { +0.001000F, // FP32 +0.455912F, // FP16 // 6v0 - +0.001566F, // FP32 + +0.001000F, // FP32 -0.416620F // FP16 }; @@ -78,14 +78,14 @@ int main(void) { +000.585238F, // Q8_0 // 5v2 +035.271305F, // Q4_0 - +061.719509F, // Q4_1 + +067.015076F, // Q4_1 +025.273308F, // Q5_0 +048.068733F, // Q5_1 -009.441034F, // Q8_0 // 6v0 - -003.824263F, // Q4_0 + -007.588121F, // Q4_0 +021.939022F, // Q4_1 - -021.191444F, // Q5_0 + -027.332073F, // Q5_0 +003.576909F, // Q5_1 -009.539596F // Q8_0 }; @@ -105,14 +105,14 @@ int main(void) { -000.962695F, // Q8_0 // 5v2 +034.135971F, // Q4_0 - +059.066830F, // Q4_1 + +065.573822F, // Q4_1 +021.588751F, // Q5_0 +029.726818F, // Q5_1 -007.242277F, // Q8_0 // 6v0 - -003.487368F, // Q4_0 + -007.660988F, // Q4_0 +021.797060F, // Q4_1 - -021.271053F, // Q5_0 + -027.269241F, // Q5_0 +003.405264F, // Q5_1 -009.734720F // Q8_0 };
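For context, the updated loader and the GPU matmul test share the same backend-buffer pattern: allocate a buffer on a backend, place tensors in it with ggml_tallocr, and move data with ggml_backend_tensor_set/get. A CPU-only standalone sketch of that pattern (not part of the PR; buffer padding and sizes are illustrative):

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data goes into the backend buffer below
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, ggml_nbytes(t) + 1024);
    ggml_backend_buffer_set_usage(buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
    ggml_tallocr_alloc(&tallocr, t);

    const float in[4] = { 1.0F, 2.0F, 3.0F, 4.0F };
    float out[4] = { 0 };
    ggml_backend_tensor_set(t, in, 0, sizeof(in));   // host -> backend buffer
    ggml_backend_tensor_get(t, out, 0, sizeof(out)); // backend buffer -> host

    printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);

    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}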