
Commit 4ec0afb

… into coverage-ggerganov#295
goerch committed Jul 23, 2023
2 parents 3582eb3 + f634bbb commit 4ec0afb
Showing 14 changed files with 1,142 additions and 1,092 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -32,6 +32,7 @@ option(GGML_NO_ACCELERATE "ggml: disable Accelerate framework" OFF)
option(GGML_OPENBLAS "ggml: use OpenBLAS" OFF)
option(GGML_CLBLAST "ggml: use clBLAST" OFF)
option(GGML_CUBLAS "ggml: use cuBLAS" OFF)
option(GGML_METAL "ggml: use Metal" OFF)

# sanitizers

2 changes: 2 additions & 0 deletions README.md
@@ -16,6 +16,7 @@ Some of the development is currently happening in the [llama.cpp](https://github
- ADAM and L-BFGS optimizers
- Optimized for Apple Silicon
- On x86 architectures utilizes AVX / AVX2 intrinsics
- On ppc64 architectures utilizes VSX intrinsics
- No third-party dependencies
- Zero memory allocations during runtime

@@ -42,6 +43,7 @@ Some of the development is currently happening in the [llama.cpp](https://github
- [X] Example of BioGPT inference [PABannier/biogpt.cpp](https://github.com/PABannier/biogpt.cpp)
- [X] Example of Encodec inference [PABannier/encodec.cpp](https://github.com/PABannier/encodec.cpp)
- [X] Example of CLIP inference [monatis/clip.cpp](https://github.com/monatis/clip.cpp)
- [X] Example of MiniGPT4 inference [Maknee/minigpt4.cpp](https://github.com/Maknee/minigpt4.cpp)

## Whisper inference (example)

59 changes: 52 additions & 7 deletions ci/run.sh
@@ -1,4 +1,15 @@
#!/bin/bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@@ -134,8 +145,8 @@ function gg_run_gpt_2 {
model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin"
prompts="../examples/prompts/gpt-2.txt"

(time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -t 4 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -t 4 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

set +e
}
@@ -174,8 +185,8 @@ function gg_run_mpt {
python3 ../examples/mpt/convert-h5-to-ggml.py ${path_models} 1
./bin/mpt-quantize ${model_f16} ${model_q4_0} q4_0

(time ./bin/mpt --model ${model_f16} -s 1234 -n 64 -t 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -t 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/mpt --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

set +e
}
@@ -190,23 +201,57 @@ function gg_sum_mpt {
gg_printf '```\n'
}

# mnist

function gg_run_mnist {
cd ${SRC}

cd build-ci-release

set -e

mkdir -p models/mnist
python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict

model_f32="./models/mnist/ggml-model-f32.bin"
samples="../examples/mnist/models/mnist/t10k-images.idx3-ubyte"

# the first command runs the model and exports "mnist.ggml"; the second command runs the exported model

(time ./bin/mnist ${model_f32} ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log
(time ./bin/mnist-cpu ./mnist.ggml ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log

set +e
}

function gg_sum_mnist {
gg_printf '### %s\n\n' "${ci}"

gg_printf 'MNIST\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-mnist.log)"
gg_printf '```\n'
}

## main

if [ -z $GG_BUILD_LOW_PERF ]; then
rm -rf ${SRC}/models-mnt

mnt_models=$(realpath ${MNT}/models)
mnt_models=${MNT}/models
mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt

python3 -m pip install -r ${SRC}/requirements.txt
fi

python3 -m pip install -r ${SRC}/requirements.txt

ret=0

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
test $ret -eq 0 && gg_run gpt_2
test $ret -eq 0 && gg_run mnist

if [ -z $GG_BUILD_LOW_PERF ]; then
test $ret -eq 0 && gg_run mpt
8 changes: 4 additions & 4 deletions examples/gpt-neox/README.md
@@ -21,10 +21,10 @@ git clone https://huggingface.co/stabilityai/gpt_neox-base-alpha-3b
python3 -m pip install -r ../requirements.txt

# convert model to FP16
python3 ../examples/gpt_neox/convert-h5-to-ggml.py ./stablelm-base-alpha-3b/ 1
python3 ../examples/gpt-neox/convert-h5-to-ggml.py ./stablelm-base-alpha-3b/ 1

# run inference using FP16 precision
make -j && ./bin/gpt_neox -m ./stablelm-base-alpha-3b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64
make -j && ./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64

main: seed = 1681940611
gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-f16.bin' - please wait ...
@@ -63,10 +63,10 @@ main: total time = 6911.26 ms
```bash
# quantize the model to 5-bits using Q5_0 quantization
./bin/gpt_neox-quantize ./stablelm-base-alpha-3b/ggml-model-f16.bin ./stablelm-base-alpha-3b/ggml-model-q5_0.bin q5_0
./bin/gpt-neox-quantize ./stablelm-base-alpha-3b/ggml-model-f16.bin ./stablelm-base-alpha-3b/ggml-model-q5_0.bin q5_0
# run the quantized model
./bin/gpt_neox -m ./stablelm-base-alpha-3b/ggml-model-q5_0.bin -p "I believe the meaning of life is" -t 8 -n 64
./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-q5_0.bin -p "I believe the meaning of life is" -t 8 -n 64
main: seed = 1682021489
gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-q5_0.bin' - please wait ...
7 changes: 6 additions & 1 deletion examples/mnist/README.md
@@ -41,8 +41,13 @@ mkdir build && cd build
cmake ..
make -j4 mnist

# Generate ggml model
mkdir -p models/mnist
python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict

# Run the MNIST model
./bin/mnist ../examples/mnist/models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte

./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
```

For more information, check out the corresponding programs in the [examples](examples) folder.
3 changes: 3 additions & 0 deletions examples/mnist/main-cpu.cpp
@@ -42,6 +42,9 @@ int mnist_eval(

struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);

// param export/import test
GGML_ASSERT(ggml_graph_get_tensor(&gfi, "fc1_bias")->op_params[0] == 0xdeadbeef);

// allocate work context
// needed during ggml_graph_compute() to allocate a work tensor
static size_t buf_size = 128ull*1024*1024; // TODO
3 changes: 3 additions & 0 deletions examples/mnist/main.cpp
@@ -119,6 +119,9 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) {
model.fc1_bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_hidden);
fin.read(reinterpret_cast<char *>(model.fc1_bias->data), ggml_nbytes(model.fc1_bias));
ggml_set_name(model.fc1_bias, "fc1_bias");

// just for testing purposes, set some parameters to non-zero
model.fc1_bias->op_params[0] = 0xdeadbeef;
}
}
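
The two changes above form a pair: main.cpp stamps a sentinel into `op_params` before the graph is exported, and main-cpu.cpp asserts it after import. A condensed, self-contained sketch of that round trip (not the actual mnist code; the graph export/import steps are left as comments):

```c
#include "ggml/ggml.h"

// Writer side, as in main.cpp: tag a named tensor's op_params with a sentinel.
static void op_params_roundtrip_sketch(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    ggml_set_name(t, "fc1_bias");
    t->op_params[0] = 0xdeadbeef; // sentinel written before export

    // A graph containing t would now be exported with ggml_graph_export()
    // and re-imported with ggml_graph_import(); the reader side (main-cpu.cpp)
    // then verifies the sentinel survived the round trip:
    //
    //   GGML_ASSERT(ggml_graph_get_tensor(&gfi, "fc1_bias")->op_params[0] == 0xdeadbeef);

    ggml_free(ctx);
}
```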

11 changes: 8 additions & 3 deletions include/ggml/ggml.h
@@ -199,6 +199,7 @@
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6
#define GGML_MAX_NAME 48
#define GGML_MAX_OP_PARAMS 32
#define GGML_DEFAULT_N_THREADS 4


@@ -418,6 +419,9 @@ extern "C" {
// compute data
enum ggml_op op;

// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];

bool is_param;

struct ggml_tensor * grad;
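
GGML_MAX_OP_PARAMS is a byte budget, so the field above holds 32 / 4 = 8 four-byte slots (the divisor uses `sizeof(uint32_t)`, which equals `sizeof(int32_t)`). A compile-time sanity check, restating the constant for a standalone sketch:

```c
#include <stdint.h>

// Restated from ggml.h for this standalone sketch.
#define GGML_MAX_OP_PARAMS 32

// 32 bytes / 4 bytes per element = 8 int32_t slots in op_params.
_Static_assert(GGML_MAX_OP_PARAMS / sizeof(int32_t) == 8,
               "op_params holds 8 int32_t values");
```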
@@ -1128,9 +1132,9 @@ extern "C" {
int n_past,
int n_dims,
int mode,
int n_ctx,
float freq_base,
float freq_scale,
int n_ctx);
float freq_scale);

// rotary position embedding backward, i.e. compute dx from dy
// a - dy
@@ -1139,7 +1143,8 @@ extern "C" {
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode);
int mode,
int n_ctx);
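
For callers, the reorder means `n_ctx` now comes directly after `mode`, ahead of the frequency parameters, instead of trailing them. A sketch of an updated call site; the function name is cropped out of the hunk above, so `ggml_rope_custom_inplace` is an assumption here:

```c
#include "ggml/ggml.h"

// Hypothetical call site showing the new argument order:
// (..., mode, n_ctx, freq_base, freq_scale) instead of
// (..., mode, freq_base, freq_scale, n_ctx).
static struct ggml_tensor * rope_sketch(struct ggml_context * ctx,
                                        struct ggml_tensor  * cur) {
    return ggml_rope_custom_inplace(ctx, cur,
        /*n_past     =*/ 0,
        /*n_dims     =*/ 64,
        /*mode       =*/ 0,
        /*n_ctx      =*/ 2048,    // moved: was the final argument
        /*freq_base  =*/ 10000.0f,
        /*freq_scale =*/ 1.0f);
}
```

Callers passing arguments positionally need to move `n_ctx` forward when updating, since the surrounding `int`/`float` parameters will otherwise be converted implicitly.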

// alibi position embedding
// in-place, returns view(a)
31 changes: 30 additions & 1 deletion src/CMakeLists.txt
@@ -53,6 +53,9 @@ endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1")
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PPC64 detected")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpower9-vector")
else()
message(STATUS "x86 detected")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
@@ -203,6 +206,30 @@ if (GGML_CUBLAS)
endif()
endif()

if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h)

add_compile_definitions(GGML_USE_METAL)
add_compile_definitions(GGML_METAL_NDEBUG)

# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
${METALPERFORMANCE_FRAMEWORK}
)
endif()

if (GGML_PERF)
set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)
@@ -212,7 +239,9 @@ add_library(${TARGET}
ggml.c
../include/ggml/ggml.h
${GGML_CUDA_SOURCES}
${GGML_OPENCL_SOURCES})
${GGML_OPENCL_SOURCES}
${GGML_METAL_SOURCES}
)

target_include_directories(${TARGET} PUBLIC
.
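
The new block also injects `GGML_USE_METAL` as a compile definition, so downstream code can gate Metal-only paths on it. A minimal sketch of how that definition is typically consumed; no Metal API is called, so the file builds whether or not GGML_METAL is ON:

```c
// ggml-metal.h is the header listed in GGML_METAL_SOURCES above; it is only
// pulled in when the GGML_METAL option defines GGML_USE_METAL.
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

// Hypothetical helper, for illustration only.
const char * backend_name_sketch(void) {
#ifdef GGML_USE_METAL
    return "Metal"; // a Metal context would be set up via ggml-metal.h here
#else
    return "CPU";
#endif
}
```

Note the `configure_file` call copies ggml-metal.metal next to the built binaries, so the shader source can be found at runtime without an install step.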
