
Commit 4ec0afb

… into coverage-ggerganov#295
goerch committed Jul 23, 2023
2 parents 3582eb3 + f634bbb commit 4ec0afb
Showing 14 changed files with 1,142 additions and 1,092 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -32,6 +32,7 @@ option(GGML_NO_ACCELERATE "ggml: disable Accelerate framework" OFF)
option(GGML_OPENBLAS "ggml: use OpenBLAS" OFF)
option(GGML_CLBLAST "ggml: use clBLAST" OFF)
option(GGML_CUBLAS "ggml: use cuBLAS" OFF)
option(GGML_METAL "ggml: use Metal" OFF)

# sanitizers

2 changes: 2 additions & 0 deletions README.md
@@ -16,6 +16,7 @@ Some of the development is currently happening in the [llama.cpp](https://github
- ADAM and L-BFGS optimizers
- Optimized for Apple Silicon
- On x86 architectures utilizes AVX / AVX2 intrinsics
- On ppc64 architectures utilizes VSX intrinsics
- No third-party dependencies
- Zero memory allocations during runtime

@@ -42,6 +43,7 @@ Some of the development is currently happening in the [llama.cpp](https://github
- [X] Example of BioGPT inference [PABannier/biogpt.cpp](https://github.com/PABannier/biogpt.cpp)
- [X] Example of Encodec inference [PABannier/encodec.cpp](https://github.com/PABannier/encodec.cpp)
- [X] Example of CLIP inference [monatis/clip.cpp](https://github.com/monatis/clip.cpp)
- [X] Example of MiniGPT4 inference [Maknee/minigpt4.cpp](https://github.com/Maknee/minigpt4.cpp)

## Whisper inference (example)

59 changes: 52 additions & 7 deletions ci/run.sh
@@ -1,4 +1,15 @@
#!/bin/bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@@ -134,8 +145,8 @@ function gg_run_gpt_2 {
model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin"
prompts="../examples/prompts/gpt-2.txt"

(time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -t 4 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -t 4 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

set +e
}
@@ -174,8 +185,8 @@ function gg_run_mpt {
python3 ../examples/mpt/convert-h5-to-ggml.py ${path_models} 1
./bin/mpt-quantize ${model_f16} ${model_q4_0} q4_0

(time ./bin/mpt --model ${model_f16} -s 1234 -n 64 -t 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -t 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/mpt --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

set +e
}
@@ -190,23 +201,57 @@ function gg_sum_mpt {
gg_printf '```\n'
}

# mnist

function gg_run_mnist {
cd ${SRC}

cd build-ci-release

set -e

mkdir -p models/mnist
python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict

model_f32="./models/mnist/ggml-model-f32.bin"
samples="../examples/mnist/models/mnist/t10k-images.idx3-ubyte"

# the first command runs the model and exports "mnist.ggml"; the second command runs the exported model

(time ./bin/mnist ${model_f32} ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log
(time ./bin/mnist-cpu ./mnist.ggml ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log

set +e
}

function gg_sum_mnist {
gg_printf '### %s\n\n' "${ci}"

gg_printf 'MNIST\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-mnist.log)"
gg_printf '```\n'
}

## main

if [ -z $GG_BUILD_LOW_PERF ]; then
rm -rf ${SRC}/models-mnt

mnt_models=$(realpath ${MNT}/models)
mnt_models=${MNT}/models
mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt

python3 -m pip install -r ${SRC}/requirements.txt
fi

python3 -m pip install -r ${SRC}/requirements.txt

ret=0

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
test $ret -eq 0 && gg_run gpt_2
test $ret -eq 0 && gg_run mnist

if [ -z $GG_BUILD_LOW_PERF ]; then
test $ret -eq 0 && gg_run mpt
8 changes: 4 additions & 4 deletions examples/gpt-neox/README.md
@@ -21,10 +21,10 @@ git clone https://huggingface.co/stabilityai/gpt_neox-base-alpha-3b
python3 -m pip install -r ../requirements.txt

# convert model to FP16
python3 ../examples/gpt_neox/convert-h5-to-ggml.py ./stablelm-base-alpha-3b/ 1
python3 ../examples/gpt-neox/convert-h5-to-ggml.py ./stablelm-base-alpha-3b/ 1

# run inference using FP16 precision
make -j && ./bin/gpt_neox -m ./stablelm-base-alpha-3b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64
make -j && ./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64

main: seed = 1681940611
gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-f16.bin' - please wait ...
@@ -63,10 +63,10 @@ main: total time = 6911.26 ms
```bash
# quantize the model to 5-bits using Q5_0 quantization
./bin/gpt_neox-quantize ./stablelm-base-alpha-3b/ggml-model-f16.bin ./stablelm-base-alpha-3b/ggml-model-q5_0.bin q5_0
./bin/gpt-neox-quantize ./stablelm-base-alpha-3b/ggml-model-f16.bin ./stablelm-base-alpha-3b/ggml-model-q5_0.bin q5_0
# run the quantized model
./bin/gpt_neox -m ./stablelm-base-alpha-3b/ggml-model-q5_0.bin -p "I believe the meaning of life is" -t 8 -n 64
./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-q5_0.bin -p "I believe the meaning of life is" -t 8 -n 64
main: seed = 1682021489
gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-q5_0.bin' - please wait ...
7 changes: 6 additions & 1 deletion examples/mnist/README.md
@@ -41,8 +41,13 @@ mkdir build && cd build
cmake ..
make -j4 mnist

# Generate ggml model
mkdir -p models/mnist
python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict

# Run the MNIST model
./bin/mnist ../examples/mnist/models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte

./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
```

For more information, check out the corresponding programs in the [examples](examples) folder.
3 changes: 3 additions & 0 deletions examples/mnist/main-cpu.cpp
@@ -42,6 +42,9 @@ int mnist_eval(

struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);

// param export/import test
GGML_ASSERT(ggml_graph_get_tensor(&gfi, "fc1_bias")->op_params[0] == 0xdeadbeef);

// allocate work context
// needed during ggml_graph_compute() to allocate a work tensor
static size_t buf_size = 128ull*1024*1024; // TODO
3 changes: 3 additions & 0 deletions examples/mnist/main.cpp
@@ -119,6 +119,9 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) {
model.fc1_bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_hidden);
fin.read(reinterpret_cast<char *>(model.fc1_bias->data), ggml_nbytes(model.fc1_bias));
ggml_set_name(model.fc1_bias, "fc1_bias");

// just for testing purposes, set some parameters to non-zero
model.fc1_bias->op_params[0] = 0xdeadbeef;
}
}
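
The two changes above form a pair: main.cpp stamps a sentinel into `op_params` before the graph is exported, and main-cpu.cpp asserts it after import. A condensed, self-contained sketch of that round trip (not the actual mnist code; the graph export/import steps are left as comments):

```c
#include "ggml/ggml.h"

// Writer side, as in main.cpp: tag a named tensor's op_params with a sentinel.
static void op_params_roundtrip_sketch(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    ggml_set_name(t, "fc1_bias");
    t->op_params[0] = 0xdeadbeef; // sentinel written before export

    // A graph containing t would now be exported with ggml_graph_export()
    // and re-imported with ggml_graph_import(); the reader side (main-cpu.cpp)
    // then verifies the sentinel survived the round trip:
    //
    //   GGML_ASSERT(ggml_graph_get_tensor(&gfi, "fc1_bias")->op_params[0] == 0xdeadbeef);

    ggml_free(ctx);
}
```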

11 changes: 8 additions & 3 deletions include/ggml/ggml.h
@@ -199,6 +199,7 @@
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6
#define GGML_MAX_NAME 48
#define GGML_MAX_OP_PARAMS 32
#define GGML_DEFAULT_N_THREADS 4


@@ -418,6 +419,9 @@ extern "C" {
// compute data
enum ggml_op op;

// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];

bool is_param;

struct ggml_tensor * grad;
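
GGML_MAX_OP_PARAMS is a byte budget, so the field above holds 32 / 4 = 8 four-byte slots (the divisor uses `sizeof(uint32_t)`, which equals `sizeof(int32_t)`). A compile-time sanity check, restating the constant for a standalone sketch:

```c
#include <stdint.h>

// Restated from ggml.h for this standalone sketch.
#define GGML_MAX_OP_PARAMS 32

// 32 bytes / 4 bytes per element = 8 int32_t slots in op_params.
_Static_assert(GGML_MAX_OP_PARAMS / sizeof(int32_t) == 8,
               "op_params holds 8 int32_t values");
```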
@@ -1128,9 +1132,9 @@ extern "C" {
int n_past,
int n_dims,
int mode,
int n_ctx,
float freq_base,
float freq_scale,
int n_ctx);
float freq_scale);

// rotary position embedding backward, i.e. compute dx from dy
// a - dy
@@ -1139,7 +1143,8 @@ extern "C" {
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode);
int mode,
int n_ctx);
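
For callers, the reorder means `n_ctx` now comes directly after `mode`, ahead of the frequency parameters, instead of trailing them. A sketch of an updated call site; the function name is cropped out of the hunk above, so `ggml_rope_custom_inplace` is an assumption here:

```c
#include "ggml/ggml.h"

// Hypothetical call site showing the new argument order:
// (..., mode, n_ctx, freq_base, freq_scale) instead of
// (..., mode, freq_base, freq_scale, n_ctx).
static struct ggml_tensor * rope_sketch(struct ggml_context * ctx,
                                        struct ggml_tensor  * cur) {
    return ggml_rope_custom_inplace(ctx, cur,
        /*n_past     =*/ 0,
        /*n_dims     =*/ 64,
        /*mode       =*/ 0,
        /*n_ctx      =*/ 2048,    // moved: was the final argument
        /*freq_base  =*/ 10000.0f,
        /*freq_scale =*/ 1.0f);
}
```

Callers passing arguments positionally need to move `n_ctx` forward when updating, since the surrounding `int`/`float` parameters will otherwise be converted implicitly.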

// alibi position embedding
// in-place, returns view(a)
31 changes: 30 additions & 1 deletion src/CMakeLists.txt
@@ -53,6 +53,9 @@ endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1")
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PPC64 detected")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpower9-vector")
else()
message(STATUS "x86 detected")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
@@ -203,6 +206,30 @@ if (GGML_CUBLAS)
endif()
endif()

if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h)

add_compile_definitions(GGML_USE_METAL)
add_compile_definitions(GGML_METAL_NDEBUG)

# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
${METALPERFORMANCE_FRAMEWORK}
)
endif()

if (GGML_PERF)
set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)
@@ -212,7 +239,9 @@ add_library(${TARGET}
ggml.c
../include/ggml/ggml.h
${GGML_CUDA_SOURCES}
${GGML_OPENCL_SOURCES})
${GGML_OPENCL_SOURCES}
${GGML_METAL_SOURCES}
)

target_include_directories(${TARGET} PUBLIC
.
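
The new block also injects `GGML_USE_METAL` as a compile definition, so downstream code can gate Metal-only paths on it. A minimal sketch of how that definition is typically consumed; no Metal API is called, so the file builds whether or not GGML_METAL is ON:

```c
// ggml-metal.h is the header listed in GGML_METAL_SOURCES above; it is only
// pulled in when the GGML_METAL option defines GGML_USE_METAL.
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

// Hypothetical helper, for illustration only.
const char * backend_name_sketch(void) {
#ifdef GGML_USE_METAL
    return "Metal"; // a Metal context would be set up via ggml-metal.h here
#else
    return "CPU";
#endif
}
```

Note the `configure_file` call copies ggml-metal.metal next to the built binaries, so the shader source can be found at runtime without an install step.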
