build : on Mac OS enable Metal by default #2901

Merged Sep 4, 2023 · 12 commits
29 changes: 15 additions & 14 deletions .gitignore
@@ -31,28 +31,29 @@ tmp/
models/*
models-mnt

/main
/quantize
/quantize-stats
/result
/perplexity
/embedding
/train-text-from-scratch
/convert-llama2c-to-ggml
/simple
/benchmark-matmult
/vdot
/server
/Pipfile
/baby-llama
/beam-search
/benchmark-matmult
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/gguf
/gguf-llama-simple
/libllama.so
/llama-bench
/baby-llama
/beam-search
/main
/metal
/perplexity
/quantize
/quantize-stats
/result
/save-load-state
/server
/simple
/speculative
/train-text-from-scratch
/vdot
build-info.h
arm_neon.h
compile_commands.json
56 changes: 32 additions & 24 deletions CMakeLists.txt
@@ -36,6 +36,12 @@ endif()
# Option list
#

if (APPLE)
set(LLAMA_METAL_DEFAULT ON)
else()
set(LLAMA_METAL_DEFAULT OFF)
endif()

# general
option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
@@ -76,7 +82,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
@@ -158,6 +164,31 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()
endif()

if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)

message(STATUS "Metal framework found")

set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

add_compile_definitions(GGML_USE_METAL)
#add_compile_definitions(GGML_METAL_NDEBUG)

# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
endif()

if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@@ -293,29 +324,6 @@ if (LLAMA_CUBLAS)
endif()
endif()

if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)

set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

add_compile_definitions(GGML_USE_METAL)
#add_compile_definitions(GGML_METAL_NDEBUG)

# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
endif()

if (LLAMA_MPI)
cmake_minimum_required(VERSION 3.10)
find_package(MPI)
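In practice, the new `LLAMA_METAL_DEFAULT` switch only seeds the default of the `LLAMA_METAL` cache option, so a plain configure on macOS now picks up Metal while an explicit `-DLLAMA_METAL=OFF` still wins. A minimal sketch of both invocations (the build directory name is an arbitrary choice, not part of this diff):

```bash
# assuming a fresh checkout; any build directory name works
mkdir build && cd build

# on macOS this now configures with LLAMA_METAL=ON by default
cmake ..

# the option remains an ordinary cache variable, so it can still be overridden
cmake -DLLAMA_METAL=OFF ..
cmake --build . --config Release
```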
76 changes: 45 additions & 31 deletions Makefile
@@ -7,6 +7,39 @@ TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-dou
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif

ifndef UNAME_P
UNAME_P := $(shell uname -p)
endif

ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
ifndef LLAMA_NO_METAL
LLAMA_METAL := 1
endif

ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
ifeq ($(SYSCTL_M),1)
# UNAME_P := arm
# UNAME_M := arm64
warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
endif
endif
endif

ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif

default: $(BUILD_TARGETS)

test:
@@ -38,18 +71,6 @@ gcovr-report: coverage ## Generate gcovr report
mkdir -p gcovr-report
gcovr --root . --html --html-details --output gcovr-report/coverage.html

ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif

ifndef UNAME_P
UNAME_P := $(shell uname -p)
endif

ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif

ifdef RISCV_CROSS_COMPILE
CC := riscv64-unknown-linux-gnu-gcc
CXX := riscv64-unknown-linux-gnu-g++
@@ -58,19 +79,6 @@ endif
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
ifeq ($(SYSCTL_M),1)
# UNAME_P := arm
# UNAME_M := arm64
warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
endif
endif
endif

#
# Compile flags
#
@@ -231,14 +239,24 @@ endif
endif

ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
# Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
MK_LDFLAGS += -framework Accelerate
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_METAL
# By default - use GPU acceleration on Mac OS
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJS += ggml-metal.o
endif
endif # LLAMA_METAL

ifdef LLAMA_MPI
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual
@@ -480,10 +498,6 @@ beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o co
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif

ifdef LLAMA_METAL
metal: examples/metal/metal.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
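The Makefile mirrors the CMake change: on Darwin, `LLAMA_METAL` is set unless `LLAMA_NO_METAL` is defined, and the `metal` example is appended to `BUILD_TARGETS`. A short usage sketch based on the variables shown in the hunks above:

```bash
# on macOS a plain build now compiles with -DGGML_USE_METAL and links the
# Foundation/Metal/MetalKit frameworks
make

# opt out of the new default with the LLAMA_NO_METAL switch
LLAMA_NO_METAL=1 make
```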
26 changes: 4 additions & 22 deletions README.md
@@ -280,29 +280,11 @@ In order to build llama.cpp you have three different options.

### Metal Build

Using Metal allows the computation to be executed on the GPU for Apple devices:
On macOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time, use the `LLAMA_NO_METAL=1` make flag or the `LLAMA_METAL=OFF` CMake option.

- Using `make`:

```bash
LLAMA_METAL=1 make
```

- Using `CMake`:

```bash
mkdir build-metal
cd build-metal
cmake -DLLAMA_METAL=ON ..
cmake --build . --config Release
```

When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
Any value larger than 0 will offload the computation to the GPU. For example:

```bash
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
```
When built with Metal support, you can explicitly disable GPU inference by passing `--gpu-layers 0` (`-ngl 0`) on the command line.
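For example, to keep inference fully on the CPU even though Metal is compiled in (model path as in the example removed above):

```bash
# 0 keeps all layers on the CPU; any value larger than 0 offloads to the GPU
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 0
```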

### MPI Build

6 changes: 4 additions & 2 deletions common/common.cpp
@@ -717,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param

lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch;
lparams.n_gpu_layers = params.n_gpu_layers;
if (params.n_gpu_layers != -1) {
lparams.n_gpu_layers = params.n_gpu_layers;
}
lparams.main_gpu = params.main_gpu;
lparams.tensor_split = params.tensor_split;
lparams.low_vram = params.low_vram;
@@ -1212,7 +1214,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
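The `-1` sentinel means `lparams.n_gpu_layers` keeps whatever `llama_context_default_params()` supplies unless the user passed `--gpu-layers` explicitly, so the library rather than the CLI decides whether Metal offload is on. A hedged sketch of the two cases (model path and layer count are illustrative):

```bash
# -ngl not given → n_gpu_layers stays -1 and the library default applies
./main -m ./models/7B/ggml-model-q4_0.gguf -p "hello"

# an explicit value always overrides the default
./main -m ./models/7B/ggml-model-q4_0.gguf -p "hello" -ngl 32
```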
2 changes: 1 addition & 1 deletion common/common.h
@@ -34,7 +34,7 @@ struct gpt_params {
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
15 changes: 7 additions & 8 deletions examples/main/main.cpp
@@ -151,14 +151,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
}

if (params.n_ctx > 2048) {
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}

LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -194,6 +186,13 @@ int main(int argc, char ** argv) {
return 1;
}

if (params.n_ctx > llama_n_ctx(ctx)) {
LOG_TEE("%s: warning: base model only supports context sizes no greater than %d tokens (%d specified)\n", __func__, llama_n_ctx(ctx), params.n_ctx);
} else if (params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}

// print system information
{
LOG_TEE("\n");
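Moving the check below context creation lets the warning report the model's actual limit via `llama_n_ctx(ctx)` instead of a hard-coded 2048; the lower bound is unchanged. A sketch of the clamp case (model path and sizes are illustrative):

```bash
# a requested context below the minimum still warns and is clamped up to 8
./main -m ./models/7B/ggml-model-q4_0.gguf -c 4 -n 16
```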
12 changes: 6 additions & 6 deletions examples/perplexity/perplexity.cpp
@@ -368,7 +368,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
const int first = std::min(512, params.n_ctx/2);
const int first = params.n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += params.n_ctx - first - 1;
@@ -668,11 +668,6 @@ int main(int argc, char ** argv) {
params.n_ctx += params.ppl_stride/2;
}

if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}

fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -698,6 +693,11 @@
return 1;
}

if (params.n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than %d tokens (%d specified);"
"expect poor results\n", __func__, llama_n_ctx(ctx), params.n_ctx);
}

// print system information
{
fprintf(stderr, "\n");
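With `first = params.n_ctx/2`, each chunk scores exactly the second half of its window whatever the context size, instead of capping the skipped prefix at 512 tokens. An illustrative run (file and model paths are assumptions):

```bash
# with -c 4096, only the second half of each 4096-token window
# contributes to the reported perplexity
./perplexity -m ./models/7B/ggml-model-q4_0.gguf -f wikitext-2-raw/wiki.test.raw -c 4096
```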