diff --git a/llama.cpp/common.cpp b/llama.cpp/common.cpp
index 7381efa255..c868359b1c 100644
--- a/llama.cpp/common.cpp
+++ b/llama.cpp/common.cpp
@@ -521,6 +521,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         params.infill = true;
     } else if (arg == "--unsecure") {
         params.unsecure = true;
+    } else if (arg == "--nocompile") {
+        FLAG_nocompile = true;
     } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
         params.dump_kv_cache = true;
     } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
@@ -929,6 +931,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
     printf("  --unsecure            disables pledge() sandboxing on Linux and OpenBSD\n");
+    printf("  --nocompile           disables runtime compilation of gpu support\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
     printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
diff --git a/llama.cpp/ggml-cuda.cu b/llama.cpp/ggml-cuda.cu
index 4143fb4731..37ab1d8657 100644
--- a/llama.cpp/ggml-cuda.cu
+++ b/llama.cpp/ggml-cuda.cu
@@ -37,6 +37,7 @@
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -46,6 +47,7 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
diff --git a/llama.cpp/server/server.cpp b/llama.cpp/server/server.cpp
index 4da15d9424..e44d29b434 100644
--- a/llama.cpp/server/server.cpp
+++ b/llama.cpp/server/server.cpp
@@ -1981,6 +1981,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --log-disable         disables logging to a file.\n");
     printf("  --nobrowser           Do not attempt to open a web browser tab at startup.\n");
     printf("  --unsecure            disables pledge() sandboxing on Linux and OpenBSD\n");
+    printf("  --nocompile           disables runtime compilation of gpu support\n");
     printf("\n");
 }
 
@@ -2329,6 +2330,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
     else if (arg == "--server")
     {
     }
+    else if (arg == "--nocompile")
+    {
+        FLAG_nocompile = true;
+    }
     else
     {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
diff --git a/llamafile/cuda.c b/llamafile/cuda.c
index c42b510913..5157bc7921 100644
--- a/llamafile/cuda.c
+++ b/llamafile/cuda.c
@@ -307,7 +307,7 @@ static dontinline bool GetNvccArchFlag(char *nvcc, char flag[static 32]) {
     return true;
 }
 
-static bool CompileNativeCuda(char dso[static PATH_MAX]) {
+static bool CompileNativeCuda(const char *dso) {
 
     // extract source code
     char src[PATH_MAX];
@@ -336,9 +336,6 @@ static bool CompileNativeCuda(char dso[static PATH_MAX]) {
     }
 
     // check if dso is already compiled
-    llamafile_get_app_dir(dso, PATH_MAX);
-    strlcat(dso, "ggml-cuda.", PATH_MAX);
-    strlcat(dso, GetDsoExtension(), PATH_MAX);
     if (!needs_rebuild) {
         switch (llamafile_is_file_newer_than(src, dso)) {
             case -1:
@@ -390,7 +387,7 @@ static bool CompileNativeCuda(char dso[static PATH_MAX]) {
     return false;
 }
 
-static bool ExtractCudaDso(char dso[static PATH_MAX]) {
+static bool ExtractCudaDso(const char *dso) {
 
     // see if prebuilt dso is bundled in zip assets
     char zip[80];
@@ -401,11 +398,6 @@ static bool ExtractCudaDso(char dso[static PATH_MAX]) {
         return false;
     }
 
-    // get destination path
-    llamafile_get_app_dir(dso, PATH_MAX);
-    strlcat(dso, "ggml-cuda.", PATH_MAX);
-    strlcat(dso, GetDsoExtension(), PATH_MAX);
-
     // extract prebuilt dso
     return llamafile_extract(zip, dso);
 }
@@ -418,7 +410,7 @@ static bool LinkCudaDso(char *dso) {
     if (!lib) {
         tinyprint(2, Dlerror(), ": failed to load library\n", NULL);
         if ((IsLinux() || IsBsd()) && !commandv("cc", dso, PATH_MAX)) {
-            tinyprint(2, "you need to install a c compiler for gpu support\n", NULL);
+            tinyprint(2, "you need to install cc for gpu support\n", NULL);
         }
         return false;
     }
@@ -457,21 +449,29 @@ static bool LinkCudaDso(char *dso) {
 }
 
 static bool ImportCudaImpl(void) {
-    char path[PATH_MAX];
 
     // No dynamic linking support on OpenBSD yet.
     if (IsOpenbsd()) {
         return false;
     }
 
-    // try building cuda code from source using cublas
-    if (CompileNativeCuda(path)) {
-        return LinkCudaDso(path);
+    // Get path of CUDA support DSO.
+    char dso[PATH_MAX];
+    llamafile_get_app_dir(dso, PATH_MAX);
+    strlcat(dso, "ggml-cuda.", PATH_MAX);
+    strlcat(dso, GetDsoExtension(), PATH_MAX);
+    if (FLAG_nocompile) {
+        return LinkCudaDso(dso);
+    }
+
+    // Try building CUDA from source with mighty cuBLAS.
+    if (CompileNativeCuda(dso)) {
+        return LinkCudaDso(dso);
     }
 
-    // try using a prebuilt path
-    if (ExtractCudaDso(path)) {
-        return LinkCudaDso(path);
+    // Try extracting prebuilt tinyBLAS DSO from PKZIP.
+    if (ExtractCudaDso(dso)) {
+        return LinkCudaDso(dso);
     }
 
     // too bad
diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h
index 73d91d64bc..a830b9f09e 100644
--- a/llamafile/llamafile.h
+++ b/llamafile/llamafile.h
@@ -6,6 +6,8 @@
 extern "C" {
 #endif
 
+extern bool FLAG_nocompile;
+
 struct llamafile;
 struct llamafile *llamafile_open(const char *, const char *);
 void llamafile_close(struct llamafile *);
@@ -19,7 +21,7 @@ FILE *llamafile_fp(struct llamafile *);
 
 void llamafile_init(void);
 void llamafile_check_cpu(void);
-void llamafile_help(const char *) wontreturn;
+void llamafile_help(const char *) __attribute__((__noreturn__));
 const char *llamafile_get_tmp_dir(void);
 bool llamafile_extract(const char *, const char *);
 int llamafile_is_file_newer_than(const char *, const char *);
diff --git a/llamafile/metal.c b/llamafile/metal.c
index 136c232685..94172b11d0 100644
--- a/llamafile/metal.c
+++ b/llamafile/metal.c
@@ -80,7 +80,12 @@ static const char *Dlerror(void) {
     return msg;
 }
 
-static bool ImportMetalImpl(void) {
+static bool FileExists(const char *path) {
+    struct stat st;
+    return !stat(path, &st);
+}
+
+static bool BuildMetal(const char *dso) {
 
     // extract source code
     char src[PATH_MAX];
@@ -109,9 +114,6 @@ static bool ImportMetalImpl(void) {
     }
 
     // determine if we need to build
-    char dso[PATH_MAX];
-    llamafile_get_app_dir(dso, PATH_MAX);
-    strlcat(dso, "ggml-metal.dylib", sizeof(dso));
     if (!needs_rebuild) {
         switch (llamafile_is_file_newer_than(src, dso)) {
             case -1:
@@ -180,6 +182,11 @@ static bool ImportMetalImpl(void) {
         }
     }
 
+    return true;
+}
+
+static bool LinkMetal(const char *dso) {
+
     // runtime link dynamic shared object
     void *lib;
     lib = cosmo_dlopen(dso, RTLD_LAZY);
@@ -210,8 +217,31 @@ static bool ImportMetalImpl(void) {
     return true;
 }
 
+static bool ImportMetalImpl(void) {
+
+    // Ensure this is MacOS ARM64.
+    if (!IsXnuSilicon()) {
+        return false;
+    }
+
+    // Get path of DSO.
+    char dso[PATH_MAX];
+    llamafile_get_app_dir(dso, PATH_MAX);
+    strlcat(dso, "ggml-metal.dylib", sizeof(dso));
+    if (FLAG_nocompile) {
+        return LinkMetal(dso);
+    }
+
+    // Build and link Metal support DSO if possible.
+    if (BuildMetal(dso)) {
+        return LinkMetal(dso);
+    } else {
+        return false;
+    }
+}
+
 static void ImportMetal(void) {
-    if (IsXnuSilicon() && ImportMetalImpl()) {
+    if (ImportMetalImpl()) {
         ggml_metal.supported = true;
         ggml_metal.backend_init();
         tinyprint(2, "Apple Metal GPU support successfully loaded\n", NULL);
diff --git a/llamafile/nocompile.c b/llamafile/nocompile.c
new file mode 100644
index 0000000000..ad0e1df8db
--- /dev/null
+++ b/llamafile/nocompile.c
@@ -0,0 +1,20 @@
+// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2023 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "llamafile.h"
+
+bool FLAG_nocompile;
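
Usage note: after this patch, GPU support resolves in stages — link a cached DSO from the app directory, else compile from source, else (for CUDA) extract a prebuilt tinyBLAS DSO — and the new `--nocompile` flag short-circuits everything except the link step, for both the CUDA and Metal paths. A sketch of the resulting behavior (model filename hypothetical):

    # Default: build the GPU module (ggml-cuda or ggml-metal.dylib) from
    # source when a compiler is available, caching the resulting DSO under
    # the directory returned by llamafile_get_app_dir().
    ./llamafile -m model.gguf

    # --nocompile: link whatever DSO is already cached without ever invoking
    # nvcc/hipcc/cc; if none exists, llamafile falls back to CPU inference.
    ./llamafile -m model.gguf --nocompile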