Introduce --nocompile flag
This flag prevents llamafile from compiling GPU support at runtime.
It's useful when you want to build the DSO by hand.
jart committed Jan 3, 2024
1 parent f6ee33c commit 8762f13
Showing 7 changed files with 86 additions and 24 deletions.
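With --nocompile, llamafile skips the runtime nvcc/hipcc build and goes straight to dlopen()ing a GPU support module the user has built by hand. Judging from the path logic in llamafile/cuda.c below, that module lives at <app dir>/ggml-cuda.<platform DSO extension>. Here is a minimal sketch of that resolution, assuming a Linux host where the app dir is ~/.llamafile/ and the extension is "so"; llamafile derives the real values via llamafile_get_app_dir() and GetDsoExtension():

// nocompile_path.c: a hypothetical illustration (not from the commit's
// sources) that prints where `llamafile --nocompile` would look for a
// hand-built CUDA module, under the Linux assumptions stated above.
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    char dso[PATH_MAX];
    const char *home = getenv("HOME");
    snprintf(dso, sizeof(dso), "%s/.llamafile/ggml-cuda.so", home ? home : ".");
    printf("hand-built GPU module expected at: %s\n", dso);
    return 0;
}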
llama.cpp/common.cpp: 3 additions & 0 deletions
@@ -521,6 +521,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.infill = true;
         } else if (arg == "--unsecure") {
             params.unsecure = true;
+        } else if (arg == "--nocompile") {
+            FLAG_nocompile = true;
         } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
             params.dump_kv_cache = true;
         } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
@@ -929,6 +931,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -ld LOGDIR, --logdir LOGDIR\n");
     printf(" path under which to save YAML logs (no logging if unset)\n");
     printf(" --unsecure disables pledge() sandboxing on Linux and OpenBSD\n");
+    printf(" --nocompile disables runtime compilation of gpu support\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
     printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
llama.cpp/ggml-cuda.cu: 2 additions & 0 deletions
@@ -37,6 +37,7 @@
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -46,6 +47,7 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
llama.cpp/server/server.cpp: 5 additions & 0 deletions
@@ -1981,6 +1981,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" --log-disable disables logging to a file.\n");
     printf(" --nobrowser Do not attempt to open a web browser tab at startup.\n");
     printf(" --unsecure disables pledge() sandboxing on Linux and OpenBSD\n");
+    printf(" --nocompile disables runtime compilation of gpu support\n");
     printf("\n");
 }
 
@@ -2329,6 +2330,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         else if (arg == "--server")
         {
         }
+        else if (arg == "--nocompile")
+        {
+            FLAG_nocompile = true;
+        }
         else
         {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
llamafile/cuda.c: 18 additions & 18 deletions
@@ -307,7 +307,7 @@ static dontinline bool GetNvccArchFlag(char *nvcc, char flag[static 32]) {
     return true;
 }
 
-static bool CompileNativeCuda(char dso[static PATH_MAX]) {
+static bool CompileNativeCuda(const char *dso) {
 
     // extract source code
     char src[PATH_MAX];
@@ -336,9 +336,6 @@
     }
 
     // check if dso is already compiled
-    llamafile_get_app_dir(dso, PATH_MAX);
-    strlcat(dso, "ggml-cuda.", PATH_MAX);
-    strlcat(dso, GetDsoExtension(), PATH_MAX);
     if (!needs_rebuild) {
         switch (llamafile_is_file_newer_than(src, dso)) {
         case -1:
@@ -390,7 +387,7 @@
     return false;
 }
 
-static bool ExtractCudaDso(char dso[static PATH_MAX]) {
+static bool ExtractCudaDso(const char *dso) {
 
     // see if prebuilt dso is bundled in zip assets
     char zip[80];
@@ -401,11 +398,6 @@
         return false;
     }
 
-    // get destination path
-    llamafile_get_app_dir(dso, PATH_MAX);
-    strlcat(dso, "ggml-cuda.", PATH_MAX);
-    strlcat(dso, GetDsoExtension(), PATH_MAX);
-
     // extract prebuilt dso
     return llamafile_extract(zip, dso);
 }
@@ -418,7 +410,7 @@ static bool LinkCudaDso(char *dso) {
     if (!lib) {
         tinyprint(2, Dlerror(), ": failed to load library\n", NULL);
         if ((IsLinux() || IsBsd()) && !commandv("cc", dso, PATH_MAX)) {
-            tinyprint(2, "you need to install a c compiler for gpu support\n", NULL);
+            tinyprint(2, "you need to install cc for gpu support\n", NULL);
         }
         return false;
     }
@@ -457,21 +449,29 @@
 }
 
 static bool ImportCudaImpl(void) {
-    char path[PATH_MAX];
 
     // No dynamic linking support on OpenBSD yet.
     if (IsOpenbsd()) {
         return false;
     }
 
-    // try building cuda code from source using cublas
-    if (CompileNativeCuda(path)) {
-        return LinkCudaDso(path);
+    // Get path of CUDA support DSO.
+    char dso[PATH_MAX];
+    llamafile_get_app_dir(dso, PATH_MAX);
+    strlcat(dso, "ggml-cuda.", PATH_MAX);
+    strlcat(dso, GetDsoExtension(), PATH_MAX);
+    if (FLAG_nocompile) {
+        return LinkCudaDso(dso);
+    }
+
+    // Try building CUDA from source with mighty cuBLAS.
+    if (CompileNativeCuda(dso)) {
+        return LinkCudaDso(dso);
     }
 
-    // try using a prebuilt path
-    if (ExtractCudaDso(path)) {
-        return LinkCudaDso(path);
+    // Try extracting prebuilt tinyBLAS DSO from PKZIP.
+    if (ExtractCudaDso(dso)) {
+        return LinkCudaDso(dso);
     }
 
     // too bad
llamafile/llamafile.h: 3 additions & 1 deletion
@@ -6,6 +6,8 @@
 extern "C" {
 #endif
 
+extern bool FLAG_nocompile;
+
 struct llamafile;
 struct llamafile *llamafile_open(const char *, const char *);
 void llamafile_close(struct llamafile *);
@@ -19,7 +21,7 @@ FILE *llamafile_fp(struct llamafile *);
 
 void llamafile_init(void);
 void llamafile_check_cpu(void);
-void llamafile_help(const char *) wontreturn;
+void llamafile_help(const char *) __attribute__((__noreturn__));
 const char *llamafile_get_tmp_dir(void);
 bool llamafile_extract(const char *, const char *);
 int llamafile_is_file_newer_than(const char *, const char *);
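The extern declaration above pairs with the single definition in the new llamafile/nocompile.c at the bottom of this diff, so every file that includes llamafile.h shares one boolean; that is how common.cpp, server.cpp, cuda.c, and metal.c can all read FLAG_nocompile. A self-contained sketch of the same extern-flag pattern, collapsed into one file and using a hypothetical FLAG_verbose:

#include <stdbool.h>
#include <stdio.h>

extern bool FLAG_verbose;  // what a shared header would declare
bool FLAG_verbose;         // the single definition (normally its own .c file)

int main(void) {
    FLAG_verbose = true;   // e.g. flipped by an argument parser
    if (FLAG_verbose) {
        puts("verbose mode enabled");
    }
    return 0;
}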
llamafile/metal.c: 35 additions & 5 deletions
@@ -80,7 +80,12 @@ static const char *Dlerror(void) {
     return msg;
 }
 
-static bool ImportMetalImpl(void) {
+static bool FileExists(const char *path) {
+    struct stat st;
+    return !stat(path, &st);
+}
+
+static bool BuildMetal(const char *dso) {
 
     // extract source code
     char src[PATH_MAX];
@@ -109,9 +114,6 @@
     }
 
     // determine if we need to build
-    char dso[PATH_MAX];
-    llamafile_get_app_dir(dso, PATH_MAX);
-    strlcat(dso, "ggml-metal.dylib", sizeof(dso));
     if (!needs_rebuild) {
         switch (llamafile_is_file_newer_than(src, dso)) {
         case -1:
@@ -180,6 +182,11 @@
         }
     }
 
+    return true;
+}
+
+static bool LinkMetal(const char *dso) {
+
     // runtime link dynamic shared object
     void *lib;
     lib = cosmo_dlopen(dso, RTLD_LAZY);
@@ -210,8 +217,31 @@
     return true;
 }
 
+static bool ImportMetalImpl(void) {
+
+    // Ensure this is MacOS ARM64.
+    if (!IsXnuSilicon()) {
+        return false;
+    }
+
+    // Get path of DSO.
+    char dso[PATH_MAX];
+    llamafile_get_app_dir(dso, PATH_MAX);
+    strlcat(dso, "ggml-metal.dylib", sizeof(dso));
+    if (FLAG_nocompile) {
+        return LinkMetal(dso);
+    }
+
+    // Build and link Metal support DSO if possible.
+    if (BuildMetal(dso)) {
+        return LinkMetal(dso);
+    } else {
+        return false;
+    }
+}
+
 static void ImportMetal(void) {
-    if (IsXnuSilicon() && ImportMetalImpl()) {
+    if (ImportMetalImpl()) {
         ggml_metal.supported = true;
         ggml_metal.backend_init();
         tinyprint(2, "Apple Metal GPU support successfully loaded\n", NULL);
llamafile/nocompile.c: 20 additions & 0 deletions (new file)
@@ -0,0 +1,20 @@
+// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2023 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "llamafile.h"
+
+bool FLAG_nocompile;
