Introduce --nocompile flag
This flag prevents llamafile from compiling GPU support at runtime.
It's useful when you want to build the DSO by hand.
jart committed Jan 3, 2024
1 parent f6ee33c commit 8762f13
Showing 7 changed files with 86 additions and 24 deletions.
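With --nocompile, llamafile skips the runtime nvcc/hipcc build and goes straight to dlopen()ing a GPU support module the user has built by hand. Judging from the path logic in llamafile/cuda.c below, that module lives at <app dir>/ggml-cuda.<platform DSO extension>. Here is a minimal sketch of that resolution, assuming a Linux host where the app dir is ~/.llamafile/ and the extension is "so"; llamafile derives the real values via llamafile_get_app_dir() and GetDsoExtension():

// nocompile_path.c: a hypothetical illustration (not from the commit's
// sources) that prints where `llamafile --nocompile` would look for a
// hand-built CUDA module, under the Linux assumptions stated above.
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    char dso[PATH_MAX];
    const char *home = getenv("HOME");
    snprintf(dso, sizeof(dso), "%s/.llamafile/ggml-cuda.so", home ? home : ".");
    printf("hand-built GPU module expected at: %s\n", dso);
    return 0;
}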
llama.cpp/common.cpp: 3 additions & 0 deletions
@@ -521,6 +521,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.infill = true;
         } else if (arg == "--unsecure") {
             params.unsecure = true;
+        } else if (arg == "--nocompile") {
+            FLAG_nocompile = true;
         } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
             params.dump_kv_cache = true;
         } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
@@ -929,6 +931,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -ld LOGDIR, --logdir LOGDIR\n");
     printf(" path under which to save YAML logs (no logging if unset)\n");
     printf(" --unsecure disables pledge() sandboxing on Linux and OpenBSD\n");
+    printf(" --nocompile disables runtime compilation of gpu support\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
     printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
llama.cpp/ggml-cuda.cu: 2 additions & 0 deletions
@@ -37,6 +37,7 @@
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -46,6 +47,7 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
llama.cpp/server/server.cpp: 5 additions & 0 deletions
@@ -1981,6 +1981,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" --log-disable disables logging to a file.\n");
     printf(" --nobrowser Do not attempt to open a web browser tab at startup.\n");
     printf(" --unsecure disables pledge() sandboxing on Linux and OpenBSD\n");
+    printf(" --nocompile disables runtime compilation of gpu support\n");
     printf("\n");
 }
 
@@ -2329,6 +2330,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         else if (arg == "--server")
         {
         }
+        else if (arg == "--nocompile")
+        {
+            FLAG_nocompile = true;
+        }
         else
         {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
llamafile/cuda.c: 18 additions & 18 deletions
@@ -307,7 +307,7 @@ static dontinline bool GetNvccArchFlag(char *nvcc, char flag[static 32]) {
     return true;
 }
 
-static bool CompileNativeCuda(char dso[static PATH_MAX]) {
+static bool CompileNativeCuda(const char *dso) {
 
     // extract source code
     char src[PATH_MAX];
@@ -336,9 +336,6 @@
     }
 
     // check if dso is already compiled
-    llamafile_get_app_dir(dso, PATH_MAX);
-    strlcat(dso, "ggml-cuda.", PATH_MAX);
-    strlcat(dso, GetDsoExtension(), PATH_MAX);
     if (!needs_rebuild) {
         switch (llamafile_is_file_newer_than(src, dso)) {
         case -1:
@@ -390,7 +387,7 @@
     return false;
 }
 
-static bool ExtractCudaDso(char dso[static PATH_MAX]) {
+static bool ExtractCudaDso(const char *dso) {
 
     // see if prebuilt dso is bundled in zip assets
     char zip[80];
@@ -401,11 +398,6 @@
         return false;
     }
 
-    // get destination path
-    llamafile_get_app_dir(dso, PATH_MAX);
-    strlcat(dso, "ggml-cuda.", PATH_MAX);
-    strlcat(dso, GetDsoExtension(), PATH_MAX);
-
     // extract prebuilt dso
     return llamafile_extract(zip, dso);
 }
@@ -418,7 +410,7 @@ static bool LinkCudaDso(char *dso) {
     if (!lib) {
         tinyprint(2, Dlerror(), ": failed to load library\n", NULL);
         if ((IsLinux() || IsBsd()) && !commandv("cc", dso, PATH_MAX)) {
-            tinyprint(2, "you need to install a c compiler for gpu support\n", NULL);
+            tinyprint(2, "you need to install cc for gpu support\n", NULL);
         }
         return false;
     }
@@ -457,21 +449,29 @@
 }
 
 static bool ImportCudaImpl(void) {
-    char path[PATH_MAX];
 
     // No dynamic linking support on OpenBSD yet.
     if (IsOpenbsd()) {
         return false;
     }
 
-    // try building cuda code from source using cublas
-    if (CompileNativeCuda(path)) {
-        return LinkCudaDso(path);
+    // Get path of CUDA support DSO.
+    char dso[PATH_MAX];
+    llamafile_get_app_dir(dso, PATH_MAX);
+    strlcat(dso, "ggml-cuda.", PATH_MAX);
+    strlcat(dso, GetDsoExtension(), PATH_MAX);
+    if (FLAG_nocompile) {
+        return LinkCudaDso(dso);
+    }
+
+    // Try building CUDA from source with mighty cuBLAS.
+    if (CompileNativeCuda(dso)) {
+        return LinkCudaDso(dso);
     }
 
-    // try using a prebuilt path
-    if (ExtractCudaDso(path)) {
-        return LinkCudaDso(path);
+    // Try extracting prebuilt tinyBLAS DSO from PKZIP.
+    if (ExtractCudaDso(dso)) {
+        return LinkCudaDso(dso);
     }
 
     // too bad
llamafile/llamafile.h: 3 additions & 1 deletion
@@ -6,6 +6,8 @@
 extern "C" {
 #endif
 
+extern bool FLAG_nocompile;
+
 struct llamafile;
 struct llamafile *llamafile_open(const char *, const char *);
 void llamafile_close(struct llamafile *);
@@ -19,7 +21,7 @@ FILE *llamafile_fp(struct llamafile *);
 
 void llamafile_init(void);
 void llamafile_check_cpu(void);
-void llamafile_help(const char *) wontreturn;
+void llamafile_help(const char *) __attribute__((__noreturn__));
 const char *llamafile_get_tmp_dir(void);
 bool llamafile_extract(const char *, const char *);
 int llamafile_is_file_newer_than(const char *, const char *);
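The extern declaration above pairs with the single definition in the new llamafile/nocompile.c at the bottom of this diff, so every file that includes llamafile.h shares one boolean; that is how common.cpp, server.cpp, cuda.c, and metal.c can all read FLAG_nocompile. A self-contained sketch of the same extern-flag pattern, collapsed into one file and using a hypothetical FLAG_verbose:

#include <stdbool.h>
#include <stdio.h>

extern bool FLAG_verbose;  // what a shared header would declare
bool FLAG_verbose;         // the single definition (normally its own .c file)

int main(void) {
    FLAG_verbose = true;   // e.g. flipped by an argument parser
    if (FLAG_verbose) {
        puts("verbose mode enabled");
    }
    return 0;
}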
llamafile/metal.c: 35 additions & 5 deletions
@@ -80,7 +80,12 @@ static const char *Dlerror(void) {
     return msg;
 }
 
-static bool ImportMetalImpl(void) {
+static bool FileExists(const char *path) {
+    struct stat st;
+    return !stat(path, &st);
+}
+
+static bool BuildMetal(const char *dso) {
 
     // extract source code
     char src[PATH_MAX];
@@ -109,9 +114,6 @@
     }
 
     // determine if we need to build
-    char dso[PATH_MAX];
-    llamafile_get_app_dir(dso, PATH_MAX);
-    strlcat(dso, "ggml-metal.dylib", sizeof(dso));
     if (!needs_rebuild) {
         switch (llamafile_is_file_newer_than(src, dso)) {
         case -1:
@@ -180,6 +182,11 @@
         }
     }
 
+    return true;
+}
+
+static bool LinkMetal(const char *dso) {
+
     // runtime link dynamic shared object
     void *lib;
     lib = cosmo_dlopen(dso, RTLD_LAZY);
@@ -210,8 +217,31 @@
     return true;
 }
 
+static bool ImportMetalImpl(void) {
+
+    // Ensure this is MacOS ARM64.
+    if (!IsXnuSilicon()) {
+        return false;
+    }
+
+    // Get path of DSO.
+    char dso[PATH_MAX];
+    llamafile_get_app_dir(dso, PATH_MAX);
+    strlcat(dso, "ggml-metal.dylib", sizeof(dso));
+    if (FLAG_nocompile) {
+        return LinkMetal(dso);
+    }
+
+    // Build and link Metal support DSO if possible.
+    if (BuildMetal(dso)) {
+        return LinkMetal(dso);
+    } else {
+        return false;
+    }
+}
+
 static void ImportMetal(void) {
-    if (IsXnuSilicon() && ImportMetalImpl()) {
+    if (ImportMetalImpl()) {
         ggml_metal.supported = true;
         ggml_metal.backend_init();
         tinyprint(2, "Apple Metal GPU support successfully loaded\n", NULL);
llamafile/nocompile.c: 20 additions & 0 deletions (new file)
@@ -0,0 +1,20 @@
+// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2023 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "llamafile.h"
+
+bool FLAG_nocompile;
