Include llamafiler in llamafile binary
It's now possible to say:

    llamafile --server --v2 --help
    llamafile --server --v2

To use llamafiler from any llamafile binary.
jart committed Jan 5, 2025
1 parent c293359 commit e64c7e2
Showing 18 changed files with 507 additions and 417 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -164,6 +164,17 @@ ChatCompletionMessage(content='There once was a programmer named Mike\nWho wrote

</details>

## New v2 Server

We have a new server with a better web GUI. It implements
OpenAI-compatible API endpoints, including embeddings, is designed to
be more reliable, and is better able to recycle context windows across
multiple slots. To try it, run:

```
llamafile --server --v2 --help
llamafile --server --v2
```

## Other example llamafiles

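As a concrete illustration of the OpenAI-compatible API the new README
section above advertises, here is a minimal client sketch. It is not
part of the commit: the `/v1/embeddings` path and the JSON body follow
the usual OpenAI convention, and `127.0.0.1:8080` assumes the default
port seen later in the listen.cpp hunk.

```
// Hypothetical client, not from this commit: POSTs an OpenAI-style
// embeddings request to a locally running `llamafile --server --v2`.
// Endpoint path and port are assumptions, not taken from the diff.
#include <curl/curl.h>
#include <cstdio>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL *curl = curl_easy_init();
    if (!curl) return 1;
    struct curl_slist *hdrs =
        curl_slist_append(nullptr, "Content-Type: application/json");
    curl_easy_setopt(curl, CURLOPT_URL, "http://127.0.0.1:8080/v1/embeddings");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, hdrs);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "{\"input\": \"hello world\"}");
    CURLcode rc = curl_easy_perform(curl);  // response JSON is written to stdout
    if (rc != CURLE_OK)
        fprintf(stderr, "request failed: %s\n", curl_easy_strerror(rc));
    curl_slist_free_all(hdrs);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return rc == CURLE_OK ? 0 : 1;
}
```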
6 changes: 3 additions & 3 deletions build/config.mk
@@ -2,7 +2,7 @@
#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘

PREFIX = /usr/local
COSMOCC = .cosmocc/4.0.0
COSMOCC = .cosmocc/4.0.2
TOOLCHAIN = $(COSMOCC)/bin/cosmo

CC = $(TOOLCHAIN)cc
@@ -52,5 +52,5 @@ clean:; rm -rf o
.PHONY: distclean
distclean:; rm -rf o .cosmocc

.cosmocc/4.0.0:
build/download-cosmocc.sh $@ 4.0.0 15d8ab4442c94ce925f1d59884c772ab817af5e2889549c21ce5fa11c5d773bc
.cosmocc/4.0.2:
build/download-cosmocc.sh $@ 4.0.2 85b8c37a406d862e656ad4ec14be9f6ce474c1b436b9615e91a55208aced3f44
5 changes: 5 additions & 0 deletions llama.cpp/common.cpp
@@ -217,6 +217,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
FLAG_fast = true;
return true;
}
if (arg == "--v2") {
FLAG_v2 = true;
return true;
}
if (arg == "--iq") {
FLAG_iq = true;
return true;
@@ -633,6 +637,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
if (arg == "-m" || arg == "--model") {
CHECK_ARG
params.model = argv[i];
FLAG_model = params.model.c_str(); // [jart]
return true;
}
if (arg == "-md" || arg == "--model-draft") {
5 changes: 4 additions & 1 deletion llama.cpp/main/BUILD.mk
@@ -11,13 +11,16 @@ LLAMA_CPP_MAIN_OBJS = $(LLAMA_CPP_MAIN_SRCS:%.cpp=o/$(MODE)/%.o)
o/$(MODE)/llama.cpp/main/main: \
o/$(MODE)/llama.cpp/main/main.o \
o/$(MODE)/llama.cpp/main/embedding.o \
o/$(MODE)/llamafile/server/server.a \
o/$(MODE)/llama.cpp/server/server.a \
o/$(MODE)/llama.cpp/llava/llava.a \
o/$(MODE)/llama.cpp/llama.cpp.a \
o/$(MODE)/llamafile/highlight/highlight.a \
o/$(MODE)/third_party/stb/stb.a \
o/$(MODE)/llama.cpp/main/main.1.asc.zip.o \
$(LLAMA_CPP_SERVER_ASSETS:%=o/$(MODE)/%.zip.o)
o/$(MODE)/llamafile/server/main.1.asc.zip.o \
$(LLAMA_CPP_SERVER_ASSETS:%=o/$(MODE)/%.zip.o) \
$(LLAMAFILE_SERVER_ASSETS:%=o/$(MODE)/%.zip.o) \

$(LLAMA_CPP_MAIN_OBJS): llama.cpp/main/BUILD.mk

29 changes: 21 additions & 8 deletions llama.cpp/main/main.cpp
@@ -27,6 +27,7 @@
#include "llama.cpp/ggml-metal.h"
#include "llama.cpp/llava/llava.h"
#include "llama.cpp/server/server.h"
#include "llamafile/server/prog.h"

static llama_context ** g_ctx;
static llama_model ** g_model;
@@ -150,9 +151,11 @@ enum Program {
SERVER,
CHATBOT,
EMBEDDING,
LLAMAFILER,
};

enum Program determine_program(char *argv[]) {
bool v2 = false;
enum Program prog = UNKNOWN;
for (int i = 0; argv[i]; ++i) {
if (!strcmp(argv[i], "--cli")) {
@@ -163,16 +166,18 @@ enum Program determine_program(char *argv[]) {
prog = SERVER;
} else if (!strcmp(argv[i], "--embedding")) {
prog = EMBEDDING;
} else if (!strcmp(argv[i], "--v2")) {
v2 = true;
}
}
if (prog == SERVER && v2) {
prog = LLAMAFILER;
}
return prog;
}

int main(int argc, char ** argv) {

mallopt(M_GRANULARITY, 2 * 1024 * 1024);
mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
llamafile_check_cpu();

if (llamafile_has(argv, "--version")) {
puts("llamafile v" LLAMAFILE_VERSION_STRING);
@@ -182,16 +187,24 @@ int main(int argc, char ** argv) {
if (llamafile_has(argv, "-h") ||
llamafile_has(argv, "-help") ||
llamafile_has(argv, "--help")) {
llamafile_help("/zip/llama.cpp/main/main.1.asc");
if (llamafile_has(argv, "--v2")) {
llamafile_help("/zip/llamafile/server/main.1.asc");
} else {
llamafile_help("/zip/llama.cpp/main/main.1.asc");
}
__builtin_unreachable();
}

llamafile_check_cpu();
enum Program prog = determine_program(argv);
if (prog == LLAMAFILER)
return lf::server::main(argc, argv);

mallopt(M_GRANULARITY, 2 * 1024 * 1024);
mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
ShowCrashReports();
argc = cosmo_args("/zip/.args", &argv);

enum Program prog = determine_program(argv);

if (prog == SERVER)
return server_cli(argc, argv);

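To make the selection rule above concrete, here is a condensed,
standalone sketch. It is not the commit's code: `pick` is a
hypothetical stand-in for determine_program(), and only the flags
visible in the hunk are handled. It shows that --v2 takes effect only
when --server is also present.

```
// Condensed re-statement of the dispatch rule; not the code from main.cpp.
#include <cassert>
#include <cstring>

enum Program { UNKNOWN, SERVER, CHATBOT, EMBEDDING, LLAMAFILER };

static Program pick(char *argv[]) {
    bool v2 = false;
    Program prog = UNKNOWN;
    for (int i = 0; argv[i]; ++i) {
        if (!strcmp(argv[i], "--server")) prog = SERVER;
        else if (!strcmp(argv[i], "--embedding")) prog = EMBEDDING;
        else if (!strcmp(argv[i], "--v2")) v2 = true;
    }
    return prog == SERVER && v2 ? LLAMAFILER : prog;
}

int main() {
    char *a[] = {(char *)"llamafile", (char *)"--server", (char *)"--v2", nullptr};
    char *b[] = {(char *)"llamafile", (char *)"--server", nullptr};
    char *c[] = {(char *)"llamafile", (char *)"--v2", nullptr};
    assert(pick(a) == LLAMAFILER); // --server --v2 routes into lf::server::main()
    assert(pick(b) == SERVER);     // --server alone keeps the legacy llama.cpp server
    assert(pick(c) == UNKNOWN);    // --v2 without --server selects nothing here
    return 0;
}
```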
3 changes: 2 additions & 1 deletion llamafile/chatbot_main.cpp
@@ -107,6 +107,7 @@ bool is_base_model() {
}

int main(int argc, char **argv) {
signal(SIGPIPE, SIG_IGN);

// print logo
logo(argv);
@@ -147,7 +148,7 @@ int main(int argc, char **argv) {
if (g_params.n_ctx < g_params.n_batch)
g_params.n_batch = g_params.n_ctx;

bool want_server = !llamafile_has(argv, "--chat");
bool want_server = !llamafile_has(argv, "--chat") && !llamafile_has(argv, "--v2");
if (want_server) {
print_ephemeral("launching server...");
pthread_t thread;
9 changes: 9 additions & 0 deletions llamafile/flags.cpp
@@ -52,6 +52,7 @@ bool FLAG_recompile = false;
bool FLAG_tinyblas = false;
bool FLAG_trace = false;
bool FLAG_unsecure = false;
bool FLAG_v2 = false;
const char *FLAG_chat_template = "";
const char *FLAG_db = nullptr;
const char *FLAG_db_startup_sql = "PRAGMA journal_mode=WAL;"
@@ -160,6 +161,11 @@ void llamafile_get_flags(int argc, char **argv) {
//////////////////////////////////////////////////////////////////////
// chatbot flags

if (!strcmp(flag, "--v2")) {
FLAG_v2 = true;
continue;
}

if (!strcmp(flag, "--ascii")) {
FLAG_ascii = true;
continue;
@@ -215,6 +221,9 @@
//////////////////////////////////////////////////////////////////////
// server flags

if (!strcmp(flag, "--server"))
continue;

if (!strcmp(flag, "-l") || !strcmp(flag, "--listen")) {
if (i == argc)
missing("--listen");
1 change: 1 addition & 0 deletions llamafile/llamafile.h
@@ -23,6 +23,7 @@ extern bool FLAG_tinyblas;
extern bool FLAG_trace;
extern bool FLAG_trap;
extern bool FLAG_unsecure;
extern bool FLAG_v2;
extern const char *FLAG_chat_template;
extern const char *FLAG_db;
extern const char *FLAG_db_startup_sql;
3 changes: 3 additions & 0 deletions llamafile/server/client.cpp
@@ -704,6 +704,8 @@ Client::dispatcher()
if (p1 == "flagz")
return flagz();

#if 0
// TODO: implement frontend for database
if (p1 == "db/chats" || p1 == "db/chats/")
return db_chats();
if (p1.starts_with("db/chat/")) {
@@ -721,6 +723,7 @@
if (id != -1)
return db_messages(id);
}
#endif

// serve static endpoints
int infd;
19 changes: 13 additions & 6 deletions llamafile/server/listen.cpp
@@ -42,21 +42,21 @@ print_listening_url(unsigned ip, int port)
}

int
create_listening_socket(const char* hostport)
create_listening_socket(const char* hostport, unsigned* out_ip, int* out_port)
{
// parse hostname:port
char* p;
char* host;
char* port;
char addr[128];
const char* host;
const char* port;
strlcpy(addr, hostport, sizeof(addr));
if ((p = strrchr(addr, ':'))) {
*p = '\0';
host = addr;
port = p + 1;
} else {
host = NULL;
port = addr;
host = addr;
port = "8080";
}

// turn listen address names into numbers
@@ -103,14 +103,21 @@
exit(1);
}
struct sockaddr_in* in = (struct sockaddr_in*)ai->ai_addr;
if (out_port)
*out_port = ntohs(in->sin_port);
if (ntohl(in->sin_addr.s_addr) == INADDR_ANY) {
int i;
uint32_t* hostips;
for (hostips = GetHostIps(), i = 0; hostips[i]; ++i)
for (hostips = GetHostIps(), i = 0; hostips[i]; ++i) {
print_listening_url(hostips[i], ntohs(in->sin_port));
if (out_ip)
*out_ip = hostips[i];
}
free(hostips);
} else {
print_listening_url(ntohl(in->sin_addr.s_addr), ntohs(in->sin_port));
if (out_ip)
*out_ip = ntohl(in->sin_addr.s_addr);
}

freeaddrinfo(ai);
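A usage sketch for the widened create_listening_socket() interface
shown above; it is not part of the commit. It assumes the int return
value is the listening file descriptor and declares the prototype at
global scope for brevity (in the tree it presumably sits with the rest
of the lf::server code). Note the parsing change in the hunk: a bare
address with no colon is now treated as a host with default port 8080,
rather than as a port number on all interfaces.

```
// Hypothetical caller; prototype placement and return-value meaning are assumptions.
#include <cstdio>

int create_listening_socket(const char *hostport, unsigned *out_ip, int *out_port);

int main() {
    unsigned ip = 0; // filled with the bound IPv4 address, host byte order
    int port = 0;    // filled with the bound port
    int fd = create_listening_socket("127.0.0.1:8080", &ip, &port);
    // Either out pointer may be passed as nullptr when the caller doesn't care.
    printf("listening on fd %d at %u.%u.%u.%u:%d\n", fd,
           (ip >> 24) & 255, (ip >> 16) & 255, (ip >> 8) & 255, ip & 255, port);
    return 0;
}
```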
2 changes: 2 additions & 0 deletions llamafile/server/log.cpp
@@ -23,6 +23,8 @@
namespace lf {
namespace server {

bool g_log_disable;

static thread_local char g_thread_name[128];

const char*
15 changes: 9 additions & 6 deletions llamafile/server/log.h
@@ -19,16 +19,19 @@
#include <libc/intrin/kprintf.h>

#define SLOG(FMT, ...) \
kprintf("%s %s:%d %s " FMT "\n", \
get_log_timestamp(), \
__FILE__, \
__LINE__, \
get_thread_name(), \
##__VA_ARGS__)
(!lf::server::g_log_disable && (kprintf("%s %s:%d %s " FMT "\n", \
get_log_timestamp(), \
__FILE__, \
__LINE__, \
get_thread_name(), \
##__VA_ARGS__), \
0))

namespace lf {
namespace server {

extern bool g_log_disable;

const char*
get_thread_name(void);

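The reworked SLOG above wraps kprintf in a short-circuit on
g_log_disable and ends with a comma-operator 0, so the macro stays a
plain expression whether or not logging is enabled. A standalone sketch
of the same idiom (not the commit's code; printf stands in for
kprintf):

```
// Standalone illustration of the guard idiom used by the new SLOG macro.
#include <cstdio>

static bool g_log_disable = false;

#define LOG(FMT, ...) \
    (!g_log_disable && (printf(FMT "\n", ##__VA_ARGS__), 0))

int main() {
    LOG("hello %d", 42);  // printed: logging is enabled
    g_log_disable = true;
    LOG("dropped %d", 7); // printf is never evaluated once logging is disabled
    return 0;
}
```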
13 changes: 6 additions & 7 deletions llamafile/server/main.1
@@ -3,19 +3,18 @@
.Os Mozilla Ocho
.Sh NAME
.Nm llamafiler
.Nd fast reliable embedding server
.Nd fast reliable large language model server
.Sh SYNOPSIS
.Nm
.Fl m Ar model.gguf
.Op flags...
.Sh DESCRIPTION
.Nm
is a brand new HTTP server for Large Language Models (LLMs). To date,
its development has been focused on doing fewer things really well, and
right now that's serving embeddings. It offers you 3.4x the performance,
stronger security, client request prioritization, request preemption, as
well as request isolation that helps ensure software bugs won't cause
the whole server to crash.
llamafiler is an HTTP server for Large Language Models (LLMs). It
includes a web GUI for both chatbot and text completion. It serves
OpenAI-compatible endpoints for embeddings, completions, and chat
completions, and it recycles context windows more intelligently across
multiple slots serving multiple clients.
.Sh OPTIONS
The following options are available:
.Bl -tag -width indent