Include llamafiler in llamafile binary
It's now possible to say:

    llamafile --server --v2 --help
    llamafile --server --v2

To use llamafiler from any llamafile binary.
jart committed Jan 5, 2025
1 parent c293359 commit e64c7e2
Showing 18 changed files with 507 additions and 417 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -164,6 +164,17 @@ ChatCompletionMessage(content='There once was a programmer named Mike\nWho wrote

</details>

## New v2 Server

We have a new server with a better web GUI. It implements
OpenAI-compatible API endpoints, including embeddings, is designed to
be more reliable, and is better able to recycle context windows across
multiple slots. To try it, run:

```
llamafile --server --v2 --help
llamafile --server --v2
```

## Other example llamafiles

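As a concrete illustration of the OpenAI-compatible API the new README
section above advertises, here is a minimal client sketch. It is not
part of the commit: the `/v1/embeddings` path and the JSON body follow
the usual OpenAI convention, and `127.0.0.1:8080` assumes the default
port seen later in the listen.cpp hunk.

```
// Hypothetical client, not from this commit: POSTs an OpenAI-style
// embeddings request to a locally running `llamafile --server --v2`.
// Endpoint path and port are assumptions, not taken from the diff.
#include <curl/curl.h>
#include <cstdio>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL *curl = curl_easy_init();
    if (!curl) return 1;
    struct curl_slist *hdrs =
        curl_slist_append(nullptr, "Content-Type: application/json");
    curl_easy_setopt(curl, CURLOPT_URL, "http://127.0.0.1:8080/v1/embeddings");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, hdrs);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "{\"input\": \"hello world\"}");
    CURLcode rc = curl_easy_perform(curl);  // response JSON is written to stdout
    if (rc != CURLE_OK)
        fprintf(stderr, "request failed: %s\n", curl_easy_strerror(rc));
    curl_slist_free_all(hdrs);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return rc == CURLE_OK ? 0 : 1;
}
```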
6 changes: 3 additions & 3 deletions build/config.mk
@@ -2,7 +2,7 @@
#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘

PREFIX = /usr/local
COSMOCC = .cosmocc/4.0.0
COSMOCC = .cosmocc/4.0.2
TOOLCHAIN = $(COSMOCC)/bin/cosmo

CC = $(TOOLCHAIN)cc
@@ -52,5 +52,5 @@ clean:; rm -rf o
.PHONY: distclean
distclean:; rm -rf o .cosmocc

.cosmocc/4.0.0:
build/download-cosmocc.sh $@ 4.0.0 15d8ab4442c94ce925f1d59884c772ab817af5e2889549c21ce5fa11c5d773bc
.cosmocc/4.0.2:
build/download-cosmocc.sh $@ 4.0.2 85b8c37a406d862e656ad4ec14be9f6ce474c1b436b9615e91a55208aced3f44
5 changes: 5 additions & 0 deletions llama.cpp/common.cpp
@@ -217,6 +217,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
FLAG_fast = true;
return true;
}
if (arg == "--v2") {
FLAG_v2 = true;
return true;
}
if (arg == "--iq") {
FLAG_iq = true;
return true;
@@ -633,6 +637,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
if (arg == "-m" || arg == "--model") {
CHECK_ARG
params.model = argv[i];
FLAG_model = params.model.c_str(); // [jart]
return true;
}
if (arg == "-md" || arg == "--model-draft") {
5 changes: 4 additions & 1 deletion llama.cpp/main/BUILD.mk
@@ -11,13 +11,16 @@ LLAMA_CPP_MAIN_OBJS = $(LLAMA_CPP_MAIN_SRCS:%.cpp=o/$(MODE)/%.o)
o/$(MODE)/llama.cpp/main/main: \
o/$(MODE)/llama.cpp/main/main.o \
o/$(MODE)/llama.cpp/main/embedding.o \
o/$(MODE)/llamafile/server/server.a \
o/$(MODE)/llama.cpp/server/server.a \
o/$(MODE)/llama.cpp/llava/llava.a \
o/$(MODE)/llama.cpp/llama.cpp.a \
o/$(MODE)/llamafile/highlight/highlight.a \
o/$(MODE)/third_party/stb/stb.a \
o/$(MODE)/llama.cpp/main/main.1.asc.zip.o \
$(LLAMA_CPP_SERVER_ASSETS:%=o/$(MODE)/%.zip.o)
o/$(MODE)/llamafile/server/main.1.asc.zip.o \
$(LLAMA_CPP_SERVER_ASSETS:%=o/$(MODE)/%.zip.o) \
$(LLAMAFILE_SERVER_ASSETS:%=o/$(MODE)/%.zip.o) \

$(LLAMA_CPP_MAIN_OBJS): llama.cpp/main/BUILD.mk

29 changes: 21 additions & 8 deletions llama.cpp/main/main.cpp
@@ -27,6 +27,7 @@
#include "llama.cpp/ggml-metal.h"
#include "llama.cpp/llava/llava.h"
#include "llama.cpp/server/server.h"
#include "llamafile/server/prog.h"

static llama_context ** g_ctx;
static llama_model ** g_model;
@@ -150,9 +151,11 @@ enum Program {
SERVER,
CHATBOT,
EMBEDDING,
LLAMAFILER,
};

enum Program determine_program(char *argv[]) {
bool v2 = false;
enum Program prog = UNKNOWN;
for (int i = 0; argv[i]; ++i) {
if (!strcmp(argv[i], "--cli")) {
@@ -163,16 +166,18 @@ enum Program determine_program(char *argv[]) {
prog = SERVER;
} else if (!strcmp(argv[i], "--embedding")) {
prog = EMBEDDING;
} else if (!strcmp(argv[i], "--v2")) {
v2 = true;
}
}
if (prog == SERVER && v2) {
prog = LLAMAFILER;
}
return prog;
}

int main(int argc, char ** argv) {

mallopt(M_GRANULARITY, 2 * 1024 * 1024);
mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
llamafile_check_cpu();

if (llamafile_has(argv, "--version")) {
puts("llamafile v" LLAMAFILE_VERSION_STRING);
@@ -182,16 +187,24 @@ int main(int argc, char ** argv) {
if (llamafile_has(argv, "-h") ||
llamafile_has(argv, "-help") ||
llamafile_has(argv, "--help")) {
llamafile_help("/zip/llama.cpp/main/main.1.asc");
if (llamafile_has(argv, "--v2")) {
llamafile_help("/zip/llamafile/server/main.1.asc");
} else {
llamafile_help("/zip/llama.cpp/main/main.1.asc");
}
__builtin_unreachable();
}

llamafile_check_cpu();
enum Program prog = determine_program(argv);
if (prog == LLAMAFILER)
return lf::server::main(argc, argv);

mallopt(M_GRANULARITY, 2 * 1024 * 1024);
mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
ShowCrashReports();
argc = cosmo_args("/zip/.args", &argv);

enum Program prog = determine_program(argv);

if (prog == SERVER)
return server_cli(argc, argv);

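To make the selection rule above concrete, here is a condensed,
standalone sketch. It is not the commit's code: `pick` is a
hypothetical stand-in for determine_program(), and only the flags
visible in the hunk are handled. It shows that --v2 takes effect only
when --server is also present.

```
// Condensed re-statement of the dispatch rule; not the code from main.cpp.
#include <cassert>
#include <cstring>

enum Program { UNKNOWN, SERVER, CHATBOT, EMBEDDING, LLAMAFILER };

static Program pick(char *argv[]) {
    bool v2 = false;
    Program prog = UNKNOWN;
    for (int i = 0; argv[i]; ++i) {
        if (!strcmp(argv[i], "--server")) prog = SERVER;
        else if (!strcmp(argv[i], "--embedding")) prog = EMBEDDING;
        else if (!strcmp(argv[i], "--v2")) v2 = true;
    }
    return prog == SERVER && v2 ? LLAMAFILER : prog;
}

int main() {
    char *a[] = {(char *)"llamafile", (char *)"--server", (char *)"--v2", nullptr};
    char *b[] = {(char *)"llamafile", (char *)"--server", nullptr};
    char *c[] = {(char *)"llamafile", (char *)"--v2", nullptr};
    assert(pick(a) == LLAMAFILER); // --server --v2 routes into lf::server::main()
    assert(pick(b) == SERVER);     // --server alone keeps the legacy llama.cpp server
    assert(pick(c) == UNKNOWN);    // --v2 without --server selects nothing here
    return 0;
}
```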
3 changes: 2 additions & 1 deletion llamafile/chatbot_main.cpp
@@ -107,6 +107,7 @@ bool is_base_model() {
}

int main(int argc, char **argv) {
signal(SIGPIPE, SIG_IGN);

// print logo
logo(argv);
@@ -147,7 +148,7 @@ int main(int argc, char **argv) {
if (g_params.n_ctx < g_params.n_batch)
g_params.n_batch = g_params.n_ctx;

bool want_server = !llamafile_has(argv, "--chat");
bool want_server = !llamafile_has(argv, "--chat") && !llamafile_has(argv, "--v2");
if (want_server) {
print_ephemeral("launching server...");
pthread_t thread;
9 changes: 9 additions & 0 deletions llamafile/flags.cpp
@@ -52,6 +52,7 @@ bool FLAG_recompile = false;
bool FLAG_tinyblas = false;
bool FLAG_trace = false;
bool FLAG_unsecure = false;
bool FLAG_v2 = false;
const char *FLAG_chat_template = "";
const char *FLAG_db = nullptr;
const char *FLAG_db_startup_sql = "PRAGMA journal_mode=WAL;"
@@ -160,6 +161,11 @@ void llamafile_get_flags(int argc, char **argv) {
//////////////////////////////////////////////////////////////////////
// chatbot flags

if (!strcmp(flag, "--v2")) {
FLAG_v2 = true;
continue;
}

if (!strcmp(flag, "--ascii")) {
FLAG_ascii = true;
continue;
@@ -215,6 +221,9 @@
//////////////////////////////////////////////////////////////////////
// server flags

if (!strcmp(flag, "--server"))
continue;

if (!strcmp(flag, "-l") || !strcmp(flag, "--listen")) {
if (i == argc)
missing("--listen");
1 change: 1 addition & 0 deletions llamafile/llamafile.h
@@ -23,6 +23,7 @@ extern bool FLAG_tinyblas;
extern bool FLAG_trace;
extern bool FLAG_trap;
extern bool FLAG_unsecure;
extern bool FLAG_v2;
extern const char *FLAG_chat_template;
extern const char *FLAG_db;
extern const char *FLAG_db_startup_sql;
3 changes: 3 additions & 0 deletions llamafile/server/client.cpp
@@ -704,6 +704,8 @@ Client::dispatcher()
if (p1 == "flagz")
return flagz();

#if 0
// TODO: implement frontend for database
if (p1 == "db/chats" || p1 == "db/chats/")
return db_chats();
if (p1.starts_with("db/chat/")) {
@@ -721,6 +723,7 @@
if (id != -1)
return db_messages(id);
}
#endif

// serve static endpoints
int infd;
19 changes: 13 additions & 6 deletions llamafile/server/listen.cpp
@@ -42,21 +42,21 @@ print_listening_url(unsigned ip, int port)
}

int
create_listening_socket(const char* hostport)
create_listening_socket(const char* hostport, unsigned* out_ip, int* out_port)
{
// parse hostname:port
char* p;
char* host;
char* port;
char addr[128];
const char* host;
const char* port;
strlcpy(addr, hostport, sizeof(addr));
if ((p = strrchr(addr, ':'))) {
*p = '\0';
host = addr;
port = p + 1;
} else {
host = NULL;
port = addr;
host = addr;
port = "8080";
}

// turn listen address names into numbers
@@ -103,14 +103,21 @@
exit(1);
}
struct sockaddr_in* in = (struct sockaddr_in*)ai->ai_addr;
if (out_port)
*out_port = ntohs(in->sin_port);
if (ntohl(in->sin_addr.s_addr) == INADDR_ANY) {
int i;
uint32_t* hostips;
for (hostips = GetHostIps(), i = 0; hostips[i]; ++i)
for (hostips = GetHostIps(), i = 0; hostips[i]; ++i) {
print_listening_url(hostips[i], ntohs(in->sin_port));
if (out_ip)
*out_ip = hostips[i];
}
free(hostips);
} else {
print_listening_url(ntohl(in->sin_addr.s_addr), ntohs(in->sin_port));
if (out_ip)
*out_ip = ntohl(in->sin_addr.s_addr);
}

freeaddrinfo(ai);
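A usage sketch for the widened create_listening_socket() interface
shown above; it is not part of the commit. It assumes the int return
value is the listening file descriptor and declares the prototype at
global scope for brevity (in the tree it presumably sits with the rest
of the lf::server code). Note the parsing change in the hunk: a bare
address with no colon is now treated as a host with default port 8080,
rather than as a port number on all interfaces.

```
// Hypothetical caller; prototype placement and return-value meaning are assumptions.
#include <cstdio>

int create_listening_socket(const char *hostport, unsigned *out_ip, int *out_port);

int main() {
    unsigned ip = 0; // filled with the bound IPv4 address, host byte order
    int port = 0;    // filled with the bound port
    int fd = create_listening_socket("127.0.0.1:8080", &ip, &port);
    // Either out pointer may be passed as nullptr when the caller doesn't care.
    printf("listening on fd %d at %u.%u.%u.%u:%d\n", fd,
           (ip >> 24) & 255, (ip >> 16) & 255, (ip >> 8) & 255, ip & 255, port);
    return 0;
}
```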
2 changes: 2 additions & 0 deletions llamafile/server/log.cpp
@@ -23,6 +23,8 @@
namespace lf {
namespace server {

bool g_log_disable;

static thread_local char g_thread_name[128];

const char*
15 changes: 9 additions & 6 deletions llamafile/server/log.h
@@ -19,16 +19,19 @@
#include <libc/intrin/kprintf.h>

#define SLOG(FMT, ...) \
kprintf("%s %s:%d %s " FMT "\n", \
get_log_timestamp(), \
__FILE__, \
__LINE__, \
get_thread_name(), \
##__VA_ARGS__)
(!lf::server::g_log_disable && (kprintf("%s %s:%d %s " FMT "\n", \
get_log_timestamp(), \
__FILE__, \
__LINE__, \
get_thread_name(), \
##__VA_ARGS__), \
0))

namespace lf {
namespace server {

extern bool g_log_disable;

const char*
get_thread_name(void);

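The reworked SLOG above wraps kprintf in a short-circuit on
g_log_disable and ends with a comma-operator 0, so the macro stays a
plain expression whether or not logging is enabled. A standalone sketch
of the same idiom (not the commit's code; printf stands in for
kprintf):

```
// Standalone illustration of the guard idiom used by the new SLOG macro.
#include <cstdio>

static bool g_log_disable = false;

#define LOG(FMT, ...) \
    (!g_log_disable && (printf(FMT "\n", ##__VA_ARGS__), 0))

int main() {
    LOG("hello %d", 42);  // printed: logging is enabled
    g_log_disable = true;
    LOG("dropped %d", 7); // printf is never evaluated once logging is disabled
    return 0;
}
```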
13 changes: 6 additions & 7 deletions llamafile/server/main.1
@@ -3,19 +3,18 @@
.Os Mozilla Ocho
.Sh NAME
.Nm llamafiler
.Nd fast reliable embedding server
.Nd fast reliable large language model server
.Sh SYNOPSIS
.Nm
.Fl m Ar model.gguf
.Op flags...
.Sh DESCRIPTION
.Nm
is a brand new HTTP server for Large Language Models (LLMs). To date,
its development has been focused on doing fewer things really well, and
right now that's serving embeddings. It offers you 3.4x the performance,
stronger security, client request prioritization, request preemption, as
well as request isolation that helps ensure software bugs won't cause
the whole server to crash.
llamafiler is an HTTP server for Large Language Models (LLMs). It
includes a web GUI for both chatbot and text completion. It serves
OpenAI-compatible endpoints for embeddings, completions, and chat
completions, and it recycles context windows more intelligently across
multiple slots serving multiple clients.
.Sh OPTIONS
The following options are available:
.Bl -tag -width indent