PoC: server handling multiple clients with custom attention mask api #3490

Closed · wants to merge 22 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -53,6 +53,7 @@ models-mnt
/result
/save-load-state
/server
/server-parallel
/simple
/batched
/export-lora
9 changes: 6 additions & 3 deletions Makefile
@@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server server-parallel embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

# Binaries only useful for tests
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
@@ -520,7 +520,7 @@ OBJS += ggml-alloc.o ggml-backend.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

common.o: common/common.cpp common/common.h build-info.h common/log.h
common.o: common/common.cpp common/common.h build-info.h common/log.h common/httplib.h common/json.hpp
$(CXX) $(CXXFLAGS) -c $< -o $@

console.o: common/console.cpp common/console.h
@@ -572,9 +572,12 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

server-parallel: examples/server-parallel/server.cpp examples/server-parallel/frontend.h build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server-parallel $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

2 changes: 2 additions & 0 deletions common/CMakeLists.txt
@@ -11,6 +11,8 @@ add_library(${TARGET} OBJECT
grammar-parser.cpp
train.h
train.cpp
json.hpp
httplib.h
)

if (BUILD_SHARED_LIBS)
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -35,6 +35,7 @@ else()
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
add_subdirectory(server-parallel)
endif()
add_subdirectory(export-lora)
endif()
15 changes: 15 additions & 0 deletions examples/server-parallel/CMakeLists.txt
@@ -0,0 +1,15 @@
set(TARGET server-parallel)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
91 changes: 91 additions & 0 deletions examples/server-parallel/README.md
@@ -0,0 +1,91 @@
# llama.cpp/example/server-parallel

This example demonstrates a PoC HTTP API server that handles simultaneous requests. Long prompts are not supported.

Command line options:

- `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models; for example, baichuan models were built with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa`: Attempt optimizations that help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or IP address to listen on. Default: `127.0.0.1`.
- `--port`: Set the port to listen on. Default: `8080`.
- `--path`: Path from which to serve static files (default: `examples/server/public`).
- `-np N`, `--parallel N`: Set the number of slots for processing requests (default: 1).
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a. dynamic batching) (default: disabled).
- `-r ANTI_PROMPT`, `--reverse-prompt ANTI_PROMPT`: Set an anti-prompt, used as the user name in prompt generation.

## Quick Start

To get started right away, run the following command, making sure to use the correct path for the model you have:

### Unix-based systems (Linux, macOS, etc.):

```bash
./server-parallel -m models/7B/ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching
```

### Windows:

```powershell
server-parallel.exe -m models\7B\ggml-model.gguf --ctx_size 2048 -t 4 -ngl 33 --batch-size 512 --parallel 3 -n 512 --cont-batching
```
The above command will start a server that by default listens on `127.0.0.1:8080`.

## API Endpoints

- **GET** `/props`: Returns the user and assistant names used to generate the prompt.

*Response:*
```json
{
"user_name": "User:",
"assistant_name": "Assistant:"
}
```

- **POST** `/completion`: Given a prompt, returns the predicted completion. Only streaming mode is supported (see the `curl` sketch after this section).

*Options:*

`temperature`: Adjust the randomness of the generated text (default: 0.1).

`prompt`: Provide a prompt as a string. It should be a coherent continuation of the system prompt.

`system_prompt`: Provide a system prompt as a string.

`anti_prompt`: Provide the name of the user, consistent with the system prompt.

`assistant_name`: Provide the name of the assistant, consistent with the system prompt.

*Example request:*
```json
{
"system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nHuman: Hello\nAssistant: Hi, how may I help you?\nHuman:",
"anti_prompt": "Human:",
"assistant_name": "Assistant:",
"prompt": "When is the day of independency of US?",
"temperature": 0.2
}
```

*Response:*
```json
{
"content": "<token_str>"
}
```
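
For reference, the sketch below shows one way these endpoints could be exercised with `curl`, assuming the server is running on the default `127.0.0.1:8080`. The payload mirrors the example request above; the exact framing of the streamed chunks may differ from what the comments suggest.

```bash
# Fetch the user and assistant names used to build the prompt
curl http://127.0.0.1:8080/props

# Request a streamed completion; -N disables buffering so tokens appear as they arrive.
# The reply is a stream of {"content": "<token_str>"} chunks.
curl -N -X POST http://127.0.0.1:8080/completion \
  -H "Content-Type: application/json" \
  -d '{
        "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user'\''s questions.\n\nHuman: Hello\nAssistant: Hi, how may I help you?\nHuman:",
        "anti_prompt": "Human:",
        "assistant_name": "Assistant:",
        "prompt": "When is Independence Day in the US?",
        "temperature": 0.2
      }'
```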

# This example is a proof of concept. It has some bugs and unexpected behaviors, and it does not support long prompts.