server : add a REST Whisper server example with OAI-like API (ggerganov#1380)

* Add first draft of server

* Added json support and base funcs for server.cpp

* Add more user input via api-request

Also some clean up

* Add request params and load POST function

Also some general clean up

* Remove unused function

* Add readme

* Add exception handlers

* Update examples/server/server.cpp

* make : add server target

* Add magic curl syntax

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2 people authored and iThalay committed Sep 23, 2024
1 parent 6a9a4d0 commit e8e1f1a
Showing 9 changed files with 34,631 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
```diff
@@ -31,6 +31,7 @@ build-sanitize-thread/
 /talk-llama
 /bench
 /quantize
+/server
 /lsp
 
 arm_neon.h
```

7 changes: 5 additions & 2 deletions Makefile
```diff
@@ -1,4 +1,4 @@
-default: main bench quantize
+default: main bench quantize server
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -338,7 +338,7 @@ libwhisper.so: $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)
 
 clean:
-	rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so
 
 #
 # Examples
@@ -359,6 +359,9 @@ bench: examples/bench/bench.cpp $(WHISPER_OBJ)
 quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)
 
+server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS)
+
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
 
```

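The new `server` target mirrors the existing example rules, so building and checking it from the repository root is just (a usage sketch):

```
make server
./server -h
```
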
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
```diff
@@ -65,6 +65,7 @@ elseif(CMAKE_JS_VERSION)
 else()
     add_subdirectory(main)
     add_subdirectory(stream)
+    add_subdirectory(server)
     add_subdirectory(command)
     add_subdirectory(bench)
     add_subdirectory(quantize)
```

4 changes: 2 additions & 2 deletions examples/main/main.cpp
```diff
@@ -165,8 +165,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
     else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
     else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
-    else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
-    else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
+    else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
+    else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
     else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
         whisper_print_usage(argc, argv, params);
```

6 changes: 6 additions & 0 deletions examples/server/CMakeLists.txt
```diff
@@ -0,0 +1,6 @@
+set(TARGET server)
+add_executable(${TARGET} server.cpp httplib.h json.hpp)
+
+include(DefaultTargetOptions)
+
+target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
```
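
The target links against the `common` and `whisper` libraries and pulls in the single-header HTTP and JSON dependencies (`httplib.h`, `json.hpp`). For CMake-based builds it is picked up through the `add_subdirectory(server)` line added above; a standard out-of-source build along these lines should produce the binary (a sketch; the `build/bin/server` path matches the usage string in the README below):

```
cmake -B build
cmake --build build --config Release
./build/bin/server -h
```
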
59 changes: 59 additions & 0 deletions examples/server/README.md
@@ -0,0 +1,59 @@
# whisper.cpp http server

Simple HTTP server. WAV files are passed to the inference model via HTTP requests.

```
./server -h
usage: ./bin/server [options]
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-sow, --split-on-word [false ] split on word rather than on token
-bo N, --best-of N [2 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
  -debug, --debug-mode       [false  ] enable debug mode (e.g. dump log_mel)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [false ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
-dl, --detect-language [false ] exit after automatically detecting language
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
  --host HOST,               [127.0.0.1] Hostname/IP address for the server
--port PORT, [8080 ] Port number for the server
```
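
For example, to start the server on the default host and port with the default model (assuming `models/ggml-base.en.bin` has already been downloaded):

```
./server -m models/ggml-base.en.bin --host 127.0.0.1 --port 8080
```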

## request examples

**/inference**
```
curl 127.0.0.1:8080/inference \
-H "Content-Type: multipart/form-data" \
-F file="@<file-path>" \
-F temperature="0.2" \
-F response-format="json"
```
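
Here `file` carries the WAV audio, while `temperature` and `response-format` are optional parameters. The response body itself is not shown in this diff; given the OAI-like API from the commit title, a `json` response is assumed to return the transcription in a `text` field, roughly (using the repository's sample file):

```
curl 127.0.0.1:8080/inference -F file="@samples/jfk.wav" -F response-format="json"
# assumed response shape (modeled on the OpenAI transcription API):
# {"text":"And so my fellow Americans, ..."}
```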

**/load**
```
curl 127.0.0.1:8080/load \
-H "Content-Type: multipart/form-data" \
-F model="<path-to-model-file>"
```
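
`/load` swaps the model at runtime, so later `/inference` calls run against the newly loaded model. A hypothetical sequence (the model path is a placeholder; `curl -F` already implies `multipart/form-data`, so the explicit header may be omitted):

```
curl 127.0.0.1:8080/load -F model="models/ggml-small.en.bin"
curl 127.0.0.1:8080/inference -F file="@samples/jfk.wav" -F response-format="json"
```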
(Diffs for the remaining large files are not rendered by default.)
