feat: use llama cpp server #350

Status: Open · wants to merge 21 commits into base: main
.github/scripts/e2e-test-server-linux-and-mac.sh — file mode changed 100644 → 100755 (no content changes)
18 changes: 2 additions & 16 deletions .github/scripts/e2e-test-server-windows.bat
@@ -20,7 +20,6 @@ echo BINARY_NAME=%BINARY_NAME%

del %TEMP%\response1.log 2>nul
del %TEMP%\response2.log 2>nul
del %TEMP%\response3.log 2>nul
del %TEMP%\response4.log 2>nul
del %TEMP%\response5.log 2>nul
del %TEMP%\response6.log 2>nul
@@ -65,18 +64,18 @@ call set "MODEL_LLM_PATH_STRING=%%MODEL_LLM_PATH:\=\\%%"
call set "MODEL_EMBEDDING_PATH_STRING=%%MODEL_EMBEDDING_PATH:\=\\%%"
set "curl_data1={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":false,\"model\":\"testllm\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
set "curl_data3={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
set "curl_data4={\"llama_model_path\":\"%MODEL_EMBEDDING_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}"
set "curl_data5={}"
set "curl_data6={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}"
@REM set "curl_data7={\"model\": \"test-embedding\"}"

rem Print the values of curl_data for debugging
echo curl_data1=%curl_data1%
echo curl_data2=%curl_data2%
echo curl_data3=%curl_data3%
echo curl_data4=%curl_data4%
echo curl_data5=%curl_data5%
echo curl_data6=%curl_data6%
@REM echo curl_data7=%curl_data7%

rem Run the curl commands and capture the status code
curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/loadmodel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1
@@ -85,8 +84,6 @@ curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" --
--header "Content-Type: application/json" ^
--data "%curl_data2%" > %TEMP%\response2.log 2>&1

curl.exe --connect-timeout 60 -o "%TEMP%\response3.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/unloadmodel" --header "Content-Type: application/json" --data "%curl_data3%" > %TEMP%\response3.log 2>&1

curl.exe --connect-timeout 60 -o "%TEMP%\response4.log" --request POST -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/loadmodel" --header "Content-Type: application/json" --data "%curl_data4%" > %TEMP%\response4.log 2>&1

curl.exe --connect-timeout 60 -o "%TEMP%\response5.log" --request GET -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/models" --header "Content-Type: application/json" --data "%curl_data5%" > %TEMP%\response5.log 2>&1
@@ -100,7 +97,6 @@ set "error_occurred=0"
rem Read the status codes from the log files
for /f %%a in (%TEMP%\response1.log) do set "response1=%%a"
for /f %%a in (%TEMP%\response2.log) do set "response2=%%a"
for /f %%a in (%TEMP%\response3.log) do set "response3=%%a"
for /f %%a in (%TEMP%\response4.log) do set "response4=%%a"
for /f %%a in (%TEMP%\response5.log) do set "response5=%%a"
for /f %%a in (%TEMP%\response6.log) do set "response6=%%a"
@@ -117,12 +113,6 @@ if "%response2%" neq "200" (
set "error_occurred=1"
)

if "%response3%" neq "200" (
echo The third curl command failed with status code: %response3%
type %TEMP%\response3.log
set "error_occurred=1"
)

if "%response4%" neq "200" (
echo The fourth curl command failed with status code: %response4%
type %TEMP%\response4.log
@@ -158,10 +148,6 @@ echo ----------------------
echo Log run test:
type %TEMP%\response2.log

echo ----------------------
echo Log unload model:
type %TEMP%\response3.log

echo ----------------------
echo Log load embedding model:
type %TEMP%\response4.log
68 changes: 34 additions & 34 deletions .github/workflows/template-quality-gate-pr.yml

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions Makefile
@@ -54,15 +54,18 @@ endif
pre-package:
ifeq ($(OS),Windows_NT)
@powershell -Command "mkdir -p cortex.llamacpp; cp build\engine.dll cortex.llamacpp\;"
@powershell -Command "cp build\bin\llama-server.exe cortex.llamacpp\;"
@powershell -Command "cp .\.github\patches\windows\msvcp140.dll cortex.llamacpp\;"
@powershell -Command "cp .\.github\patches\windows\vcruntime140_1.dll cortex.llamacpp\;"
@powershell -Command "cp .\.github\patches\windows\vcruntime140.dll cortex.llamacpp\;"
@powershell -Command "cp .\.github\patches\windows\vcomp140.dll cortex.llamacpp\;"
else ifeq ($(shell uname -s),Linux)
@mkdir -p cortex.llamacpp; \
cp build/bin/llama-server cortex.llamacpp/; \
cp build/libengine.so cortex.llamacpp/;
else
@mkdir -p cortex.llamacpp; \
cp build/bin/llama-server cortex.llamacpp/; \
cp build/libengine.dylib cortex.llamacpp/;
endif

@@ -97,16 +100,18 @@ ifeq ($(RUN_TESTS),false)
@exit 0
endif
ifeq ($(OS),Windows_NT)
@powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; ..\..\..\.github\scripts\e2e-test-server-windows.bat server.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);"
@powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; cp ..\..\..\build\bin\llama-server.exe engines\cortex.llamacpp; ..\..\..\.github\scripts\e2e-test-server-windows.bat server.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);"
else ifeq ($(shell uname -s),Linux)
@mkdir -p examples/server/build/engines/cortex.llamacpp; \
cd examples/server/build/; \
cp ../../../build/libengine.so engines/cortex.llamacpp/; \
cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \
chmod +x ../../../.github/scripts/e2e-test-server-linux-and-mac.sh && ../../../.github/scripts/e2e-test-server-linux-and-mac.sh ./server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);
else
@mkdir -p examples/server/build/engines/cortex.llamacpp; \
cd examples/server/build/; \
cp ../../../build/libengine.dylib engines/cortex.llamacpp/; \
cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \
chmod +x ../../../.github/scripts/e2e-test-server-linux-and-mac.sh && ../../../.github/scripts/e2e-test-server-linux-and-mac.sh ./server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);
endif

@@ -118,20 +123,22 @@ endif
ifeq ($(OS),Windows_NT)
@powershell -Command "python -m pip install --upgrade pip"
@powershell -Command "python -m pip install requests;"
@powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; python ..\..\..\.github\scripts\e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);"
@powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; cp ..\..\..\build\bin\llama-server.exe engines\cortex.llamacpp; python ..\..\..\.github\scripts\e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);"
else ifeq ($(shell uname -s),Linux)
python -m pip install --upgrade pip;
python -m pip install requests;
@mkdir -p examples/server/build/engines/cortex.llamacpp; \
cd examples/server/build/; \
cp ../../../build/libengine.so engines/cortex.llamacpp/; \
cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \
python ../../../.github/scripts/e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);
else
python -m pip install --upgrade pip;
python -m pip install requests;
@mkdir -p examples/server/build/engines/cortex.llamacpp; \
cd examples/server/build/; \
cp ../../../build/libengine.dylib engines/cortex.llamacpp/; \
cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \
python ../../../.github/scripts/e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);
endif

1 change: 1 addition & 0 deletions base/cortex-common/enginei.h
@@ -6,6 +6,7 @@
#include <vector>

#include "json/value.h"
#include "trantor/utils/AsyncFileLogger.h"
#include "trantor/utils/Logger.h"

// Interface for inference engine.
1 change: 1 addition & 0 deletions examples/server/CMakeLists.txt
@@ -15,6 +15,7 @@ add_executable(${PROJECT_NAME}
server.cc
dylib.h
httplib.h
${CMAKE_CURRENT_SOURCE_DIR}/../../src/file_logger.cc
)

set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../build_deps/_install)
97 changes: 97 additions & 0 deletions examples/server/README.md
@@ -0,0 +1,97 @@
This application performs inference with vision, text, and embedding models, serving each model type with a strategy tailored to its requirements.

- Vision Models:
For vision models, a dedicated, customized server runs within the same process as the main application. Keeping inference in-process avoids the overhead of an extra process and makes efficient use of resources for vision tasks.

- Text and Embedding Models:
For text and embedding models, the application spawns a separate child process per model. Process isolation prevents resource contention, since each model runs independently in its own process.

Choosing the serving strategy per model type lets the application allocate resources efficiently and provide reliable inference across a diverse set of models.
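
To make the child-process strategy concrete, here is a rough sketch of the kind of invocation involved when a text or embedding model is loaded. It assumes the bundled `llama-server` binary that this PR packages under `engines/cortex.llamacpp`; the port, model path, and flag values are placeholders, and the actual engine manages the child's lifecycle itself.

```bash title="Child-process sketch (illustrative)"
# Illustrative only: spawn a llama-server child for a loaded text model.
# The model path, port, and flag values below are placeholders.
./engines/cortex.llamacpp/llama-server \
  --host 127.0.0.1 --port 39281 \
  -m /model/llama-2-7b-model.gguf -c 512 -ngl 100 &
SERVER_PID=$!

# Requests for this model are forwarded to the child process;
# unloading the model stops the child.
kill "$SERVER_PID"
```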

**API Endpoints**

The application provides the following API endpoints for interacting with the models:

**Load Model**

Endpoint: /loadmodel

Method: POST

Description: Loads a specified model into the application. For text and embedding models, this endpoint will spawn a new child process to serve the model.

```bash title="Load model"
curl --location 'http://localhost:3928/loadmodel' \
--header 'Content-Type: application/json' \
--data '{
"llama_model_path": "/model/llama-2-7b-model.gguf",
"model_alias": "llama-2-7b-model",
"ctx_len": 512,
"ngl": 100,
"model_type": "llm"
}'
```

**Chat Completion**

Endpoint: /v1/chat/completions

Method: POST

Description: Performs chat completion using a loaded text model.

```bash title="Inference"
curl --location 'http://localhost:3928/v1/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"messages": [
{
"role": "user",
"content": "Who won the world series in 2020?"
}
],
"model": "llama-2-7b-model"
}'
```

**Embedding**

Endpoint: /v1/embeddings

Method: POST

Description: Requests an embedding using a loaded embedding model.

```bash title="Embeddings"
curl --location '127.0.0.1:3928/v1/embeddings' \
--header 'Content-Type: application/json' \
--data '{
"input": ["hello", "The food was delicious and the waiter..."],
"model":"llama-2-7b-model",
"encoding_format": "base64"
}'
```

**Unload Model**

Endpoint: /unloadmodel

Method: POST

Description: Unloads a specified model from the application. For text and embedding models, this endpoint also stops the associated child process.

```bash title="Unload Model"
curl --location '127.0.0.1:3928/unloadmodel' \
--header 'Content-Type: application/json' \
--data '{
"model": "test"
}'
```

**Multiple Models**

The application can serve multiple models concurrently. The serving strategy depends on the model type, as in the example after this list:
- Vision models: multiple instances can run within the same process.
- Text and embedding models: each model runs in its own child process.
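
For example, a chat model and an embedding model can be loaded side by side; each `/loadmodel` call below starts its own child process. The model paths and the embedding model's file name are placeholders; the `embedding` and `model_type` fields mirror the values used by the repository's e2e test scripts.

```bash title="Load two models concurrently (illustrative)"
# Load a chat model; it is served by its own child process.
curl --location 'http://localhost:3928/loadmodel' \
--header 'Content-Type: application/json' \
--data '{
    "llama_model_path": "/model/llama-2-7b-model.gguf",
    "model_alias": "llama-2-7b-model",
    "model_type": "llm"
}'

# Load an embedding model alongside it; it gets a separate child process.
curl --location 'http://localhost:3928/loadmodel' \
--header 'Content-Type: application/json' \
--data '{
    "llama_model_path": "/model/my-embedding-model.gguf",
    "model_alias": "test-embedding",
    "embedding": true,
    "model_type": "embedding"
}'
```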

**Notes**

- For vision models, a customized server is started within the same process to serve the model; no new process is spawned.
- For text and embedding models, a new child process is spawned to serve each model.
2 changes: 2 additions & 0 deletions examples/server/dylib.h
@@ -58,6 +58,7 @@
* The `dylib` class represents a single dynamic library instance,
* allowing the access of symbols like functions or global variables
*/
namespace cortex_cpp {
class dylib {
public:
struct filename_components {
@@ -311,6 +312,7 @@ class dylib {
}
};

}
#undef DYLIB_WIN_MAC_OTHER
#undef DYLIB_WIN_OTHER
#undef DYLIB_CPP17
51 changes: 32 additions & 19 deletions examples/server/server.cc
@@ -8,21 +8,32 @@
#include <condition_variable>
#include <mutex>
#include <queue>
#include "trantor/utils/Logger.h"
#include "../../src/file_logger.h"
#include "../../src/llama_utils.h"

class Server {
public:
Server() {
dylib_ = std::make_unique<dylib>("./engines/cortex.llamacpp", "engine");
auto func = dylib_->get_function<EngineI*()>("get_engine");
engine_ = func();
}
Server() {}

~Server() {
if (engine_) {
delete engine_;
}
}

void Initialize(trantor::AsyncFileLogger* logger) {
dylib_ = std::make_unique<cortex_cpp::dylib>("./engines/cortex.llamacpp",
"engine");
auto func = dylib_->get_function<EngineI*()>("get_engine");
engine_ = func();
EngineI::EngineLoadOption opts;
opts.engine_path = llama_utils::GetExecutableFolderContainerPath() /
"engines" / "cortex.llamacpp";
opts.log_path = "./logs/cortex.log";
opts.max_log_lines = 10000;
engine_->Load(opts);
}

void ForceStopInferencing(const std::string& model_id) {
if (engine_) {
engine_->StopInferencing(model_id);
@@ -32,7 +43,7 @@ }
}

public:
std::unique_ptr<dylib> dylib_;
std::unique_ptr<cortex_cpp::dylib> dylib_;
EngineI* engine_;

struct SyncQueue {
@@ -86,16 +97,16 @@ inline void signal_handler(int signal) {
using SyncQueue = Server::SyncQueue;

int main(int argc, char** argv) {
// std::filesystem::create_directories("./logs");
// trantor::AsyncFileLogger asyncFileLogger;
// asyncFileLogger.setFileName("logs/cortex");
// asyncFileLogger.startLogging();
// trantor::Logger::setOutputFunction(
// [&](const char* msg, const uint64_t len) {
// asyncFileLogger.output(msg, len);
// },
// [&]() { asyncFileLogger.flush(); });
// asyncFileLogger.setFileSizeLimit(100000000);
std::filesystem::create_directories("./logs");
trantor::FileLogger async_file_logger;
async_file_logger.setFileName("logs/cortex.log");
async_file_logger.startLogging();
trantor::Logger::setOutputFunction(
[&](const char* msg, const uint64_t len) {
async_file_logger.output_(msg, len);
},
[&]() { async_file_logger.flush(); });
async_file_logger.setFileSizeLimit(100000000);

std::string hostname = "127.0.0.1";
int port = 3928;
@@ -109,8 +120,9 @@ }
}

Server server;

server.Initialize(&async_file_logger);
//set logger here
// server.engine_->SetFileLogger();

SyncJsonReader r;
auto svr = std::make_unique<httplib::Server>();
@@ -277,7 +289,8 @@ });
});

shutdown_handler = [&](int) {
running = false;
// only shutdown by /destroy or sent SIGINT twice
// running = false;
};
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
struct sigaction sigint_action;